Source code for synthetic_aia_mia.fetch_data.split
"""Split data into train / test sets using 5-fold cross-validation."""
import numpy as np
from sklearn.model_selection import StratifiedKFold
[docs]
def split_numpy(data,k=0):
    """Split a dictionary of numpy arrays into train / test with 5 contiguous folds.

    The samples (first axis) are cut into 5 contiguous, non-overlapping
    blocks that together cover every sample exactly once. Block ``k`` becomes
    the test set and the remaining samples form the train set. No shuffling
    is performed. With fewer than 5 samples some folds have an empty test set.

    :param data: Dataset where each key maps to a numpy array. The number of
        samples is read from the first axis of the first key's array, and the
        same boolean mask is applied to every array.
    :type data: Dictionary
    :param k: (Optional) Index of the fold, can be 0,1,2,3 or 4.
    :type k: int
    :return: Dataset with train and test. Both ``"train"`` and ``"test"`` map
        to a dictionary with the same keys as ``data``.
    :rtype: Dictionary
    :raises ValueError: If ``k`` is not one of 0,1,2,3,4.
    """
    n_folds = 5
    if k not in range(n_folds):
        raise ValueError(
            f"k must be one of 0..{n_folds - 1}, got {k!r}"
        )

    keys = list(data.keys())
    if not keys:
        # Nothing to split: return the same structure with empty parts.
        return {"train": {}, "test": {}}

    n = np.shape(data[keys[0]])[0]
    idx = np.arange(n)

    # Integer fold boundaries on a half-open range [start, stop). Computing
    # them as k*n//5 guarantees that fold k stops exactly where fold k+1
    # starts, so test folds never overlap and never leave a gap, whatever n is.
    start = (k * n) // n_folds
    stop = ((k + 1) * n) // n_folds
    test = (idx >= start) & (idx < stop)
    train = ~test

    return {
        "train": {key: data[key][train] for key in keys},
        "test": {key: data[key][test] for key in keys},
    }
[docs]
def split_pandas(data,k=0):
    """Split a dataframe into train / test with stratified 5-fold cross-validation.

    The folds are built with a shuffled ``StratifiedKFold`` using a fixed
    random seed (1234), stratified on the ``"PINCP"`` column, so calling the
    function again with the same dataframe and ``k`` yields the same split.

    :param data: Dataset in the form of a dataframe. It must contain a
        ``"PINCP"`` column, which is used for stratification.
    :type data: pandas.dataframe
    :param k: (Optional) Index of the fold, can be 0,1,2,3 or 4.
    :type k: int
    :return: Dataset with train and test, each a dataframe selected from
        ``data`` by row position.
    :rtype: Dictionary
    :raises ValueError: If ``k`` is not one of 0,1,2,3,4.
    """
    n_folds = 5
    if k not in range(n_folds):
        # Without this check an invalid k would never match a fold and the
        # function would fail later with an unrelated unbound-variable error.
        raise ValueError(
            f"k must be one of 0..{n_folds - 1}, got {k!r}"
        )

    # n_splits is spelled out so the split matches the documented 5 folds
    # instead of depending on the library default.
    skf = StratifiedKFold(n_splits=n_folds,random_state=1234,shuffle=True)
    for i,(train_idx,test_idx) in enumerate(skf.split(data,data["PINCP"])):
        if i==k:
            # Stop at the requested fold; the remaining folds are not needed.
            return {"train":data.iloc[train_idx],
                    "test":data.iloc[test_idx]}