import pandas as pd
def load_data(data, sep=';', na_strings=None, unclassified_data=False):
    """
    Load a pycroc dataset after checking that its format is valid.

    This is a customized ``pandas.read_csv()``: the file is read, a series of
    conformity checks is run, and the dataframe is returned only if every
    check passes.

    The dataset to be analysed should be in text format, which can be comma,
    tab or semicolon separated:

    - The 1st column must contain unique patient/sample IDs.
    - The 2nd column must contain the class to which each sample belongs, if
      data is not unlabelled; the classes must be exactly 2 and they must be
      strings.
    - From the 3rd column on (2nd if data is unlabelled), the dataset must
      contain numerical values that represent the signal corresponding to the
      markers abundance in each sample (marker-related columns).
    - Marker-related columns can be called 'Marker1, Marker2, Marker3, ...'
      or can be called directly with the gene/protein name, but "-" is not
      allowed in the column name.

    For labelled data that passes the checks, the marker-related columns are
    reordered alphabetically by marker name (necessary for a proper
    computation of combinations) and the 2nd column is renamed to "Class".
    Unlabelled data keeps its original column order.

    Parameters
    ----------
    data: str
        the path of the file which the data are to be read from (it is passed
        unchanged to ``pandas.read_csv``).
    sep: str
        the field separator string. Default is ';'.
    na_strings: str
        string(s) to be interpreted as NA values, forwarded to
        ``pandas.read_csv`` as ``na_values``. Default is None.
    unclassified_data: bool
        it specifies if data is labelled (False) or unlabelled (True).

    Returns
    -------
    df: pandas.DataFrame
        a dataframe containing a representation of the data in the file,
        with missing values replaced by empty strings.

    Raises
    ------
    ValueError
        if the IDs are not unique, if the labelled data does not contain
        exactly 2 classes, if a column named "Class" already exists somewhere
        other than the 2nd position, or if a marker name contains "-".
    TypeError
        if the class column contains non-string values (missing values
        included) or if a marker column is not numeric.
    """
    df = pd.read_csv(data, sep=sep, na_values=na_strings)

    id_column = df.columns[0]
    if not df[id_column].is_unique:  # IDs must be unique
        raise ValueError(f"Column {id_column} must have unique values")

    if not unclassified_data:
        if df.columns[1] != 'Class':  # the second column must be "Class"
            if 'Class' in df.columns:
                # Renaming would create two "Class" columns; fail the same
                # way DataFrame.insert did in the previous implementation.
                raise ValueError("cannot insert Class, already exists")
            df = df.rename(columns={df.columns[1]: 'Class'})

        classes = df['Class']
        # A missing class is read as a float NaN, so it is rejected here too.
        if not all(isinstance(value, str) for value in classes):
            raise TypeError("2nd column must contain strings")

        n_classes = classes.nunique(dropna=False)
        if n_classes != 2:  # there must be exactly 2 classes
            raise ValueError(
                f"2nd column must contain 2 classes but there are {n_classes}"
            )

        # Reorder marker columns alphabetically - necessary to properly
        # compute combinations later.
        df = df.loc[:, list(df.columns[:2]) + sorted(df.columns[2:])]
        first_marker = 2
    else:
        first_marker = 1

    _check_marker_columns(df, first_marker)

    # NOTE(review): the indentation of the original source was lost, so the
    # scope of this fill is inferred; it is applied to labelled and
    # unlabelled data alike - confirm. Filling a numeric column that has
    # missing values turns it into a column of mixed objects.
    return df.fillna('')


def _check_marker_columns(df, first_marker):
    """Raise unless every column from position first_marker on is a valid marker."""
    for name in df.columns[first_marker:]:
        dtype = df[name].dtype
        # Any real numeric dtype is accepted (not only int64/float64);
        # booleans are still rejected, as they were before.
        is_numeric = pd.api.types.is_numeric_dtype(dtype)
        if not is_numeric or pd.api.types.is_bool_dtype(dtype):
            raise TypeError(f"Column {name} is not numeric (int64 or float)")
        if '-' in name:  # markers name must not contain '-'
            raise ValueError("Markers name must not contain '-'")