# Source code for pycroc.combi_score

import pandas as pd
import numpy as np
import sklearn

def combi_score(data, models, classify=False, reports=None, case_class=True,
                control_class=False, deal_NaN=None):
    """
    Score (or classify) observations with every fitted combination model.

    The first column of ``data`` is treated as the sample identifier. If the
    second column is named ``'Class'`` it is skipped; every remaining column
    is passed to the models as a marker.

    Parameters
    ----------
    data: pandas.DataFrame
        a dataframe containing values for the very same markers of the
        combinations in models.
    models: dict
        maps a combination name to a trained model (as returned by
        train_models()). Each value only needs a ``predict(exog=...)``
        method.
    classify: bool
        if false the function returns the response values of each model, if
        true it compares the response values to the optimal cutoff and
        returns labels instead.
    reports: pandas.DataFrame
        a dataframe indexed by combination name with an ``'OptCutoff'``
        column (as returned by roc_reports()). Required only if classify is
        true.
    case_class:
        label assigned to observations whose response is above the cutoff
        (default ``True``). Used only if classify is true.
    control_class:
        label assigned to observations whose response is not above the
        cutoff (default ``False``). Used only if classify is true.
    deal_NaN: str or None
        how to treat missing values (NaN or empty strings). With 'impute'
        the missing values of each marker are replaced by the median of the
        non-missing values of that marker over all observations. With
        'remove' every observation containing a missing value in any column
        is dropped and the index is reset. With None the data are passed to
        the models untouched.

    Returns
    -------
    res: pandas.DataFrame
        the sample identifier column followed by one column per combination
        holding response values or assigned labels.

    Raises
    ------
    ValueError
        if deal_NaN is neither None, 'impute' nor 'remove', or if classify
        is true and reports is None.
    """
    if deal_NaN is not None and deal_NaN not in ('impute', 'remove'):
        raise ValueError('deal_NaN must be "impute" or "remove"')
    if classify and reports is None:
        raise ValueError('reports is required when classify is True')

    # Markers start after the ID column and the optional 'Class' column.
    has_class = data.shape[1] > 1 and data.columns[1] == 'Class'
    first_marker = 2 if has_class else 1

    if deal_NaN is not None:
        data = _handle_missing(data, first_marker, deal_NaN)

    sample_id = data.iloc[:, 0]
    markers = data.iloc[:, first_marker:]

    columns = {}
    for comb, model in models.items():
        response = model.predict(exog=markers)
        if classify:
            if not isinstance(response, pd.Series):
                response = pd.Series(np.asarray(response), index=markers.index)
            above = response > reports.loc[comb, 'OptCutoff']
            labels = above.map({True: case_class, False: control_class})
            # A NaN score compares False against any cutoff; leave such rows
            # unlabelled rather than silently calling them controls.
            if response.isna().any():
                labels = labels.where(response.notna())
            response = labels
        columns[comb] = response

    res = pd.DataFrame(columns, index=markers.index)
    # add ID
    return pd.concat([sample_id, res], axis=1)


def _handle_missing(data, first_marker, strategy):
    """Return a copy of ``data`` with missing values imputed or removed."""
    # Empty strings are counted as missing alongside NaN.
    missing = data.isna() | (data == '')
    if not missing.to_numpy().any():
        return data

    if strategy == 'remove':
        # Boolean row mask: independent of the index labels of ``data``.
        keep = ~missing.any(axis=1).to_numpy()
        cleaned = data.loc[keep].reset_index(drop=True)
        markers = cleaned.iloc[:, first_marker:].apply(pd.to_numeric)
    else:
        cleaned = data
        marker_gaps = missing.iloc[:, first_marker:].to_numpy()
        markers = data.iloc[:, first_marker:].mask(marker_gaps)
        markers = markers.apply(pd.to_numeric)
        # DataFrame.median skips NaN, so this is the per-marker median of
        # the observed values.
        markers = markers.fillna(markers.median())

    return pd.concat([cleaned.iloc[:, :first_marker], markers], axis=1)