# Source code for pycroc.combinations

import pandas as pd
import itertools
import numpy as np

import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns

import statsmodels.api as sm # 0.9.0
import statsmodels.formula.api as smf

from sklearn import metrics, preprocessing

class Combinations:
    """
    Perform the combinatorial analysis of a marker panel and store its results.

    Attributes
    ----------
    tab: pandas.DataFrame
        one row per marker combination, reporting how many samples of each
        class are "positive", SE, SP and the number of markers.
    ranked: pandas.DataFrame
        combinations ranked by Youden index, reporting SE, SP, the number of
        markers composing the combination and the Youden index.
    training_data: pandas.DataFrame
        the training dataset (data after the imputation or removal of
        missing values).
    models: dict(str: statsmodels.genmod.generalized_linear_model.GLMResultsWrapper)
        trained models for the selected combinations.
    reports: pandas.DataFrame
        parameters of the ROC curves at the optimal cutoff.

    Methods
    -------
    ranked_combs(self, min_SE=0, min_SP=0)
        To rank combinations by Youden index and select them if they have a
        min SE and/or SP.
    bubble_chart(self, min_SE=0, min_SP=0)
        To plot a bubble chart showing gold combinations.
    train_models(self, combinations=None)
        To compute a Generalized Linear Model (binomial) for each selected
        combination.
    roc_curves(self, case_class)
        To show reports of ROC curves of trained models.
    """

    # Attributes (filled in by __init__ and by the methods below)
    tab = None
    ranked = None
    training_data = None
    models = None
    reports = None
    # Row label of `tab` -> tuple of the marker names that row stands for.
    _markers_by_name = None

    # TODO:evaluate if a case_class would be useful or if it is better to explicit the case class at each time

    # Constructor
    def __init__(self, data, case_class, signalthr, combithr=1, max_length=None, deal_NaN=None):
        """
        Compute the marker combinations, count their positive samples for
        each class and calculate SE and SP.

        Parameters
        ----------
        data: pandas.DataFrame
            a labelled pycroc object returned by load_data(). It must have a
            'Class' column; every column from the third one on is treated
            as a marker.
        case_class: str
            which of the two classes of the dataset is the case class.
        signalthr: int or float
            the value at or above which a marker is considered positive for
            a given observation.
        combithr: int
            the number of positively expressed markers (>= signalthr) needed,
            in a given combination, to consider that combination positively
            expressed in a sample.
        max_length: int
            the max combination length that is allowed. None means "up to
            all the markers".
        deal_NaN: str
            how to treat missing values, which are recognised as empty
            strings (''). With 'impute', missing values of each marker are
            replaced with the median of that marker in the class the
            observation belongs to. With 'remove', every observation
            containing a missing value is dropped. With None nothing is done.

        Raises
        ------
        ValueError
            if deal_NaN is neither None, 'impute' nor 'remove'.
        """
        markers = data.columns[2:]
        control_class = self._other_class(data['Class'], case_class)

        if deal_NaN is not None:
            if deal_NaN not in ('impute', 'remove'):
                raise ValueError('deal_NaN must be "impute" or "remove"')
            blank = data == ''
            if blank.to_numpy().any():
                if deal_NaN == 'impute':
                    data = self._impute_blanks(data, blank, markers,
                                               (case_class, control_class))
                else:
                    data = self._drop_blank_rows(data, blank, markers)
        self.training_data = data

        longest = len(markers) if max_length is None else max_length

        # The threshold comparison does not depend on the combination, so it
        # is done once per class instead of once per combination.
        is_case = data['Class'] == case_class
        is_control = data['Class'] == control_class
        case_hits = data.loc[is_case, markers] >= signalthr
        control_hits = data.loc[is_control, markers] >= signalthr

        combos = []
        case_counts = []
        control_counts = []
        for size in range(1, longest + 1):
            for subset in itertools.combinations(markers, size):
                cols = list(subset)
                combos.append(subset)
                case_counts.append(
                    int((case_hits[cols].sum(axis=1) >= combithr).sum()))
                control_counts.append(
                    int((control_hits[cols].sum(axis=1) >= combithr).sum()))

        names = ['-'.join(subset) for subset in combos]
        # Single markers keep their own name as row label; multi-marker
        # combinations are labelled 'Combination 1', 'Combination 2', ...
        n_single = min(len(markers), len(combos))
        labels = names[:n_single] + [
            'Combination ' + str(k)
            for k in range(1, len(combos) - n_single + 1)
        ]

        case_col = '#Positives ' + case_class
        control_col = '#Positives ' + control_class
        tab = pd.DataFrame(
            {
                'Markers': names,
                case_col: np.asarray(case_counts, dtype=int),
                control_col: np.asarray(control_counts, dtype=int),
            },
            index=pd.Index(labels, name='index'),
        )
        tab['SE'] = (100 * tab[case_col] / is_case.sum()).round(1)
        tab['SP'] = (100 * (1 - tab[control_col] / is_control.sum())).round(1)
        tab['# of markers'] = [len(subset) for subset in combos]

        self.tab = tab
        self._markers_by_name = dict(zip(labels, combos))

    # Private helpers
    @staticmethod
    def _other_class(labels, case_class):
        """Return the first class label in ``labels`` that differs from ``case_class``."""
        classes = labels.unique()
        return classes[classes != case_class][0]

    @staticmethod
    def _impute_blanks(data, blank, markers, class_labels):
        """Return a copy of ``data`` with blank marker values replaced by the per-class median."""
        # Work on a copy so the caller's DataFrame is left untouched.
        data = data.copy()
        for marker in markers:
            missing = blank[marker]
            for label in class_labels:
                in_class = data['Class'] == label
                to_fill = in_class & missing
                if to_fill.any():
                    observed = pd.to_numeric(
                        data.loc[in_class & ~missing, marker])
                    data.loc[to_fill, marker] = np.median(observed)
            data[marker] = pd.to_numeric(data[marker])
        return data

    @staticmethod
    def _drop_blank_rows(data, blank, markers):
        """Return ``data`` without the rows holding a blank value, re-indexed from 0."""
        keep = ~blank.any(axis=1)
        data = data.loc[keep].reset_index(drop=True)
        for marker in markers:
            data[marker] = pd.to_numeric(data[marker])
        return data

    @staticmethod
    def _formula_name(marker):
        """Return ``marker`` as it has to be written inside a model formula."""
        if marker.isidentifier():
            return marker
        # Names such as 'IL-6' would be parsed as an expression, so they are
        # wrapped in the formula language's quoting helper Q().
        return 'Q({!r})'.format(marker)

    # Methods
    def ranked_combs(self, min_SE=0, min_SP=0):
        """
        To rank combinations by Youden index and select them if they have a
        min SE and/or SP. The result is stored in ``self.ranked``;
        ``self.tab`` is left unchanged.

        Parameters
        ----------
        min_SE: int or float
            the min value of SE that a combination must have to be included.
        min_SP: int or float
            the min value of SP that a combination must have to be included.
        """
        # Columns are picked by name: positions shift as soon as anything
        # else adds a column to the table.
        rnk = self.tab.loc[:, ['SE', 'SP', '# of markers']].copy()
        rnk['Youden'] = (rnk['SE'] + rnk['SP'] - 100) / 100
        # A stable sort keeps tied combinations in table order.
        rnk = rnk.sort_values('Youden', ascending=False, kind='mergesort')
        self.ranked = rnk.loc[(rnk['SE'] >= min_SE) & (rnk['SP'] >= min_SP), :]

    def bubble_chart(self, min_SE=0, min_SP=0):
        """
        To plot a bubble chart showing gold combinations.

        Parameters
        ----------
        min_SE: int or float
            the min value of SE that a combination must have to be included.
        min_SP: int or float
            the min value of SP that a combination must have to be included.
        """
        # Plot from a copy so the helper 'Combo' column never ends up in
        # self.tab.
        chart_data = self.tab.copy()
        is_gold = (chart_data['SE'] >= min_SE) & (chart_data['SP'] >= min_SP)
        chart_data['Combo'] = np.where(is_gold, 'gold combination',
                                       'below thresholdS')
        chart_data['Combo'] = chart_data['Combo'].astype('category')

        bc = sns.scatterplot(data=chart_data, hue='Combo', x="SP", y="SE",
                             size="# of markers", alpha=0.7, sizes=(10, 600))
        bc.axhline(min_SE, color='black', ls='--', linewidth=1)
        bc.axvline(min_SP, color='black', ls='--', linewidth=1)
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    def train_models(self, combinations=None):
        """
        To compute a Generalized Linear Model (binomial) for each selected
        combination, fitted on ``self.training_data``. The fitted models are
        stored in ``self.models``, keyed by combination name.

        Parameters
        ----------
        combinations: list(str) or str
            the names (row labels of ``self.tab``) of the selected
            combinations. A single name may be given as a plain string.
            None selects nothing.

        Raises
        ------
        KeyError
            if a name is not a row label of ``self.tab``.
        """
        if combinations is None:
            selected = []
        elif isinstance(combinations, str):
            selected = [combinations]
        else:
            selected = list(combinations)

        # Looked up in one go so an unknown name fails before any fit.
        marker_strings = self.tab.loc[selected, 'Markers']
        known = self._markers_by_name or {}

        # NOTE(review): 'Class' is used as the response as-is; which of the
        # two labels the fitted probabilities refer to is decided by the
        # formula machinery, not by the case class - confirm it matches what
        # roc_curves() expects.
        models = {}
        for name in selected:
            members = known.get(name)
            if members is None:
                # Fallback when the table was not built by __init__.
                members = marker_strings.loc[name].split('-')
            terms = ['np.log(' + self._formula_name(x) + '+1)' for x in members]
            models[name] = smf.glm(formula="Class ~ " + '+'.join(terms),
                                   family=sm.families.Binomial(),
                                   data=self.training_data).fit()
        self.models = models

    def roc_curves(self, case_class):
        """
        To show reports of ROC curves of trained models. For every trained
        model the statistics at the cutoff with the highest Youden index are
        stored in ``self.reports`` and the ROC curve is plotted.

        Parameters
        ----------
        case_class: str
            which of the two classes of the dataset is the case class.

        Raises
        ------
        ValueError
            if no model has been trained yet.
        """
        if not self.models:
            raise ValueError('no trained models available: call '
                             'train_models() with at least one combination '
                             'first')

        data = self.training_data
        control_class = self._other_class(data['Class'], case_class)
        is_case = (data['Class'] == case_class).to_numpy()
        is_control = (data['Class'] == control_class).to_numpy()
        totp = is_case.sum()
        totn = is_control.sum()
        y = is_case.astype(int)

        stat_names = ['AUC', 'SE', 'SP', 'OptCutoff', 'Youden', 'ACC',
                      'TN', 'TP', 'FN', 'FP']
        best = {}
        curves = []  # xy coordinates for the plots
        for comb, model in self.models.items():
            combi_score = np.asarray(model.predict())
            auc = metrics.roc_auc_score(y, combi_score)

            # Candidate cutoffs: midpoints between consecutive sorted scores.
            x = np.sort(combi_score)
            cutoffs = (x[:-1] + x[1:]) / 2

            rows = []
            for cutoff in cutoffs:
                above = combi_score > cutoff
                at_or_below = combi_score <= cutoff
                tp = (above & is_case).sum()
                fp = (above & is_control).sum()
                tn = (at_or_below & is_control).sum()
                fn = (at_or_below & is_case).sum()
                se = tp / totp
                sp = tn / totn
                acc = (tp + tn) / data.shape[0]
                rows.append([auc, se, sp, cutoff, se + sp - 1, acc,
                             tn, tp, fn, fp])

            stats = pd.DataFrame(rows, columns=stat_names)
            # Stable sort: among equally good cutoffs the lowest one wins.
            best[comb] = stats.sort_values('Youden', ascending=False,
                                           kind='mergesort').iloc[0, :]
            curves.append(pd.DataFrame({'FPR': 1 - stats['SP'],
                                        'TPR': stats['SE'],
                                        'comb': comb}))

        self.reports = pd.DataFrame(best).transpose()

        # Plotting
        fig, ax = plt.subplots()
        for section, group in pd.concat(curves).groupby('comb'):
            group.plot(x='FPR', y='TPR', ax=ax, label=section)
        plt.title('Receiver Operating Characteristic')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend()