import pandas as pd
import itertools
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm # 0.9.0
import statsmodels.formula.api as smf
from sklearn import metrics, preprocessing
class Combinations:
    """
    Perform the combinatorial marker analysis and store its results.

    Attributes
    ----------
    tab: pandas.DataFrame
        one row per marker / marker combination, with the columns 'Markers',
        '#Positives <case class>', '#Positives <control class>', 'SE', 'SP'
        and '# of markers'. Single markers are indexed by their own name,
        longer combinations by 'Combination <i>'.
    ranked: pandas.DataFrame
        combinations ranked by Youden index, reporting SE, SP, the number of
        markers composing the combination and the Youden index
        (set by ranked_combs()).
    training_data: pandas.DataFrame
        the dataset used for the analysis (after the imputation or removal of
        missing values, when requested).
    models: dict(str: fitted GLM result)
        trained models for the selected combinations (set by train_models()).
    reports: pandas.DataFrame
        parameters of the ROC curves at the optimal cutoff
        (set by roc_curves()).

    Methods
    -------
    ranked_combs(min_SE=0, min_SP=0)
        Rank combinations by Youden index, keeping those with a min SE/SP.
    bubble_chart(min_SE=0, min_SP=0)
        Plot a bubble chart showing gold combinations.
    train_models(combinations=None)
        Fit a binomial Generalized Linear Model for each selected combination.
    roc_curves(case_class)
        Compute the ROC reports of the trained models and plot the curves.
    """

    # Class-level defaults; instances overwrite them as results are produced.
    tab = None
    ranked = None
    training_data = None
    models = None
    reports = None

    # Column order of the per-cutoff statistics built by roc_curves().
    _REPORT_COLUMNS = ['AUC', 'SE', 'SP', 'OptCutoff', 'Youden', 'ACC',
                       'TN', 'TP', 'FN', 'FP']

    # TODO: evaluate whether storing case_class on the instance would be
    # better than asking for it again in roc_curves().

    def __init__(self, data, case_class, signalthr, combithr=1, max_length=None, deal_NaN=None):
        """
        Compute the marker combinations, count their positive samples in each
        class and derive SE and SP for the given thresholds.

        Parameters
        ----------
        data: pandas.DataFrame
            labelled dataset with a 'Class' column; every column from the
            third one onwards is treated as a marker. The frame passed in is
            never modified.
        case_class: str
            which of the two classes of the dataset is the case class.
        signalthr: int or float
            value at or above which a marker is considered positive for a
            given observation.
        combithr: int
            number of positive markers (>= signalthr) required, within a
            combination, to consider that combination positive in a sample.
        max_length: int
            max combination length that is allowed (default: all markers).
        deal_NaN: str
            how to treat missing values (empty strings or NaN). 'impute'
            replaces each missing marker value with the median of that marker
            within the class of the observation; 'remove' drops every
            observation containing a missing value.

        Raises
        ------
        ValueError
            if deal_NaN is neither None, 'impute' nor 'remove', or if the
            case class / a control class cannot be found in data['Class'].
        """
        if deal_NaN is not None and deal_NaN not in ('impute', 'remove'):
            raise ValueError('deal_NaN must be "impute" or "remove"')

        classes = list(data['Class'].unique())
        other_classes = [label for label in classes if label != case_class]
        if len(other_classes) == len(classes):
            raise ValueError('case_class not found in the "Class" column')
        if not other_classes:
            raise ValueError('the "Class" column must contain a control class')
        # The first class that is not the case class acts as control.
        control_class = other_classes[0]

        markers = list(data.columns[2:])
        if deal_NaN is not None:
            data = self._clean_missing(data, markers, deal_NaN)
        self.training_data = data

        # Threshold every marker once instead of once per combination.
        positive = (data[markers] >= signalthr).to_numpy(dtype=bool)
        is_case = (data['Class'] == case_class).to_numpy()
        is_control = (data['Class'] == control_class).to_numpy()
        case_positive = positive[is_case]
        control_positive = positive[is_control]

        n_markers = len(markers)
        longest = n_markers if max_length is None else max_length
        names = []
        sizes = []
        counts = []
        for size in range(1, longest + 1):
            for subset in itertools.combinations(range(n_markers), size):
                cols = list(subset)
                names.append('-'.join(markers[i] for i in cols))
                sizes.append(size)
                counts.append([
                    int((case_positive[:, cols].sum(axis=1) >= combithr).sum()),
                    int((control_positive[:, cols].sum(axis=1) >= combithr).sum()),
                ])

        # Single markers keep their own name; longer combinations are numbered.
        labels = names[:n_markers] + [
            'Combination ' + str(i) for i in range(1, len(names) - n_markers + 1)
        ]
        case_col = '#Positives ' + case_class
        control_col = '#Positives ' + control_class
        tab = pd.DataFrame(counts, columns=[case_col, control_col],
                           index=pd.Index(labels, name='index'))
        tab.insert(0, 'Markers', names)
        tab['SE'] = (100 * tab[case_col] / int(is_case.sum())).round(1)
        tab['SP'] = (100 * (1 - tab[control_col] / int(is_control.sum()))).round(1)
        tab['# of markers'] = sizes
        self.tab = tab

    @staticmethod
    def _clean_missing(data, markers, deal_NaN):
        """Return data with missing values ('' or NaN) imputed or removed."""
        missing = data.isna() | (data == '')
        if not missing.to_numpy().any():
            return data

        if deal_NaN == 'remove':
            # A missing value in any column discards the whole observation.
            data = data.loc[~missing.any(axis=1)].reset_index(drop=True)
            for marker in markers:
                data[marker] = pd.to_numeric(data[marker])
            return data

        # 'impute': work on a copy so the caller's frame is left untouched.
        data = data.copy()
        for marker in markers:
            values = pd.to_numeric(data[marker].mask(missing[marker]))
            class_median = values.groupby(data['Class']).transform('median')
            data[marker] = values.fillna(class_median)
        return data

    def ranked_combs(self, min_SE=0, min_SP=0):
        """
        Rank combinations by Youden index and keep those reaching a min SE
        and/or SP. The result is stored in self.ranked; self.tab is not
        modified.

        Parameters
        ----------
        min_SE: int or float
            min value of SE that a combination must have to be included
        min_SP: int or float
            min value of SP that a combination must have to be included
        """
        # Select by name: positional selection breaks as soon as any extra
        # column is present in self.tab.
        ranked = self.tab[['SE', 'SP', '# of markers']].copy()
        ranked['Youden'] = (ranked['SE'] + ranked['SP'] - 100) / 100
        # Stable sort so that ties keep the order of self.tab.
        ranked = ranked.sort_values('Youden', ascending=False, kind='mergesort')
        keep = (ranked['SE'] >= min_SE) & (ranked['SP'] >= min_SP)
        self.ranked = ranked.loc[keep, :]

    def bubble_chart(self, min_SE=0, min_SP=0):
        """
        Plot a bubble chart showing gold combinations.

        Parameters
        ----------
        min_SE: int or float
            min value of SE that a combination must have to be included
        min_SP: int or float
            min value of SP that a combination must have to be included
        """
        # Work on a copy so that plotting never alters self.tab.
        chart_data = self.tab.copy()
        is_gold = (chart_data['SE'] >= min_SE) & (chart_data['SP'] >= min_SP)
        chart_data['Combo'] = pd.Series(
            np.where(is_gold, 'gold combination', 'below thresholdS'),
            index=chart_data.index,
        ).astype('category')
        bc = sns.scatterplot(data=chart_data, hue='Combo', x="SP", y="SE",
                             size="# of markers", alpha=0.7, sizes=(10, 600))
        bc.axhline(min_SE, color='black', ls='--', linewidth=1)
        bc.axvline(min_SP, color='black', ls='--', linewidth=1)
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    def train_models(self, combinations=None):
        """
        Fit a binomial Generalized Linear Model for each selected combination
        on self.training_data and store the fitted models in self.models.

        Parameters
        ----------
        combinations: list(str)
            the names (index labels of self.tab) of the selected
            combinations; a single name may be passed as a plain string.
        """
        if combinations is None:
            combinations = []
        elif isinstance(combinations, str):
            combinations = [combinations]
        else:
            combinations = list(combinations)

        marker_strings = self.tab.loc[combinations, 'Markers']
        models = {}
        for name in combinations:
            # Marker names are recovered by splitting on '-', so a marker
            # whose own name contains '-' cannot be handled here.
            terms = ['np.log(' + marker + '+1)'
                     for marker in marker_strings.loc[name].split('-')]
            # NOTE(review): 'Class' is handed to the formula as-is; which
            # label ends up as the modelled outcome is decided by the formula
            # machinery, not by case_class - confirm it is the case class.
            models[name] = smf.glm(formula="Class ~ " + '+'.join(terms),
                                   family=sm.families.Binomial(),
                                   data=self.training_data).fit()
        self.models = models

    @classmethod
    def _cutoff_table(cls, scores, is_case, is_control, n_rows, auc):
        """Return TP/FP/TN/FN, SE, SP, Youden and ACC at every candidate cutoff."""
        ordered = np.sort(scores)
        # Candidate cutoffs are the midpoints between consecutive scores.
        cutoffs = (ordered[:-1] + ordered[1:]) / 2
        case_scores = np.sort(scores[is_case])
        control_scores = np.sort(scores[is_control])
        # side='right' counts the scores <= cutoff, i.e. the predicted negatives.
        fn = np.searchsorted(case_scores, cutoffs, side='right')
        tn = np.searchsorted(control_scores, cutoffs, side='right')
        tp = case_scores.size - fn
        fp = control_scores.size - tn
        se = tp / case_scores.size
        sp = tn / control_scores.size
        return pd.DataFrame(
            {'AUC': auc, 'SE': se, 'SP': sp, 'OptCutoff': cutoffs,
             'Youden': se + sp - 1, 'ACC': (tp + tn) / n_rows,
             'TN': tn, 'TP': tp, 'FN': fn, 'FP': fp},
            columns=cls._REPORT_COLUMNS,
        )

    def roc_curves(self, case_class):
        """
        Compute, for every trained model, the ROC statistics at the cutoff
        with the highest Youden index (stored in self.reports) and plot the
        ROC curves.

        Parameters
        ----------
        case_class: str
            which of the two classes of the dataset is the case class.

        Raises
        ------
        ValueError
            if no model has been trained, or if a model yields fewer than two
            scores (no cutoff can be derived).
        """
        if not self.models:
            raise ValueError('no trained models: call train_models() first')

        data = self.training_data
        classes = data['Class'].unique()
        control_class = classes[classes != case_class][0]
        is_case = (data['Class'] == case_class).to_numpy()
        is_control = (data['Class'] == control_class).to_numpy()
        y = is_case.astype(int)

        best_rows = {}
        curves = {}  # per-combination coordinates of the ROC curve
        for comb, model in self.models.items():
            scores = np.asarray(model.predict())
            if scores.size < 2:
                raise ValueError('at least two scores are needed to derive cutoffs')
            auc = metrics.roc_auc_score(y, scores)
            table = self._cutoff_table(scores, is_case, is_control, data.shape[0], auc)
            # Keep the statistics at the cutoff with the highest Youden index.
            best_rows[comb] = table.loc[table['Youden'].idxmax()]
            curves[comb] = pd.DataFrame({'FPR': 1 - table['SP'], 'TPR': table['SE']})
        self.reports = pd.DataFrame(best_rows).transpose()[self._REPORT_COLUMNS]

        # Plotting
        _, ax = plt.subplots()
        for comb in sorted(curves):
            curves[comb].plot(x='FPR', y='TPR', ax=ax, label=comb)
        plt.title('Receiver Operating Characteristic')
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.legend()