Source code for pycroc.distr

import pandas as pd
import numpy as np

import matplotlib
from matplotlib import colors
from matplotlib import pyplot as plt
import seaborn as sns

from .bw_nrd0 import _bw_nrd0

class Distr:
    """
    Represent the distr object: everything related to the distribution of
    signal intensity values of a labelled dataset (see
    https://cran.r-project.org/web/packages/combiroc/vignettes/combiroc_standard.html#the-distr-object).

    Attributes
    ----------
    data_long: pandas.DataFrame
        the dataset in long format (one row per sample/marker pair)
    case_class: str
        which of the two classes of the dataset is the case class
    control_class: str
        which of the two classes of the dataset is the control class
    min_SE: int or float
        min value of SE (in percent) that a threshold must have to be
        considered by the threshold prediction
    min_SP: int or float
        min value of SP (in percent) that a threshold must have to be
        considered by the threshold prediction
    coord: pandas.DataFrame
        SE and SP in function of the signal intensity "threshold"
    optimal_threshold: float
        suggested signal threshold that guarantees SE >= min_SE and
        SP >= min_SP

    Methods
    -------
    markers_boxplot(self, ylim=(None, None), colors=[...])
        Draw a boxplot of the signal intensity values of each marker, for
        both the classes.
    markers_coord(self, min_SE=0, min_SP=0)
        Compute SE and SP in function of the signal threshold.
    markers_density(self, xlim=(0, None), ylim=(0, None), bw_adjust={...},
                    colors=[...], signalthr_prediction=False)
        Draw a density plot of the signal intensity values for both the
        classes, optionally with a suggested signal threshold.
    density_summary(self)
        Summarize the distribution of the signal values in the two classes.
    markers_ROC(self)
        Draw a ROC curve with SE and SP in function of the signal threshold.
    """

    # Attributes (class-level defaults, overwritten per instance)
    data_long = None
    case_class = None
    control_class = None
    min_SE = None
    min_SP = None
    coord = None
    optimal_threshold = None

    # Constructor
    def __init__(self, data, case_class):
        """
        Reshape a labelled dataset to long format and identify the classes.

        Parameters
        ----------
        data: pandas.DataFrame
            a labelled dataset. The first two columns are kept as
            identifiers and one of them must be named 'Class'; every other
            column is treated as a marker.
        case_class: str
            which of the two classes of the dataset is the case class
        """
        long_data = pd.melt(
            data,
            id_vars=data.columns[[0, 1]],
            value_vars=data.columns[2:],
            var_name='Markers',
            value_name='Values',
        )
        # Drop empty-string measurements. reset_index(drop=True) renumbers the
        # rows without materialising a temporary 'index' column, which would
        # collide with an identifier column that happens to be named 'index'.
        long_data = long_data.loc[long_data['Values'] != '', :].reset_index(drop=True)

        # To coordinate colors between plots, turn the class column into a
        # categorical.
        # NOTE(review): the original author flagged this conversion as causing
        # an error in later manipulations; not reproduced here - confirm.
        long_data['Class'] = long_data['Class'].astype('category')

        self.data_long = long_data
        self.case_class = case_class
        # The control class is the first observed class that is not the case.
        observed_classes = long_data['Class'].unique()
        self.control_class = observed_classes[observed_classes != case_class][0]

    # Helpers
    @staticmethod
    def _complete_bw_adjust(bw_adjust):
        """Return a copy of bw_adjust with 'nrd0' filled in for missing classes."""
        completed = dict(bw_adjust)
        for label in ('case', 'control'):
            completed.setdefault(label, 'nrd0')
        return completed

    # Methods
    def markers_boxplot(self, ylim=(None, None),
                        colors=[(1.0, 0.4980392156862745, 0.054901960784313725),
                                (0.12156862745098039, 0.4666666666666667, 0.7058823529411765)]):
        """
        Draw a boxplot showing the distribution of the signal intensity
        values of each marker singularly, for both the classes.

        The figure is drawn on the current axes but not shown; call
        matplotlib.pyplot.show() to display it.

        Parameters
        ----------
        ylim: tuple
            both the min and max value shown on the y axis. ylim=(min, max)
        colors:
            colors for the different levels of the hue variable. Accepted for
            API compatibility; this method currently does not apply it.
        """
        axes = sns.boxplot(x="Markers", y="Values", data=self.data_long, hue='Class')
        axes.set(ylim=ylim)

    def markers_coord(self, min_SE=0, min_SP=0):
        """
        Compute how many real positive samples would be found positive (SE)
        and how many real negative samples would be found negative (SP) in
        function of the signal threshold.

        The result is stored in self.coord with columns 'threshold', 'SE' and
        'SP' (percentages) and 'Youden', rounded to 2 decimals. min_SE and
        min_SP are only stored here; markers_density applies them when it
        predicts the signal threshold.

        Parameters
        ----------
        min_SE: int or float
            min value of SE that a threshold must have to be included
        min_SP: int or float
            min value of SP that a threshold must have to be included
        """
        values = self.data_long['Values']
        # Class membership and class sizes do not depend on the threshold, so
        # they are computed once instead of once per threshold.
        is_case = (self.data_long['Class'] == self.case_class).to_numpy()
        is_control = (self.data_long['Class'] == self.control_class).to_numpy()
        n_case = is_case.sum()
        n_control = is_control.sum()

        # Candidate thresholds are the midpoints between consecutive distinct
        # signal values.
        levels = np.sort(values.unique())
        thresholds = [(low + high) / 2 for low, high in zip(levels[:-1], levels[1:])]

        sensitivity = []
        specificity = []
        for cut in thresholds:
            # Both comparisons are evaluated explicitly (rather than negating
            # one) so that missing values count for neither side.
            above = (values > cut).to_numpy()
            at_or_below = (values <= cut).to_numpy()
            sensitivity.append((above & is_case).sum() / n_case)
            specificity.append((at_or_below & is_control).sum() / n_control)

        coord = pd.DataFrame({'threshold': thresholds, 'SE': sensitivity, 'SP': specificity})
        coord['Youden'] = coord['SE'] + coord['SP'] - 1
        coord['SE'] = coord['SE'] * 100
        coord['SP'] = coord['SP'] * 100

        # Threshold 0 is always reported first: every sample is called positive.
        origin = pd.DataFrame(np.array([0, 100, 0, 0]).reshape(1, 4),
                              columns=['threshold', 'SE', 'SP', 'Youden'])
        coord = pd.concat([origin, coord], ignore_index=True)

        self.min_SE = min_SE
        self.min_SP = min_SP
        self.coord = coord.round(2)

    def markers_density(self, xlim=(0, None), ylim=(0, None),
                        bw_adjust={'case': 'nrd0', 'control': 'nrd0'},
                        colors=[(1.0, 0.4980392156862745, 0.054901960784313725),
                                (0.12156862745098039, 0.4666666666666667, 0.7058823529411765)],
                        signalthr_prediction=False):
        """
        Draw a density plot showing the distribution of the signal intensity
        values for both the classes.

        In case of lack of an a priori known threshold the user can set
        signalthr_prediction=True. The method then stores in
        self.optimal_threshold, and marks on the plot, a "suggested signal
        threshold": the median of the thresholds at which SE and SP are
        greater than or equal to min_SE and min_SP. This requires
        markers_coord() to have been called first.

        Parameters
        ----------
        xlim: tuple
            both the min and max value shown on the x axis. xlim=(min, max)
        ylim: tuple
            both the min and max value shown on the y axis. ylim=(min, max)
        bw_adjust: dict(str: int, float or str)
            the smoothing bandwidth factor for the case and control class.
            The keys of the dictionary must be 'control' and/or 'case'; a
            missing key defaults to 'nrd0', which computes the factor with
            Silverman's 'rule of thumb'. The dictionary is not modified.
        colors:
            colors for the different levels of the hue variable. Accepted for
            API compatibility; this method currently does not apply it.
        signalthr_prediction: bool
            whether the signal threshold prediction has to be computed (True)
            or not (False)
        """
        class_column = self.data_long['Class']
        samples = {
            'case': np.array(self.data_long.loc[class_column == self.case_class, 'Values']),
            'control': np.array(self.data_long.loc[class_column == self.control_class, 'Values']),
        }

        # Resolve the bandwidths on a private copy. Writing them back into
        # bw_adjust would overwrite the shared default dictionary (and the
        # caller's own dictionary), so later calls would silently reuse a
        # bandwidth computed from another call's data.
        bandwidths = self._complete_bw_adjust(bw_adjust)
        for label in ('case', 'control'):
            if bandwidths[label] == 'nrd0':
                bandwidths[label] = _bw_nrd0(samples[label])
            kde = sns.kdeplot(samples[label], bw_adjust=bandwidths[label], gridsize=10000)

        kde.set(ylim=ylim, xlim=xlim)
        kde.legend([self.case_class, self.control_class])

        if signalthr_prediction:
            # Only thresholds that satisfy the min_SE/min_SP stored by
            # markers_coord() are candidates for the suggested threshold.
            eligible = self.coord.loc[(self.coord['SE'] >= self.min_SE)
                                      & (self.coord['SP'] >= self.min_SP), :]
            self.optimal_threshold = eligible.threshold.median()
            kde.axvline(self.optimal_threshold, color='black', ls='--')
            kde.text(self.optimal_threshold * 0.9, 0.00002,
                     str(round(self.optimal_threshold, 2)),
                     horizontalalignment='right', size='small', color='black')

    def density_summary(self):
        """
        Retrieve the summary of the signal value distributions in the two
        classes.

        Returns
        -------
        summary: pandas.DataFrame
            one row per class with the descriptive statistics of 'Values';
            'count' is replaced by the number of distinct rows of the first
            two (identifier) columns in that class.
        """
        # observed=True yields the same rows here (the categories are inferred
        # from the data) and does not depend on the pandas default.
        summary = self.data_long.groupby('Class', observed=True)['Values'].describe()
        # Fix the number of elements: describe() counts one row per
        # sample/marker pair, not one per sample.
        samples = self.data_long.iloc[:, 0:2].drop_duplicates()
        summary['count'] = samples.groupby('Class', observed=True).size()
        return summary

    def markers_ROC(self):
        """
        Show a ROC curve with SE and SP in function of the signal threshold.

        Reads self.coord, so markers_coord() must have been called first.
        """
        false_positive_rate = 1 - self.coord['SP'] / 100
        true_positive_rate = self.coord['SE'] / 100

        plt.title('Receiver Operating Characteristic')
        plt.plot(false_positive_rate, true_positive_rate)
        # Diagonal reference line.
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()