import pandas as pd
import numpy as np
import matplotlib
from matplotlib import colors
from matplotlib import pyplot as plt
import seaborn as sns
from .bw_nrd0 import _bw_nrd0
class Distr:
    """
    Distribution of the signal intensity values of a labelled pycroc dataset.

    Python counterpart of the combiroc ``distr`` object (see
    https://cran.r-project.org/web/packages/combiroc/vignettes/combiroc_standard.html#the-distr-object).

    Attributes
    ----------
    data_long: pandas.DataFrame
        the dataset in long format: the two leading id columns of the input
        followed by 'Markers' and 'Values'
    case_class: str
        the class of the dataset that is treated as the case class
    control_class: str
        the class of the dataset that is treated as the control class
    min_SE: int or float
        min SE (in percent) a threshold must reach to be considered by the
        threshold prediction; set by markers_coord()
    min_SP: int or float
        min SP (in percent) a threshold must reach to be considered by the
        threshold prediction; set by markers_coord()
    coord: pandas.DataFrame
        SE and SP as a function of the signal intensity threshold; set by
        markers_coord()
    optimal_threshold: float
        suggested signal threshold with SE >= min_SE and SP >= min_SP; set by
        markers_density(signalthr_prediction=True)

    Methods
    -------
    markers_boxplot(self, ylim=(None, None), colors=None)
        Plot a boxplot of the signal intensity values of each marker, for
        both classes.
    markers_coord(self, min_SE=0, min_SP=0)
        Compute SE and SP as a function of the signal threshold.
    markers_density(self, xlim=(0, None), ylim=(0, None), bw_adjust=None, colors=None, signalthr_prediction=False)
        Plot the density of the signal intensity values of both classes.
    density_summary(self)
        Return a summary of the distribution of the signal values in the
        two classes.
    markers_ROC(self)
        Plot the ROC curve described by coord.
    """

    # Colours used when a plotting method is called without ``colors``.
    # Kept as an immutable tuple so the default can never be altered by a call.
    _DEFAULT_COLORS = (
        (1.0, 0.4980392156862745, 0.054901960784313725),
        (0.12156862745098039, 0.4666666666666667, 0.7058823529411765),
    )

    # Attributes (class-level defaults, replaced on the instance once computed)
    data_long = None
    case_class = None
    control_class = None
    min_SE = None
    min_SP = None
    coord = None
    optimal_threshold = None

    # Constructor
    def __init__(self, data, case_class):
        """
        Reshape a labelled dataset to long format and identify the classes.

        Parameters
        ----------
        data: pandas.DataFrame
            a labelled pycroc dataset. Its first two columns are kept as id
            columns and one of them must be named 'Class'; every other column
            is treated as a marker.
        case_class: str
            a string that specifies which of the two classes of the dataset
            is the case class

        Raises
        ------
        ValueError
            if case_class does not occur in the 'Class' column, or if no
            other class is available to act as the control class.
        """
        long_format = pd.melt(
            data,
            id_vars=list(data.columns[:2]),
            value_vars=list(data.columns[2:]),
            var_name='Markers',
            value_name='Values',
        )
        # Drop the empty-string placeholders. reset_index(drop=True) returns
        # a fresh frame, so the column assignments below do not write into a
        # slice of the melted frame.
        not_blank = long_format['Values'] != ''
        long_format = long_format.loc[not_blank].reset_index(drop=True)
        # A column that carried '' placeholders is still of object dtype after
        # the filter; make it numeric so the summaries and plots work on numbers.
        if long_format['Values'].dtype == object:
            long_format['Values'] = pd.to_numeric(long_format['Values'])
        # Categorical classes give the hue levels a fixed order, which keeps
        # the class colours coordinated between plots.
        long_format['Class'] = long_format['Class'].astype('category')

        observed_classes = list(long_format['Class'].unique())
        if case_class not in observed_classes:
            raise ValueError(
                'case_class ' + repr(case_class) + ' not found in the Class column; '
                'available classes: ' + repr(observed_classes)
            )
        other_classes = [label for label in observed_classes if label != case_class]
        if not other_classes:
            raise ValueError(
                'the dataset contains no class other than ' + repr(case_class)
                + ' to use as control class'
            )

        self.data_long = long_format
        self.case_class = case_class
        # First class, in order of appearance, that is not the case class.
        self.control_class = other_classes[0]

    # Private helpers
    def _require_coord(self, requested_by):
        """Raise a clear error when the SE/SP table has not been computed yet."""
        if self.coord is None:
            raise RuntimeError(
                requested_by + ' needs the SE/SP table: call markers_coord() first'
            )

    def _class_colors(self, colors):
        """Return a {class label: colour} mapping that follows the category order of 'Class'."""
        if colors is None:
            colors = list(self._DEFAULT_COLORS)
        if isinstance(colors, dict):
            return colors
        levels = list(self.data_long['Class'].astype('category').cat.categories)
        return dict(zip(levels, sns.color_palette(colors, n_colors=len(levels))))

    # Methods
    def markers_boxplot(self, ylim=(None, None), colors=None):
        """
        Plot a boxplot showing the distribution of the signal intensity values of each marker singularly, for both the classes.

        Parameters
        ----------
        ylim: tuple
            a tuple that specifies both the min and max value shown on the y axis. ylim=(min, max)
        colors:
            colors to use for the different levels of the hue variable. Should be something that can be interpreted by seaborn.color_palette(), or a dictionary mapping hue levels to matplotlib colors. None (default) uses the class default pair of colors.
        """
        palette = list(self._DEFAULT_COLORS) if colors is None else colors
        axes = sns.boxplot(x='Markers', y='Values', hue='Class',
                           data=self.data_long, palette=palette)
        axes.set(ylim=ylim)

    def markers_coord(self, min_SE=0, min_SP=0):
        """
        Compute how many real positive samples would be found positive (SE) and how many real negative samples would be found negative (SP) as a function of the signal threshold.

        The candidate thresholds are the midpoints between consecutive
        distinct signal values. The result is stored in ``self.coord`` with
        columns 'threshold', 'SE' and 'SP' (both in percent) and 'Youden'
        (SE + SP - 1 on the 0-1 scale), rounded to 2 decimals and preceded by
        a row with threshold 0, SE 100 and SP 0. Nothing is returned.

        Parameters
        ----------
        min_SE: int or float
            a numeric value that specifies the min value of SE that a threshold must have to be included
        min_SP: int or float
            a numeric value that specifies the min value of SP that a threshold must have to be included

        Notes
        -----
        min_SE and min_SP are only stored here; the filtering happens when
        markers_density() predicts the signal threshold.
        """
        values = np.asarray(self.data_long['Values'])
        # The class masks do not depend on the threshold: build them once.
        case_values = values[np.asarray(self.data_long['Class'] == self.case_class)]
        control_values = values[np.asarray(self.data_long['Class'] == self.control_class)]

        levels = np.sort(self.data_long['Values'].unique())
        thresholds = (levels[:-1] + levels[1:]) / 2

        # Case samples above / control samples at or below each threshold.
        case_above = np.array(
            [np.count_nonzero(case_values > cut) for cut in thresholds], dtype=float)
        control_below = np.array(
            [np.count_nonzero(control_values <= cut) for cut in thresholds], dtype=float)

        table = pd.DataFrame({
            'threshold': thresholds,
            'SE': case_above / case_values.size,
            'SP': control_below / control_values.size,
        })
        # Youden index is computed on fractions, before SE/SP become percentages.
        table['Youden'] = table['SE'] + table['SP'] - 1
        table['SE'] = table['SE'] * 100
        table['SP'] = table['SP'] * 100

        origin = pd.DataFrame([[0, 100, 0, 0]],
                              columns=['threshold', 'SE', 'SP', 'Youden'])
        table = pd.concat([origin, table], ignore_index=True).round(2)

        self.min_SE = min_SE
        self.min_SP = min_SP
        self.coord = table

    def markers_density(self, xlim=(0, None), ylim=(0, None), bw_adjust=None,
                        colors=None, signalthr_prediction=False):
        """
        Plot the density of the signal intensity values for both the classes.

        When no threshold is known a priori, set signalthr_prediction=True:
        the method then stores in ``self.optimal_threshold`` a "suggested
        signal threshold", the median of the thresholds in ``self.coord`` at
        which SE and SP are greater than or equal to min_SE and min_SP, and
        marks it on the plot. This requires markers_coord() to have been
        called first.

        Parameters
        ----------
        xlim: tuple
            a tuple that specifies both the min and max value shown on the x axis. xlim=(min, max)
        ylim: tuple
            a tuple that specifies both the min and max value shown on the y axis. ylim=(min, max)
        bw_adjust: dict(str: int, float or str)
            specify the smoothing bandwidth factor for both case and control class. The keys of the dictionary must be 'control' and/or 'case'; a missing key, or None (default), means 'nrd0'. The value 'nrd0' specifies Silverman's 'rule of thumb' as smoothing bandwidth factor calculation. The dictionary passed in is not modified.
        colors:
            colors to use for the two classes. Should be something that can be interpreted by seaborn.color_palette(), in which case the colors are assigned following the category order of the classes (the hue order of markers_boxplot), or a dictionary mapping class labels to matplotlib colors. None (default) uses the class default pair of colors.
        signalthr_prediction: bool
            a bool to specify if signal threshold prediction has to be computed (True) or not (False)

        Raises
        ------
        RuntimeError
            if signalthr_prediction is requested before markers_coord() was called.
        """
        if signalthr_prediction:
            # Fail before drawing anything rather than half-way through the plot.
            self._require_coord('signalthr_prediction=True')

        values = self.data_long['Values']
        labels = self.data_long['Class']
        samples = {
            'case': np.array(values[labels == self.case_class]),
            'control': np.array(values[labels == self.control_class]),
        }
        class_names = {'case': self.case_class, 'control': self.control_class}
        # Work on a private copy: the resolved bandwidths must not leak into
        # the caller's dictionary or into later calls.
        bandwidths = dict(bw_adjust) if bw_adjust is not None else {}
        palette = self._class_colors(colors)

        kde = None
        for key in ('case', 'control'):
            factor = bandwidths.get(key, 'nrd0')
            if isinstance(factor, str) and factor == 'nrd0':
                factor = _bw_nrd0(samples[key])
            kde = sns.kdeplot(samples[key], bw_adjust=factor, gridsize=10000,
                              color=palette.get(class_names[key]))
        kde.set(ylim=ylim, xlim=xlim)
        kde.legend([self.case_class, self.control_class])

        if signalthr_prediction:
            meets_minimums = ((self.coord['SE'] >= self.min_SE)
                              & (self.coord['SP'] >= self.min_SP))
            self.optimal_threshold = self.coord.loc[meets_minimums, 'threshold'].median()
            kde.axvline(self.optimal_threshold, color='black', ls='--')
            kde.text(self.optimal_threshold * 0.9, 0.00002,
                     str(round(self.optimal_threshold, 2)),
                     horizontalalignment='right', size='small', color='black')

    def density_summary(self):
        """
        Retrieve the summary of signal values distributions in the two classes.

        Returns
        -------
        summary: pandas.DataFrame
            a DataFrame that summarizes the distributions of values in the two classes. Its 'count' column holds the number of distinct rows of the two id columns per class instead of the number of measured values.
        """
        summary = self.data_long.groupby('Class', observed=False)['Values'].describe()
        # describe() counts one entry per marker value; replace it with the
        # number of distinct (id, class) pairs of each class.
        samples = self.data_long.iloc[:, 0:2].drop_duplicates()
        summary['count'] = samples.groupby('Class', observed=False).size()
        return summary

    def markers_ROC(self):
        """
        Show a ROC curve with SE and SP as a function of the signal threshold.

        Raises
        ------
        RuntimeError
            if markers_coord() has not been called yet.
        """
        self._require_coord('markers_ROC()')
        # coord stores percentages: convert back to rates for the ROC axes.
        false_positive_rate = abs((self.coord['SP'] / 100) - 1)
        true_positive_rate = self.coord['SE'] / 100

        plt.title('Receiver Operating Characteristic')
        plt.plot(false_positive_rate, true_positive_rate)
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()