# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import polars as pl
import scipy as sp
from scipy.spatial.distance import pdist,squareform
from sklearn.base import BaseEstimator, TransformerMixin
from mapply.mapply import mapply
from sklearn.metrics import accuracy_score
from scientisttools import CA
##########################################################################################
# Discriminant Correspondence Analysis (DISCA)
##########################################################################################
[docs]class DISCA(BaseEstimator,TransformerMixin):
"""
Discriminant Correspondence Analysis (DISCA)
--------------------------------------------
Description
-----------
This class inherits from sklearn BaseEstimator and TransformerMixin class
Performs Discriminant Correspondence Analysis
Parameters:
----------
n_components : number of dimensions kept in the results
target : string, target variable
features : list of qualitative variables to be included in the analysis.
priors : The priors statement specifies the prior probabilities of group membership.
- "equal" to set the prior probabilities equal,
- "proportional" or "prop" to set the prior probabilities proportional to the sample sizes
- a pandas series which specify the prior probability for each level of the classification variable.
parallelize : boolean, default = False
If model should be parallelize
- If True : parallelize using mapply
- If False : parallelize using apply
Return
------
call_ : a dictionary with some statistics
ind_ : a dictionary of pandas dataframe containing all the results for the active individuals (coordinates)
var_ : a dictionary of pandas dataframe containing all the results for the active variables (coordinates, correlation between variables and axes, square cosine, contributions)
statistics_ : statistics
classes_ : classes informations
anova_ : analyse of variance
factor_model_ : correspondence analysis model
coef_ : discriminant correspondence analysis coefficients
model_ : string. The model fitted = 'disca'
Author(s)
---------
Duvérier DJIFACK ZEBAZE duverierdjifack@gmail.com
Notes:
------
https://bookdown.org/teddyswiebold/multivariate_statistical_analysis_using_r/discriminant-correspondence-analysis.html
https://search.r-project.org/CRAN/refmans/TExPosition/html/tepDICA.html
http://pbil.univ-lyon1.fr/ADE-4/ade4-html/discrimin.coa.html
https://rdrr.io/cran/ade4/man/discrimin.coa.html
https://stat.ethz.ch/pipermail/r-help/2010-December/263170.html
https://www.sciencedirect.com/science/article/pii/S259026012200011X
"""
def __init__(self,
             n_components = None,
             target = None,
             features = None,
             priors = None,
             parallelize = False):
    """
    Store the estimator hyper-parameters.

    Following the scikit-learn convention, ``__init__`` only records the
    arguments; all validation and computation happen in :meth:`fit`.
    """
    # Keep every constructor argument under the same attribute name so
    # BaseEstimator.get_params()/set_params() keep working.
    self.parallelize = parallelize
    self.priors = priors
    self.features = features
    self.target = target
    self.n_components = n_components
def fit(self,X):
    """
    Fit the Discriminant Correspondence Analysis model
    --------------------------------------------------

    Parameters
    ----------
    X : pandas/polars DataFrame
        Training data containing the target column and the categorical features.

    Returns
    -------
    self : object
        Fitted estimator.

    Raises
    ------
    TypeError
        If X is not a pandas (or polars) DataFrame, or if a feature is not categorical.
    ValueError
        If 'target' or 'priors' is missing or malformed.
    """
    # Accept polars input by converting it to pandas first
    if isinstance(X,pl.DataFrame):
        X = X.to_pandas()
    if not isinstance(X,pd.DataFrame):
        raise TypeError(
        f"{type(X)} is not supported. Please convert to a DataFrame with "
        "pd.DataFrame. For more information see: "
        "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")

    # mapply dispatch: -1 = use all cores, 1 = plain apply
    if self.parallelize:
        n_workers = -1
    else:
        n_workers = 1

    # 'target' must be a single-element list naming the class variable
    if self.target is None:
        raise ValueError("'target' must be assigned")
    elif not isinstance(self.target,list):
        raise ValueError("'target' must be a list")
    elif len(self.target)>1:
        raise ValueError("'target' must be a list of length one")

    # Flatten a MultiIndex header if present
    if X.columns.nlevels > 1:
        X.columns = X.columns.droplevel()

    # Split data into descriptors x and the class variable y
    y = X[self.target]
    x = X.drop(columns=self.target)

    # Set features labels/names (default: every non-target column)
    if self.features is None:
        features = x.columns.tolist()
    elif not isinstance(self.features,list):
        raise ValueError("'features' must be a list of variable names")
    else:
        features = self.features

    # Keep only the requested features and rebuild X accordingly
    x = x[features]
    X = pd.concat((x,y),axis=1)

    # Number of rows and of (categorical) variables
    n_samples, n_features = x.shape

    # Class levels (np.unique returns them sorted) and their number
    classes = np.unique(y).tolist()
    n_classes = len(classes)

    # DISCA is only defined for categorical descriptors
    all_cat = all(pd.api.types.is_string_dtype(x[col]) for col in x.columns)
    if not all_cat:
        raise TypeError("All features must be categoricals")

    # Chi-squared test of independence between each feature and the target
    chi2_test = pd.DataFrame(columns=["statistic","ddl","pvalue"],index=x.columns).astype("float")
    for col in x.columns:
        tab = pd.crosstab(x[col],y[self.target[0]])
        chi2 = sp.stats.chi2_contingency(observed=tab,correction=False)
        # chi2_contingency returns (statistic, pvalue, dof, expected_freq)
        chi2_test.loc[col,:] = [chi2[0], chi2[2], chi2[1]]
    chi2_test = chi2_test.sort_values(by=["pvalue"])
    chi2_test["ddl"] = chi2_test["ddl"].astype(int)
    statistics = {"chi2" : chi2_test}

    # Indicator (dummy) table of the descriptors
    dummies = pd.concat((pd.get_dummies(x[col],prefix=col,prefix_sep="_",dtype=float) for col in x.columns),axis=1)

    # Class x category count matrix M (one row per class, one column per dummy)
    M = pd.concat([y,dummies],axis=1).groupby(self.target).sum()

    # Correspondence Analysis on M
    global_ca = CA(n_components=self.n_components,parallelize=self.parallelize).fit(M)

    # Category counts over all classes
    n_l = M.sum(axis=0)
    # Marginal profile
    G = n_l/np.sum(n_l)
    # Row profiles of M
    profils = mapply(M,lambda x : x/np.sum(x),axis=1,progressbar=False,n_workers=n_workers)
    # Squared chi-squared distance between each class profile and the origin
    dist2 = mapply(profils,lambda x : np.sum((x-G.values)**2/G.values),axis=1,progressbar=False,n_workers=n_workers).to_frame("dist2")
    # Pairwise squared distances between class profiles
    dist = pd.DataFrame(squareform(pdist(profils,metric="seuclidean",V=G)**2),index=classes,columns=classes)

    # Per-category frequency statistics
    mod_stats = pd.concat([n_l,G],axis=1)
    mod_stats.columns = ["Frequence","Proportion"]
    statistics["categories"] = mod_stats

    # Class counts and proportions. Use the target column as a Series so the
    # result has a flat index that aligns with `priors` in the concat below
    # (DataFrame.value_counts would yield a MultiIndex and misalign).
    y_series = y[self.target[0]]
    n_k, p_k = y_series.value_counts(normalize=False), y_series.value_counts(normalize=True)

    # Prior probabilities of group membership
    if self.priors is None:
        raise ValueError("'priors' must be assigned")
    if isinstance(self.priors,str):
        # Check string options explicitly: testing a pandas Series with
        # `in ["proportional","prop"]` would raise an ambiguous-truth error.
        if self.priors in ("proportional","prop"):
            priors = p_k
        elif self.priors == "equal":
            priors = pd.Series([1/n_classes]*n_classes,index=classes)
        else:
            raise ValueError("'priors' must be 'equal', 'proportional'/'prop' or a pandas series")
    else:
        # User supplied priors (pandas Series): normalize them to sum to one
        priors = pd.Series([x/self.priors.sum() for x in self.priors.values],index=self.priors.index)

    # Store some informations about the call
    self.call_ = {"X" : X,
                  "target" : self.target[0],
                  "features" : features,
                  "n_features" : n_features,
                  "n_samples" : n_samples,
                  "n_classes" : n_classes,
                  "priors" : priors}

    # Class level information
    class_level_information = pd.concat([n_k,p_k,priors],axis=1)
    class_level_information.columns = ["Frequency","Proportion","Prior Probability"]
    statistics["information"] = class_level_information
    self.statistics_ = statistics

    # Canonical discriminant function coefficients
    coef = mapply(global_ca.col_["coord"],lambda col : col/(x.shape[1]*np.sqrt(global_ca.eig_.iloc[:global_ca.call_["n_components"],0])),
                    axis=1,progressbar=False,n_workers=n_workers)

    # Individuals coordinates obtained from the indicator table
    row_coord = dummies.dot(coef)
    self.ind_ = {"coord" : row_coord}

    # Total sum of squares per dimension
    tss = mapply(row_coord,lambda x : x**2,axis=0,progressbar=False,n_workers=n_workers).sum(axis=0)
    # Correlation ratio (NOTE(review): eig_ covers all dimensions while tss only
    # covers the kept components — alignment relies on matching labels; verify)
    eta2 = ((x.shape[0]*global_ca.eig_.iloc[:,0])/tss)
    eta2.name = "Eta2"

    # Class-level results from the CA, plus the distance summaries
    self.classes_ = {**global_ca.row_, "classes" : classes, "dist2" : dist2, "dist" : dist}

    # Category-level results
    self.var_ = global_ca.col_

    # Underlying correspondence analysis model
    self.factor_model_ = global_ca

    # DISCA coefficients (score function)
    self.coef_ = coef

    # Analysis of variance
    self.anova_ = {"Eta2" : eta2,"canonical_Eta2" : mapply(eta2,lambda x : np.sqrt(x),axis=0,progressbar=False,n_workers=n_workers)}
    self.model_ = "disca"
    return self
def decision_function(self,X):
    """
    Apply decision function to an array of samples
    ----------------------------------------------

    Parameters
    ----------
    X : DataFrame of shape (n_samples_, n_features)
        DataFrame of samples (test vectors).

    Returns
    -------
    C : DataFrame of shape (n_samples_,) or (n_samples_, n_classes)
        Decision function values related to each class, per sample
        (squared Euclidean distance to each class centre in factor space).
    """
    # Convert polars input to pandas
    if isinstance(X,pl.DataFrame):
        X = X.to_pandas()
    # Only pandas DataFrames are supported from here on
    if not isinstance(X,pd.DataFrame):
        raise TypeError(
        f"{type(X)} is not supported. Please convert to a DataFrame with "
        "pd.DataFrame. For more information see: "
        "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")

    # mapply dispatch: -1 = all cores, 1 = plain apply
    n_workers = -1 if self.parallelize else 1

    # Factor coordinates of the supplied individuals
    coord = self.transform(X)

    # For each individual, squared distance to every class centre
    centers = self.classes_["coord"]
    per_row = []
    for idx in coord.index:
        gaps = centers.sub(coord.loc[idx,:].values,axis="columns")
        sq_dist = mapply(gaps,lambda v : np.sum(v**2),axis=1,progressbar=False,n_workers=n_workers)
        per_row.append(sq_dist.to_frame(idx).rename_axis(None).T)
    return pd.concat(per_row,axis=0)
def predict_proba(self,X):
    """
    Estimate probability
    --------------------

    Parameters
    ----------
    X : DataFrame of shape (n_samples_,n_features_)
        Input data.

    Returns
    -------
    C : DataFrame of shape (n_samples_,n_classes_)
        Estimated probabilities of class membership.
    """
    # Convert polars input to pandas
    if isinstance(X,pl.DataFrame):
        X = X.to_pandas()
    if not isinstance(X,pd.DataFrame):
        raise TypeError(
        f"{type(X)} is not supported. Please convert to a DataFrame with "
        "pd.DataFrame. For more information see: "
        "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")

    # mapply dispatch: -1 = all cores, 1 = plain apply
    n_workers = -1 if self.parallelize else 1

    # Squared distances to the class centres
    scores = self.decision_function(X)

    # Generalized distance: scores - 2*log(prior), priors aligned on columns
    prior_term = 2*np.log(self.call_["priors"].to_frame(name="prior").T.loc[:,scores.columns].values)
    DG = scores.sub(prior_term,axis="columns")

    # Softmax-like transform: exp(-DG/2), then normalize each row to sum to 1
    kernel = mapply(DG,lambda v : np.exp(-0.5*v),axis=0,progressbar=False,n_workers=n_workers)
    C = mapply(kernel,lambda v : v/np.sum(v),axis=1,progressbar=False,n_workers=n_workers)
    return C
def predict(self,X):
    """
    Predict class labels for samples in X
    -------------------------------------

    Parameters
    ----------
    X : DataFrame of shape (n_samples_, n_features_)
        The data matrix for which we want to get the predictions.

    Returns
    -------
    y_pred : pandas Series of shape (n_samples,)
        Predicted class label for each sample.
    """
    # Accept polars input, consistent with fit/decision_function/predict_proba
    # (previously this method rejected polars DataFrames outright)
    if isinstance(X,pl.DataFrame):
        X = X.to_pandas()
    if not isinstance(X,pd.DataFrame):
        raise TypeError(
        f"{type(X)} is not supported. Please convert to a DataFrame with "
        "pd.DataFrame. For more information see: "
        "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html")
    predict_proba = self.predict_proba(X)
    # Pick, per row, the label of the most probable class. np.unique sorts the
    # labels — assumed to match predict_proba's column order (TODO confirm).
    predict = np.unique(self.classes_["classes"])[np.argmax(predict_proba.values,axis=1)]
    return pd.Series(predict,index=X.index,name="prediction")
def score(self, X, y, sample_weight=None):
    """
    Return the mean accuracy on the given test data and labels
    ----------------------------------------------------------

    In multi-label classification, this is the subset accuracy
    which is a harsh metric since you require for each sample that
    each label set be correctly predicted.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Test samples.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        True labels for `X`.
    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    score : float
        Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
    """
    # Delegate prediction, then let sklearn compute the (weighted) accuracy
    y_pred = self.predict(X)
    return accuracy_score(y, y_pred, sample_weight=sample_weight)
def pred_table(self):
    """
    Prediction table
    ----------------

    Cross-tabulates observed vs. predicted classes on the training data.

    Notes
    -----
    pred_table[i,j] refers to the number of times "i" was observed and the
    model predicted "j". Correct predictions are along the diagonal.
    """
    # Re-predict on the training data stored at fit time
    training_data = self.call_["X"]
    predicted = self.predict(training_data)
    observed = training_data[self.call_["target"]]
    return pd.crosstab(observed,predicted)