Source code for pynir.OutlierDection
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 28 11:00:35 2022
@author: chinn
"""
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from scipy.stats import f, chi2
import matplotlib.pyplot as plt
[docs]class outlierDection_PLS():
# Ref1: https://nirpyresearch.com/outliers-detection-pls-regression-nir-spectroscopy-python/
# Ref2: https://www.sciencedirect.com/science/article/pii/S0378517314004980
def __init__(self, ncomp = 2, conf = 0.99):
self.ncomp = ncomp
self.conf = conf
[docs] def fit(self, X, y):
ncomp = self.ncomp
self.plsModel = PLSRegression(n_components=ncomp)
self.plsModel.fit(X, y)
return self
[docs] def detect(self, X, y):
ncomp = self.ncomp
conf = self.conf
plsModel = self.plsModel
# Get X scores
T = plsModel.transform(X)
# Calculate error array
Err = X - plsModel.inverse_transform(T)
# Calculate Q-residuals (sum over the rows of the error array)
Q = np.sum(Err**2, axis=1)
# Estimate the confidence level for the Q-residuals
Q_conf = (np.var(Q)/2/np.mean(Q))*chi2.ppf(conf, 2*np.mean(Q)**2/np.var(Q))
# Calculate Hotelling's T-squared (note that data are normalised by default)
Tsq = np.sum((plsModel.x_scores_/np.std(plsModel.x_scores_, axis=0))**2, axis=1)
# Calculate confidence level for T-squared from the ppf of the F distribution
Tsq_conf = f.ppf(q=conf,dfn=ncomp,dfd=X.shape[0]-ncomp)
Tsq_conf = Tsq_conf*ncomp*(X.shape[0]-1)*(X.shape[0]+1)/X.shape[0]/(X.shape[0]-ncomp)
idxOutlier = np.logical_and(Q>Q_conf, Tsq>Tsq_conf)
return Q, Tsq, Q_conf, Tsq_conf, idxOutlier
[docs] def plot_HotellingT2_Q(self, Q, Tsq, Q_conf, Tsq_conf, ax=None):
if ax ==None:
fig, ax = plt.subplots(figsize=(8,4.5))
ax.plot(Tsq, Q, 'o')
ax.plot([Tsq_conf,Tsq_conf],[plt.axis()[2],plt.axis()[3]], '--')
ax.plot([plt.axis()[0],plt.axis()[1]],[Q_conf,Q_conf], '--')
ax.set_xlabel("Hotelling's T-squared")
ax.set_ylabel('Q residuals')
plt.show()