import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from numpy.linalg import matrix_rank as rank
import matplotlib.pyplot as plt
from .Preprocessing import smooth
# Single-step feature selection with the Monte Carlo uninformative variable elimination (MC-UVE) method
class MCUVE:
    def __init__(self, x, y, ncomp=2, nrep=500, testSize=0.2):
        self.x = x
        self.y = y
        # The number of latent components must not exceed the rank of the predictor matrix
        self.ncomp = min(ncomp, rank(x))
        self.nrep = nrep
        self.testSize = testSize
    def fit(self):
        PLSCoef = np.zeros((self.nrep, self.x.shape[1]))
        ss = ShuffleSplit(n_splits=self.nrep, test_size=self.testSize)
        step = 0
        for train, test in ss.split(self.x, self.y):
            xtrain = self.x[train, :]
            ytrain = self.y[train]
            plsModel = PLSRegression(n_components=min(self.ncomp, rank(xtrain)))
            plsModel.fit(xtrain, ytrain)
            # ravel() copes with either coef_ orientation across sklearn versions
            PLSCoef[step, :] = plsModel.coef_.ravel()
            step += 1
        meanCoef = np.mean(PLSCoef, axis=0)
        stdCoef = np.std(PLSCoef, axis=0)
        # stability criterion: mean of the resampled coefficients over their standard deviation
        self.criteria = meanCoef / stdCoef
        self.featureRank = np.argsort(-np.abs(self.criteria))
        return self
    def evalFeatures(self, cv=5):
        # evaluate nested feature subsets, from the single best-ranked variable upwards
        self.featureR2 = np.empty(self.x.shape[1])
        for i in range(self.x.shape[1]):
            xi = self.x[:, self.featureRank[:i + 1]]
            if i < self.ncomp:
                regModel = LinearRegression()
            else:
                regModel = PLSRegression(n_components=min(self.ncomp, rank(xi)))
            cvScore = cross_val_score(regModel, xi, self.y, cv=cv)
            self.featureR2[i] = np.mean(cvScore)
        return self
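# Example usage of MCUVE (a minimal sketch on synthetic data; the variable
# names and the synthetic spectra below are illustrative, not part of the library):
#
#     import numpy as np
#     rng = np.random.default_rng(0)
#     X = rng.normal(size=(100, 50))                # 100 samples x 50 variables
#     y = X[:, 10] + 0.5 * X[:, 20] + 0.1 * rng.normal(size=100)
#     mcuve = MCUVE(X, y, ncomp=2, nrep=100).fit()
#     print(mcuve.featureRank[:10])                 # ten most stable variables
#     mcuve.evalFeatures(cv=5)                      # cross-validated R2 of nested subsets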
# Feature selection with the randomization test (RT) method
class RT(MCUVE):
    def fit(self):
        # regression coefficients of the reference PLS model fitted on the true responses
        plsmodel0 = PLSRegression(n_components=self.ncomp)
        plsmodel0.fit(self.x, self.y)
        plsCoef0 = plsmodel0.coef_.ravel()
        # regression coefficients of models fitted on randomly permuted responses
        PLSCoef = np.zeros((self.nrep, self.x.shape[1]))
        for i in range(self.nrep):
            randomidx = np.random.permutation(self.x.shape[0])
            ytrain = self.y[randomidx]
            plsModel = PLSRegression(n_components=self.ncomp)
            plsModel.fit(self.x, ytrain)
            PLSCoef[i, :] = plsModel.coef_.ravel()
        # p-value-like criterion: how often a permutation coefficient exceeds the reference one
        criteria = np.sum(np.abs(PLSCoef) > np.abs(plsCoef0), axis=0) / self.nrep
        self.criteria = criteria
        self.featureRank = np.argsort(self.criteria)
        return self
    def evalFeatures(self, cv=5):
        # Note: a small p-value-like criterion indicates an important feature,
        # so featureRank is sorted in ascending order of the criterion
        self.featureR2 = np.empty(self.x.shape[1])
        for i in range(self.x.shape[1]):
            xi = self.x[:, self.featureRank[:i + 1]]
            if i < self.ncomp:
                regModel = LinearRegression()
            else:
                regModel = PLSRegression(n_components=min(self.ncomp, rank(xi)))
            cvScore = cross_val_score(regModel, xi, self.y, cv=cv)
            self.featureR2[i] = np.mean(cvScore)
        return self
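# Example usage of RT (a sketch assuming the synthetic X and y from the MCUVE
# example above; variables with the smallest criterion rank first):
#
#     rt = RT(X, y, ncomp=2, nrep=200).fit()
#     print(rt.featureRank[:10])    # variables least likely to be noise
#     rt.evalFeatures(cv=5)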
# Feature selection based on the criterion of C values
# Ref. [1] Zhang J., et al. A variable importance criterion for variable selection
# in near-infrared spectral analysis. Science China Chemistry, 2019, 62(2): 271-279.
class VC(RT):
    def fit(self, cv=5, isSmooth=True):
        nVar = self.x.shape[1]
        # binary design matrix recording which variables enter each random combination
        sampleMatrix = np.zeros((self.nrep, nVar), dtype=int)
        errVector = np.empty((self.nrep, 1))
        # The number of variables in each combination must be smaller than the total number of variables
        if nVar > self.ncomp:
            nSample = max(self.ncomp, nVar // 10)
        else:
            nSample = max(1, nVar - 1)
        sampleidx = np.arange(nVar)
        for i in range(self.nrep):
            sampleidx = shuffle(sampleidx)
            seli = sampleidx[:nSample]
            plsModel = PLSRegression(n_components=min(self.ncomp, rank(self.x[:, seli])))
            plsModel.fit(self.x[:, seli], self.y)
            sampleMatrix[i, seli] = 1
            yhati = cross_val_predict(plsModel, self.x[:, seli], self.y, cv=cv)
            errVector[i] = np.sqrt(mean_squared_error(self.y, yhati))
        # regress the cross-validation errors on the design matrix;
        # a strongly negative coefficient marks a variable whose presence lowers the error
        plsModel = PLSRegression(n_components=self.ncomp)
        plsModel.fit(sampleMatrix, errVector)
        self.criteria = plsModel.coef_.ravel()
        if self.nrep < 5 * self.x.shape[0] and isSmooth:
            self.criteria = smooth(polyorder=1).transform(self.criteria)
        self.featureRank = np.argsort(self.criteria)
        return self
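# Example usage of VC (a sketch assuming the synthetic X and y from the MCUVE
# example above; smoothing is disabled here to keep the sketch self-contained):
#
#     vc = VC(X, y, ncomp=2, nrep=500).fit(cv=5, isSmooth=False)
#     print(vc.featureRank[:10])    # variables with the most negative C values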
# Recursive feature selection based on the criterion of C values
class MSVC:
    def __init__(self, x, y, ncomp=1, nrep=7000, ncut=50, testSize=0.2):
        self.x = x
        self.y = y
        # The number of latent components must not exceed the rank of the predictor matrix
        self.ncomp = min(ncomp, rank(x))
        self.nrep = nrep
        self.ncut = ncut
        self.testSize = testSize
        self.criteria = np.full([ncut, self.x.shape[1]], np.nan)
        self.featureR2 = np.empty(ncut)
        self.selFeature = None
    def calcCriteria(self):
        varidx = np.arange(self.x.shape[1])
        # exponentially decreasing cut points, from all variables down to one
        ncuti = np.logspace(np.log(self.x.shape[1]), np.log(1), self.ncut, base=np.e)
        ncuti = np.round(ncuti).astype(int)
        for i in range(self.ncut):
            vcModel = VC(self.x[:, varidx], self.y, self.ncomp, nrep=self.nrep)
            vcModel.fit(isSmooth=False)
            self.criteria[i, varidx] = vcModel.criteria
            var_ranked = np.argsort(vcModel.criteria)
            # keep the best-ranked variables for the next, smaller cut
            if i < self.ncut - 1:
                varidx = varidx[var_ranked[:ncuti[i + 1]]]
    def evalCriteria(self, cv=3):
        for i in range(self.ncut):
            varSeli = ~np.isnan(self.criteria[i, :])
            xi = self.x[:, varSeli]
            if sum(varSeli) < self.ncomp:
                regModel = LinearRegression()
            else:
                regModel = PLSRegression(n_components=min(self.ncomp, rank(xi)))
            cvScore = cross_val_score(regModel, xi, self.y, cv=cv)
            self.featureR2[i] = np.mean(cvScore)
    def cutFeature(self, *args):
        # pick the cut with the best cross-validated R2 and apply its variable
        # selection to every matrix passed in that has matching columns
        cuti = np.argmax(self.featureR2)
        self.selFeature = ~np.isnan(self.criteria[cuti, :])
        if len(args) != 0:
            returnx = list(args)
            i = 0
            for argi in args:
                if argi.shape[1] == self.x.shape[1]:
                    returnx[i] = argi[:, self.selFeature]
                i += 1
            return tuple(returnx)
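# Example usage of MSVC (a sketch assuming the synthetic X and y from the MCUVE
# example above; nrep and ncut are illustrative, not recommended settings):
#
#     msvc = MSVC(X, y, ncomp=2, nrep=1000, ncut=20)
#     msvc.calcCriteria()
#     msvc.evalCriteria(cv=3)
#     (Xsel,) = msvc.cutFeature(X)
#     print(Xsel.shape, msvc.selFeature.sum())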
def plotFeatureSelection(wv, X, selFeatures, methodNames=None,
                         ylabel="Intensity (A.U.)",
                         xlabel="Wavelength (nm)",
                         title="Feature selection results",
                         ax=None):
    scale = 0.4
    offset = 1
    if ax is None:
        fig, ax = plt.subplots()
    ax.plot(wv, X.transpose())
    # a single selection: selFeatures is a flat sequence of variable indices
    if isinstance(selFeatures[0], (int, np.integer)):
        ax.eventplot(wv[selFeatures],
                     linelengths=scale * np.std(X),
                     lineoffsets=np.min(X) - (scale + offset) * np.std(X))
        if methodNames is not None:
            ax.text(wv[0] - 0.05 * (wv[-1] - wv[0]),
                    np.min(X) - (scale + offset) * np.std(X), methodNames)
    # several selections: one event row per method, stacked below the spectra
    else:
        for i in range(len(selFeatures)):
            ax.eventplot(wv[selFeatures[i]],
                         linelengths=scale * np.std(X),
                         lineoffsets=np.min(X) - (scale * 2 * i + offset) * np.std(X))
            if methodNames is not None:
                ax.text(wv[0] - 0.05 * (wv[-1] - wv[0]),
                        np.min(X) - (scale * 2 * i + offset) * np.std(X),
                        methodNames[i])
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
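# Example usage of plotFeatureSelection (a sketch assuming the synthetic X and
# the fitted mcuve and rt objects from the examples above; wv is an illustrative
# wavelength axis matching the columns of X):
#
#     wv = np.linspace(1000, 2500, X.shape[1])
#     plotFeatureSelection(wv, X,
#                          [mcuve.featureRank[:10], rt.featureRank[:10]],
#                          methodNames=["MC-UVE", "RT"])
#     plt.show()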