# -*- coding: utf-8 -*-
"""
Created on Wed Sep 28 11:00:35 2022
@author: chinn
"""
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist
from sklearn.cross_decomposition import PLSRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
import matplotlib.pyplot as plt
class pls:
"""
Partial Least Squares (PLS) regression model.
Parameters
----------
n_components : int, optional (default=2)
The number of PLS components to use.
Attributes
----------
model : dict
The PLS model, containing the following keys:
- 'x_scores': the X scores
- 'x_loadings': the X loadings
- 'y_loadings': the Y loadings
- 'x_weights': the X weights
- 'B': the regression coefficients
optLV : int
The optimal number of PLS components, determined by cross-validation.
Methods
-------
fit(X, y)
Fit the PLS model to the training data.
predict(Xnew, n_components=None)
Predict the response variable for new data.
crossValidation_predict(nfold=10)
Perform cross-validation and return the predicted response variable.
get_optLV(nfold=10)
Determine the optimal number of PLS components using cross-validation.
transform(Xnew)
Transform new data into the PLS space.
get_vip()
Compute the variable importance in projection (VIP) scores.
plot_prediction(y, yhat, xlabel="Reference", ylabel="Prediction", title="", ax=None)
Plot the predicted response variable against the reference variable.
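Examples
--------
A minimal, illustrative sketch on synthetic data (assumed shapes:
``X`` is (n_samples, n_features), ``y`` is (n_samples,)):

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> X = rng.normal(size=(50, 8))
>>> y = X[:, 0] - 2 * X[:, 1] + 0.05 * rng.normal(size=50)
>>> model = pls(n_components=5).fit(X, y)
>>> optLV = model.get_optLV(nfold=5)
>>> yhat = model.predict(X, n_components=optLV)
>>> yhat.shape
(50,)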
"""
def __init__(self, n_components=2):
self.n_components = n_components
def fit(self, X, y):
"""
Fit the PLS model to the training data.
Parameters
----------
X : numpy.ndarray
The independent variable matrix.
y : numpy.ndarray
The dependent variable vector.
Returns
-------
self : pls
The fitted PLS model.
"""
self.X = X
self.y = y
meanX = np.mean(X, axis=0)
meany = np.mean(y)
Xcentered = X - meanX
ycentered = y - meany
model = simpls(Xcentered, ycentered, self.n_components)
intercept = meany - np.dot(meanX, model['B'])  # intercept row so B applies to uncentered data
model['B'] = np.append(intercept[np.newaxis, :], model['B'], axis=0)
self.model = model
return self
def predict(self, Xnew, n_components=None):
"""
Predict the response variable for new data.
Parameters
----------
Xnew : numpy.ndarray
The new independent variable matrix.
n_components : int, optional
The number of PLS components to use (default is None, which uses all components).
Returns
-------
ynew_hat : numpy.ndarray
The predicted response variable.
"""
if n_components is None:
B = self.model['B'][:, -1]
else:
B = self.model['B'][:, n_components-1]
if Xnew.shape[1] != B.shape[0]-1:
raise ValueError(
'The number of features in Xnew is inconsistent with that of the training data.')
Xnew = np.append(np.ones([Xnew.shape[0], 1]), Xnew, axis=1)
ynew_hat = np.dot(Xnew, B)
return ynew_hat
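# The class docstring lists transform(Xnew), which is missing from this
# extracted source. A minimal sketch, relying on the SIMPLS property that
# the stored weights map centered data directly to scores (t = X0 @ r):
def transform(self, Xnew):
"""
Transform new data into the PLS score space (sketch; see class docstring).
Parameters
----------
Xnew : numpy.ndarray
The new independent variable matrix.
Returns
-------
scores : numpy.ndarray
The projected X scores.
"""
return np.dot(Xnew - np.mean(self.X, axis=0), self.model['x_weights'])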
def crossValidation_predict(self, nfold=10):
"""
Perform cross-validation and return the predicted response variable.
Parameters
----------
nfold : int, optional (default=10)
The number of folds to use in cross-validation.
Returns
-------
yhat : numpy.ndarray
The predicted response variable.
"""
X = self.X
y = self.y
yhat = np.zeros((y.shape[0], self.n_components))
model = pls(n_components=self.n_components)
for train, test in KFold(n_splits=nfold).split(X):
model.fit(X[train, :], y[train])
yhat[test, :] = model.predict(
X[test, :], np.arange(self.n_components)+1)  # one column per candidate LV count
return yhat
def get_optLV(self, nfold=10):
"""
Determine the optimal number of PLS components using cross-validation.
Parameters
----------
nfold : int, optional (default=10)
The number of folds to use in cross-validation.
Returns
-------
optLV : int
The optimal number of PLS components.
"""
yhat_cv = self.crossValidation_predict(nfold)
rmsecv = []
r2cv = []
for i in range(yhat_cv.shape[1]):
reportcv = regressionReport(self.y, yhat_cv[:, i])
rmsecv.append(reportcv["rmse"])
r2cv.append(reportcv["r2"])
optLV = int(np.argmin(rmsecv)+1)
self.optLV = optLV
return optLV
def get_vip(self):
"""
Compute the variable importance in projection (VIP) scores.
Returns
-------
vipScore : numpy.ndarray
The VIP scores.
References
----------
https://www.sciencedirect.com/topics/engineering/variable-importance-in-projection
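Notes
-----
With ``A`` components, scores ``t_a``, X weights ``w_a`` and y loadings
``q_a``, the VIP score of variable ``j`` (of ``p`` variables) is

.. math::

    VIP_j = \sqrt{\frac{p \sum_{a=1}^{A} (q_a^2 t_a^T t_a) (w_{ja} / \|w_a\|)^2}{\sum_{a=1}^{A} q_a^2 t_a^T t_a}}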
"""
x_scores, x_loadings, y_loadings, x_weights = \
self.model['x_scores'], self.model['x_loadings'],\
self.model['y_loadings'], self.model['x_weights']
n_samples, n_components = x_scores.shape
W0 = x_weights / np.sqrt(np.sum(x_weights**2, axis=0))
p = x_loadings.shape[0]
sumSq = np.sum(x_scores**2, axis=0) * np.sum(y_loadings**2, axis=0)  # q_a^2 * t_a^T t_a per component
vipScore = np.sqrt(p * np.sum(sumSq * (W0**2), axis=1) / np.sum(sumSq))
return vipScore
def plot_prediction(self, y, yhat, xlabel="Reference", ylabel="Prediction", title="", ax=None):
"""
Plot the predicted response variable against the reference variable.
Parameters
----------
y : numpy.ndarray
The reference variable.
yhat : numpy.ndarray
The predicted response variable.
xlabel : str, optional (default="Reference")
The label for the x-axis.
ylabel : str, optional (default="Prediction")
The label for the y-axis.
title : str, optional (default="")
The title for the plot.
ax : matplotlib.axes.Axes, optional
The axes on which to plot the figure (default is None).
Returns
-------
ax : matplotlib.axes.Axes
The axes object containing the plotted figure.
"""
report = regressionReport(y, yhat)
if ax is None:
fig, ax = plt.subplots()
ax.plot([np.min(y)*0.95, np.max(y)*1.05], [np.min(y)*0.95, np.max(y)*1.05],
color='black', label="y=x")
ax.scatter(y, yhat, color='tab:green', marker='*', label='Prediction')
ax.text(0.7, 0.03,
"RMSEP = {:.4f}\nR$^2$ = {:.2}".format(
report["rmse"], report["r2"]),
transform=ax.transAxes)
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
ax.legend(loc='upper left', bbox_to_anchor=(0.0, 1.0))
ax.set_title(title)
return ax
class plsda(PLSRegression):
"""
Partial Least Squares Discriminant Analysis (PLS-DA) model.
This class extends the scikit-learn PLSRegression class to include
Linear Discriminant Analysis (LDA) for classification.
Parameters
----------
n_components : int, optional (default = 2)
Number of components to keep in the model.
scale : bool, optional (default = True)
Whether to scale the data before fitting the model.
**kwargs : dict, optional
Additional keyword arguments to pass to the PLSRegression constructor.
Attributes
----------
lda : LinearDiscriminantAnalysis
The LDA model used for classification.
Methods
-------
fit(X, y)
Fit the PLS-DA model to the training data.
predict(X)
Predict the class labels for new data.
predict_log_proba(X)
Predict the log probabilities of the class labels for new data.
predict_proba(X)
Predict the probabilities of the class labels for new data.
crossValidation_predict(nfold=10)
Perform cross-validation to predict the class labels for the training data.
get_optLV(nfold=10)
Find the optimal number of components using cross-validation.
get_confusion_matrix(X, y)
Compute the confusion matrix for the model.
get_vip()
Compute the Variable Importance in Projection (VIP) scores for the model.
permutation_test(X, y, n_repeats=100, n_jobs=None)
Perform a permutation test to assess the significance of the model.
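Examples
--------
A minimal sketch on synthetic two-class data (``X`` is
(n_samples, n_features), ``y`` holds integer class labels):

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> X = rng.normal(size=(40, 6))
>>> y = (X[:, 0] > 0).astype(int)
>>> model = plsda(n_components=2).fit(X, y)
>>> model.get_confusion_matrix(X, y).shape
(2, 2)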
"""
def __init__(self, n_components=2, scale=True, **kwargs):
super().__init__(n_components=n_components, scale=scale, **kwargs)
self.lda = LinearDiscriminantAnalysis()
def fit(self, X, y):
"""
Fit the PLS-DA model to the training data.
Parameters
----------
X : numpy.ndarray
The training data matrix.
y : numpy.ndarray
The target variable vector.
Returns
-------
self : plsda
The fitted PLS-DA model.
"""
self.X = X
self.y = y
super().fit(X, y)
self.lda.fit(self.x_scores_, y)
return self
def predict(self, X):
"""
Predict the class labels for new data.
Parameters
----------
X : numpy.ndarray
The new data matrix.
Returns
-------
y_pred : numpy.ndarray
The predicted class labels.
"""
return self.lda.predict(self.transform(X))
def predict_log_proba(self, X):
"""
Predict the log probabilities of the class labels for new data.
Parameters
----------
X : numpy.ndarray
The new data matrix.
Returns
-------
log_proba : numpy.ndarray
The log probabilities of the class labels.
"""
return self.lda.predict_log_proba(self.transform(X))  # LDA was fit on PLS scores
def predict_proba(self, X):
"""
Predict the probabilities of the class labels for new data.
Parameters
----------
X : numpy.ndarray
The new data matrix.
Returns
-------
proba : numpy.ndarray
The probabilities of the class labels.
"""
return self.lda.predict_proba(self.transform(X))  # LDA was fit on PLS scores
def crossValidation_predict(self, nfold=10):
"""
Perform cross-validation to predict the class labels for the training data.
Parameters
----------
nfold : int, optional (default = 10)
The number of folds to use in cross-validation.
Returns
-------
yhat : numpy.ndarray
The predicted class labels for each fold and each number of components.
"""
X = self.X
y = self.y
yhat = np.zeros((y.shape[0], self.n_components))
for i in range(self.n_components):
model = plsda(n_components=i+1)  # refit for each candidate number of LVs
for train, test in KFold(n_splits=nfold).split(X):
model.fit(X[train, :], y[train])
yhat[test, i] = model.predict(X[test, :])
return yhat
def get_optLV(self, nfold=10):
"""
Find the optimal number of components using cross-validation.
Parameters
----------
nfold : int, optional (default = 10)
The number of folds to use in cross-validation.
Returns
-------
optLV : int
The optimal number of components.
"""
yhat_cv = self.crossValidation_predict(nfold)
accuracy_cv = []
for i in range(yhat_cv.shape[1]):
if len(self.lda.classes_) == 2:
report_cv = binaryClassificationReport(self.y, yhat_cv[:, i])
accuracy_cv.append(report_cv["accuracy"])
elif len(self.lda.classes_) > 2:
report_cv = multiClassificationReport(self.y, yhat_cv[:, i])
accuracy_tmp = [rep["accuracy"] for rep in report_cv.values()]
accuracy_cv.append(np.mean(accuracy_tmp))  # average one-vs-rest accuracy
optLV = int(np.argmax(accuracy_cv)+1)
self.optLV = optLV
return optLV
def get_confusion_matrix(self, X, y):
"""
Compute the confusion matrix for the model.
Parameters
----------
X : numpy.ndarray
The data matrix.
y : numpy.ndarray
The target variable vector.
Returns
-------
cm : numpy.ndarray
The confusion matrix.
"""
yhat = self.predict(X)
cm = confusion_matrix(y, yhat)
return cm
def get_vip(self):
"""
Compute the Variable Importance in Projection (VIP) scores for the model.
Returns
-------
vipScore : numpy.ndarray
The VIP scores.
"""
# latex: VIP_j = \sqrt{p \sum_{a=1}^{A} (q_a^2 t_a^T t_a)(w_{ja}/||w_a||)^2 / \sum_{a=1}^{A} (q_a^2 t_a^T t_a)}
Xs = self.x_scores_
yl = self.y_loadings_
Xw = self.x_weights_
W0 = Xw / np.sqrt(np.sum(Xw**2, axis=0))
p = Xw.shape[0]  # number of predictor variables, not samples
sumSq = np.sum(Xs**2, axis=0) * np.sum(yl**2, axis=0)  # q_a^2 * t_a^T t_a
vipScore = np.sqrt(p * np.sum(sumSq * (W0**2), axis=1) / np.sum(sumSq))
return vipScore
def permutation_test(self, X, y, n_repeats=100, n_jobs=None):
"""
Perform a permutation test to assess the significance of the model.
Parameters
----------
X : numpy.ndarray
The data matrix.
y : numpy.ndarray
The target variable vector.
n_repeats : int, optional (default = 100)
The number of permutations to perform.
n_jobs : int, optional (default = None)
The number of parallel jobs to run. If None, all CPUs are used.
Returns
-------
q2 : numpy.ndarray
The Q2 values for each permutation.
r2 : numpy.ndarray
The R2 values for each permutation.
permutation_ratio : numpy.ndarray
The ratio of permuted target variable values to total target variable values for each permutation.
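Examples
--------
An illustrative sketch on synthetic binary labels (``n_repeats`` kept
small so the example runs quickly):

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> X = rng.normal(size=(40, 6))
>>> y = (X[:, 0] > 0).astype(int)
>>> q2, r2, ratio = plsda(n_components=2).permutation_test(X, y, n_repeats=5)
>>> q2.shape, r2.shape, ratio.shape
((5,), (5,), (5,))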
"""
q2 = np.zeros(n_repeats)
r2 = np.zeros(n_repeats)
permutation_ratio = np.zeros(n_repeats)
for i in range(n_repeats):
y_shuffled = np.random.permutation(y)
self.fit(X, y_shuffled)
y_pred = cross_val_predict(
self, X, y_shuffled, cv=10, n_jobs=n_jobs)
q2[i] = r2_score(y_shuffled, y_pred)  # cross-validated Q2
r2[i] = self.score(X, y_shuffled)  # R2 on the refitted training data
permutation_ratio[i] = np.sum(y_shuffled != y) / len(y)
return q2, r2, permutation_ratio
class lsvc(LinearSVC):  # linear SVC
"""
Linear Support Vector Classification (Linear SVC) model.
This class extends the scikit-learn LinearSVC class to include
methods for finding the optimal hyperparameters and computing
the confusion matrix.
Methods
-------
get_optParams(X, y, Params=None, nfold=10, n_jobs=None)
Find the optimal hyperparameters for the model using cross-validation.
get_confusion_matrix(X, y)
Compute the confusion matrix for the model.
"""
def get_optParams(self, X, y, Params=None, nfold=10, n_jobs=None):
"""
Find the optimal hyperparameters for the model using cross-validation.
Parameters
----------
X : numpy.ndarray
The data matrix.
y : numpy.ndarray
The target variable vector.
Params : dict, optional (default = None)
The hyperparameters to search over. If None, a default set of hyperparameters is used.
nfold : int, optional (default = 10)
The number of folds to use in cross-validation.
n_jobs : int, optional (default = None)
The number of parallel jobs to run. If None, all CPUs are used.
Returns
-------
best_params : dict
The optimal hyperparameters for the model.
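Examples
--------
A sketch with a reduced, assumed grid (note that ``penalty='l1'`` in
LinearSVC requires ``dual=False``):

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> X = rng.normal(size=(40, 4))
>>> y = (X[:, 0] > 0).astype(int)
>>> model = lsvc(dual=False)
>>> best = model.get_optParams(X, y, Params={'C': [0.1, 1, 10]}, nfold=5)
>>> sorted(best)
['C']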
"""
if Params is None:
Params = {'C': np.logspace(-4, 5, 10),
'penalty': ('l1', 'l2')}  # 'l1' requires dual=False in LinearSVC
self.gsh = GridSearchCV(estimator=self, param_grid=Params,
cv=nfold, n_jobs=n_jobs)
self.gsh.fit(X, y)
return self.gsh.best_params_
def get_confusion_matrix(self, X, y):
"""
Compute the confusion matrix for the model.
Parameters
----------
X : numpy.ndarray
The data matrix.
y : numpy.ndarray
The target variable vector.
Returns
-------
cm : numpy.ndarray
The confusion matrix.
"""
yhat = self.predict(X)
cm = confusion_matrix(y, yhat)
return cm
class svc(SVC):
"""
Support Vector Classification (SVC) model.
This class extends the scikit-learn SVC class to include methods for
finding the optimal hyperparameters and computing the confusion matrix.
Methods
-------
get_optParams(X, y, Params=None, nfold=10, n_jobs=None)
Find the optimal hyperparameters for the model using cross-validation.
get_confusion_matrix(X, y)
Compute the confusion matrix for the model.
"""
def get_optParams(self, X, y, Params=None, nfold=10, n_jobs=None):
"""
Find the optimal hyperparameters for the model using cross-validation.
Parameters
----------
X : numpy.ndarray
The data matrix.
y : numpy.ndarray
The target variable vector.
Params : dict, optional (default = None)
The hyperparameters to search over. If None, a default set of hyperparameters is used.
nfold : int, optional (default = 10)
The number of folds to use in cross-validation.
n_jobs : int, optional (default = None)
The number of parallel jobs to run. If None, all CPUs are used.
Returns
-------
best_params : dict
The optimal hyperparameters for the model.
"""
if Params is None:
Params = {'C': np.logspace(-4, 5, 10),
'gamma': np.logspace(-4, 5, 10),
'kernel': ('poly', 'rbf', 'sigmoid')}
self.gsh = GridSearchCV(estimator=self, param_grid=Params,
cv=nfold, n_jobs=n_jobs)
self.gsh.fit(X, y)
return self.gsh.best_params_
def get_confusion_matrix(self, X, y):
"""
Compute the confusion matrix for the model.
Parameters
----------
X : numpy.ndarray
The data matrix.
y : numpy.ndarray
The target variable vector.
Returns
-------
cm : numpy.ndarray
The confusion matrix.
"""
yhat = self.predict(X)
cm = confusion_matrix(y, yhat)
return cm
class rf(RandomForestClassifier):
"""
Random Forest Classification (RF) model.
This class extends the scikit-learn RandomForestClassifier class to include
methods for finding the optimal hyperparameters and computing the confusion matrix.
Methods
-------
get_optParams(X, y, Params=None, nfold=10, n_jobs=None)
Find the optimal hyperparameters for the model using cross-validation.
get_confusion_matrix(X, y)
Compute the confusion matrix for the model.
"""
def get_optParams(self, X, y, Params=None, nfold=10, n_jobs=None):
"""
Find the optimal hyperparameters for the model using cross-validation.
Parameters
----------
X : numpy.ndarray
The data matrix.
y : numpy.ndarray
The target variable vector.
Params : dict, optional (default = None)
The hyperparameters to search over. If None, a default set of hyperparameters is used.
nfold : int, optional (default = 10)
The number of folds to use in cross-validation.
n_jobs : int, optional (default = None)
The number of parallel jobs to run. If None, all CPUs are used.
Returns
-------
best_params : dict
The optimal hyperparameters for the model.
"""
if Params is None:
Params = {'n_estimators': np.arange(100)+1,
'max_depth': np.arange(3)+1}  # exhaustive default grid; can be slow
self.gsh = GridSearchCV(estimator=self, param_grid=Params,
cv=nfold, n_jobs=n_jobs)
self.gsh.fit(X, y)
return self.gsh.best_params_
def get_confusion_matrix(self, X, y):
"""
Compute the confusion matrix for the model.
Parameters
----------
X : numpy.ndarray
The data matrix.
y : numpy.ndarray
The target variable vector.
Returns
-------
cm : numpy.ndarray
The confusion matrix.
"""
yhat = self.predict(X)
cm = confusion_matrix(y, yhat)
return cm
class multiClass_to_binaryMatrix():
"""
Multi-class to binary matrix conversion.
This class is used to convert a multi-class target variable into a binary matrix
suitable for training a multi-label classifier.
Methods
-------
fit(x)
Fit the transformer to the data.
transform(x)
Transform the data into a binary matrix.
reTransform(xnew)
Convert the binary matrix back into the original target variable.
"""
def __init__(self):
pass
def fit(self, x):
"""
Fit the transformer to the data.
Parameters
----------
x : numpy.ndarray
The target variable vector.
Returns
-------
self : object
Returns self.
"""
self.classes = np.unique(x)
return self
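# The class docstring promises transform and reTransform, which are missing
# from this extracted source. A minimal sketch, assuming a one-column-per-class
# (one-hot) binary encoding:
def transform(self, x):
"""
Transform the data into a binary (one-hot) matrix (sketch).
Parameters
----------
x : numpy.ndarray
The target variable vector.
Returns
-------
xnew : numpy.ndarray
A (n_samples, n_classes) binary indicator matrix.
"""
xnew = np.zeros((len(x), len(self.classes)), dtype=int)
for i, classi in enumerate(self.classes):
xnew[x == classi, i] = 1  # mark membership of class classi
return xnew
def reTransform(self, xnew):
"""
Convert the binary matrix back into the original target variable (sketch).
Parameters
----------
xnew : numpy.ndarray
A (n_samples, n_classes) binary indicator matrix.
Returns
-------
x : numpy.ndarray
The recovered target variable vector.
"""
return self.classes[np.argmax(xnew, axis=1)]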
def plot_confusion_matrix(cm,
target_names,
title='Confusion matrix',
cmap=None,
normalize=True):
"""
given a sklearn confusion matrix (cm), make a nice plot
Arguments
---------
cm: confusion matrix from sklearn.metrics.confusion_matrix
target_names: given classification classes such as [0, 1, 2]
the class names, for example: ['high', 'medium', 'low']
title: the text to display at the top of the matrix
cmap: the gradient of the values displayed from matplotlib.pyplot.cm
see http://matplotlib.org/examples/color/colormaps_reference.html
plt.get_cmap('jet') or plt.cm.Blues
normalize: If False, plot the raw numbers
If True, plot the proportions
Usage
-----
plot_confusion_matrix(cm = cm, # confusion matrix created by
# sklearn.metrics.confusion_matrix
normalize = True, # show proportions
target_names = y_labels_vals, # list of names of the classes
title = best_estimator_name) # title of graph
Citation
--------
http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
"""
accuracy = np.trace(cm) / np.sum(cm).astype('float')
misclass = 1 - accuracy
if cmap is None:
cmap = plt.get_cmap('Blues')
plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
if target_names is not None:
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45)
plt.yticks(tick_marks, target_names)
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
thresh = cm.max() / 1.5 if normalize else cm.max() / 2
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
if normalize:
plt.text(j, i, "{:0.4f}".format(cm[i, j]),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
else:
plt.text(j, i, "{:,}".format(cm[i, j]),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(
accuracy, misclass))
plt.show()
def multiClassificationReport(ytrue, ypred):
"""
Generate a classification report for a multi-class classification problem.
This function generates a classification report for a multi-class classification problem
by computing binary classification reports for each class.
Parameters
----------
ytrue : numpy.ndarray
The true target variable vector.
ypred : numpy.ndarray
The predicted target variable vector.
Returns
-------
report : dict
A dictionary containing binary classification reports for each class.
"""
labels = np.unique(ytrue)
report = dict()
for labeli in labels:
report[labeli] = binaryClassificationReport(
ytrue=ytrue == labeli, ypred=ypred == labeli)
return report
def binaryClassificationReport(ytrue, ypred):
"""
Generate a binary classification report.
This function generates a binary classification report for a binary classification problem
by computing the confusion matrix and various performance metrics.
Parameters
----------
ytrue : numpy.ndarray
The true target variable vector.
ypred : numpy.ndarray
The predicted target variable vector.
Returns
-------
report : dict
A dictionary containing various performance metrics.
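Examples
--------
A quick illustration:

>>> import numpy as np
>>> ytrue = np.array([0, 0, 1, 1])
>>> ypred = np.array([0, 1, 1, 1])
>>> report = binaryClassificationReport(ytrue, ypred)
>>> round(report["accuracy"], 2)
0.75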
"""
if len(np.unique(ytrue)) > 2:
raise ("Use the multiClassificationReport function for multiple classification.")
else:
tn, fp, fn, tp = confusion_matrix(ytrue, ypred).ravel()
report = dict()
report["accuracy"] = accuracy_score(ytrue, ypred)
report["sensitivity"] = recall_score(ytrue, ypred) # recall
report["specificity"] = tn/(tn+fp)
report["f1"] = f1_score(ytrue, ypred)
return report
def regressionReport(ytrue, ypred):
"""
Generate a regression report.
This function generates a regression report for a regression problem
by computing the root mean squared error (RMSE) and the R-squared (R2) score.
Parameters
----------
ytrue : numpy.ndarray
The true target variable vector.
ypred : numpy.ndarray
The predicted target variable vector.
Returns
-------
report : dict
A dictionary containing the RMSE and R2 score.
"""
report = dict()
report["rmse"] = mean_squared_error(ytrue, ypred, squared=False)
report["r2"] = r2_score(ytrue, ypred)
return report
def simpls(X, y, n_components):
"""
Perform SIMPLS (Partial Least Squares) regression.
This function performs SIMPLS regression, which is a variant of PLS regression
that uses a sequential algorithm to compute the PLS components.
Parameters
----------
X : numpy.ndarray
The independent variable matrix.
y : numpy.ndarray
The dependent variable vector.
n_components : int
The number of PLS components to compute.
Returns
-------
results : dict
A dictionary containing the PLS components and loadings.
Notes
-----
This implementation follows the SIMPLS algorithm described in:
de Jong, S. (1993). SIMPLS: an alternative approach to partial least squares
regression. Chemometrics and Intelligent Laboratory Systems, 18(3), 251-263.
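Examples
--------
A minimal sketch; ``simpls`` expects column-centered inputs (the ``pls``
class centers X and y before calling it):

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> X = rng.normal(size=(30, 5))
>>> y = X[:, 0] + 0.1 * rng.normal(size=30)
>>> out = simpls(X - X.mean(axis=0), y - y.mean(), n_components=3)
>>> out['B'].shape
(5, 3)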
"""
n_samples, n_variables = X.shape
if np.ndim(y) == 1:
y = y[:, np.newaxis]
if n_samples != y.shape[0]:
raise ValueError(
'The number of independent and dependent variable are inconsistent')
n_components = np.min((n_components, n_samples, n_variables))
V = np.zeros((n_variables, n_components))
x_scores = np.zeros((n_samples, n_components)) # X scores (standardized)
x_weights = np.zeros((n_variables, n_components)) # X weights
x_loadings = np.zeros((n_variables, n_components)) # X loadings
y_loadings = np.zeros((1, n_components)) # Y loadings
y_scores = np.zeros((n_samples, n_components)) # Y scores
s = np.dot(X.T, y).ravel()  # cross-product vector between X and y
for i in range(n_components):
r = s
t = np.dot(X, r)
tt = np.linalg.norm(t)
t = t / tt
r = r / tt
p = np.dot(X.T, t)
q = np.dot(y.T, t)
u = np.dot(y, q)
v = p
if i > 0:
v = v - np.dot(V, np.dot(V.T, p))  # Gram-Schmidt orthogonalization
u = u - np.dot(x_scores, np.dot(x_scores.T, u))
v = v / np.linalg.norm(v)
s = s - np.dot(v, np.dot(v.T, s))
x_weights[:, i] = r
x_scores[:, i] = t
x_loadings[:, i] = p
y_loadings[:, i] = q
y_scores[:, i] = u
V[:, i] = v
B = np.cumsum(np.dot(x_weights, np.diag(y_loadings.ravel())), axis=1)
return {'B': B, 'x_scores': x_scores, 'x_loadings': x_loadings, 'y_loadings': y_loadings,
'x_scores_weights': x_weights, 'x_weights': x_weights, 'y_scores': y_scores}
def sampleSplit_random(X, test_size=0.25, random_state=1, shuffle=False):
"""
Randomly split a dataset into training and testing sets.
This function randomly splits a dataset into training and testing sets
using the train_test_split function from scikit-learn.
Parameters
----------
X : numpy.ndarray
The dataset to split.
test_size : float, optional
The proportion of the dataset to include in the test split.
random_state : int, optional
The random seed to use for reproducibility.
shuffle : bool, optional
Whether or not to shuffle the dataset before splitting. With the default shuffle=False the split is deterministic and random_state has no effect.
Returns
-------
trainIdx : numpy.ndarray
The indices of the training set.
testIdx : numpy.ndarray
The indices of the testing set.
"""
sampleIdx = np.arange(X.shape[0])
trainIdx, testIdx = train_test_split(sampleIdx, test_size=test_size,
random_state=random_state,
shuffle=shuffle)
return trainIdx, testIdx
def sampleSplit_KS(X, test_size=0.25, metric='euclidean', *args, **kwargs):
"""
Split a dataset into training and testing sets using the Kennard-Stone (KS) algorithm.
This function splits a dataset into training and testing sets using the KS algorithm,
which selects points that maximize the minimum distance between them and previously
selected points.
Parameters
----------
X : numpy.ndarray
The dataset to split.
test_size : float, optional
The proportion of the dataset to include in the test split.
metric : str, optional
The distance metric to use for computing distances between points.
*args : tuple
Additional arguments to pass to the distance metric function.
**kwargs : dict
Additional keyword arguments to pass to the distance metric function.
Returns
-------
trainIdx : list of int
The indices of the training set.
testIdx : list of int
The indices of the testing set.
Notes
-----
This implementation is based on the Kennard-Stone algorithm described in:
Kennard, R. W., & Stone, L. A. (1969). Computer aided design of experiments. Technometrics, 11(1), 137-148.
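Examples
--------
A minimal sketch (index lists are returned, not the split arrays):

>>> import numpy as np
>>> rng = np.random.default_rng(0)
>>> X = rng.normal(size=(20, 5))
>>> trainIdx, testIdx = sampleSplit_KS(X, test_size=0.25)
>>> len(trainIdx), len(testIdx)
(15, 5)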
"""
Xscore = PCA(n_components=2).fit_transform(X)  # distances in the first two PCs
distance = cdist(Xscore, Xscore, metric=metric, *args, **kwargs)
select_pts = []
remaining_pts = [x for x in range(distance.shape[0])]
# first select 2 farthest points
first_2pts = np.unravel_index(np.argmax(distance), distance.shape)
select_pts.append(first_2pts[0])
select_pts.append(first_2pts[1])
# remove the first 2 points from the remaining list
remaining_pts.remove(first_2pts[0])
remaining_pts.remove(first_2pts[1])
for i in range(round(X.shape[0]*(1-test_size)) - 2):
# find the maximum minimum distance
select_distance = distance[select_pts, :]
min_distance = select_distance[:, remaining_pts]
min_distance = np.min(min_distance, axis=0)
max_min_distance = np.max(min_distance)
# select the first point (in case that several distances are the same, choose the first one)
points = np.argwhere(select_distance == max_min_distance)[
:, 1].tolist()
for point in points:
if point in select_pts:
pass
else:
select_pts.append(point)
remaining_pts.remove(point)
break
return select_pts, remaining_pts