Coverage for /Users/Newville/Codes/xraylarch/larch/math/learn_regress.py: 14%
111 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-09 10:08 -0600
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-09 10:08 -0600
1#!/usr/bin/env python
2"""
3generalized linear models and feature selection using machine learning methods
4including Partial Least Squares (PLS) and L1-Regularized Linear Modeling (Lasso).
6These methods are built on the methods from scikit-learn
7"""
8import numpy as np
10try:
11 from sklearn.cross_decomposition import PLSRegression
12 from sklearn.model_selection import RepeatedKFold
13 from sklearn.linear_model import LassoLarsCV, LassoLars, Lasso
14 HAS_SKLEARN = True
15except ImportError:
16 HAS_SKLEARN = False
18from .. import Group, isgroup
20from .utils import interp
21from .lincombo_fitting import get_arrays, groups2matrix
def _pls_predict_1d(model, data):
    """Return the predictions of `model` for `data` as a 1-D array.

    Depending on the scikit-learn release, PLSRegression.predict() gives
    either a 2-D (nsamples, ntargets) array or an already flattened 1-D
    array when it was trained on 1-D targets.  Both forms are accepted:
    for 2-D output the first target column is used.
    """
    predicted = np.asarray(model.predict(data))
    if predicted.ndim > 1:
        predicted = predicted[:, 0]
    return predicted


def pls_train(groups, varname='valence', arrayname='norm', scale=True,
              ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False,
              xmin=-np.inf, xmax=np.inf, **kws):
    """use a list of data groups to train a Partial Least Squares model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      scale       bool to scale data [True]
      cv_folds    None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]
      ncomps      number of independent components (See Note 5) [2]
      **kws       additional keyword arguments passed to PLSRegression

    Returns
    -------
      group with trained PLSRegression, to be used with pls_predict.
      The group holds `x`, `spectra`, `ydat`, `ypred`, `coefs` (the model
      x_weights_), `loadings` (the model x_loadings_), `cv_folds`,
      `cv_repeats`, `rmse_cv` (None if Cross-Validation was skipped),
      `rmse`, `model`, `varname`, `arrayname`, `scale`, `groupnames`,
      and `keywords` (the keywords used to create the model).

    Raises
    ------
      ImportError if scikit-learn is not installed
      ValueError  if a group does not have the attribute `varname`

    Notes
    -----
     1. The group members for the components must match each other
        in data content and array names.
     2. all groups must have an attribute (scalar value) for `varname`
     3. arrayname can be one of `norm` or `dmude`
     4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used
        (rounded to integer).  if cv_repeats is None, sqrt(len(groups))-1
        will be used (rounded).
     5. The optimal number of components may be best found from PCA.
        `ncomps` is passed unchanged to PLSRegression as `n_components`.
        NOTE(review): no automatic search for the best `ncomps` is done
        here when it is None.
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn not installed")

    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    # explicit arguments take precedence over same-named entries in kws
    kws['scale'] = scale
    kws['n_components'] = ncomps

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        # collect the out-of-sample residuals over all folds and repeats
        cv_model = PLSRegression(**kws)
        resid = []
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        for ctrain, ctest in cv.split(range(nvals)):
            cv_model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = _pls_predict_1d(cv_model, spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    # final fit without cross-validation, using a fresh model
    model = PLSRegression(**kws)
    model.fit(spectra, ydat)

    ypred = _pls_predict_1d(model, spectra)

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 coefs=model.x_weights_, loadings=model.x_loadings_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, scale=scale, groupnames=groupnames,
                 keywords=kws)
def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, **kws):
    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups         list of groups to use as components
      varname        name of characteristic value to model ['valence']
      arrayname      string of array name to be fit (see Note 3) ['norm']
      xmin           x-value for start of fit range [-inf]
      xmax           x-value for end of fit range [+inf]
      alpha          alpha parameter for LassoLars (See Note 5) [None]
      use_lars       bool to use LassoLars instead of Lasso [True]
      fit_intercept  bool passed to the model as `fit_intercept` [True]
      normalize      bool passed to the model as `normalize` (See Note 6) [True]
      cv_folds       None or number of Cross-Validation folds (See Note 4) [None]
      cv_repeats     None or number of Cross-Validation repeats (See Note 4) [None]
      skip_cv        bool to skip doing Cross-Validation [False]
      **kws          additional keyword arguments passed to the model

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict.
      The group holds `x`, `spectra`, `ydat`, `ypred`, `alpha`, `active`,
      `coefs` (the model coef_), `cv_folds`, `cv_repeats`, `rmse_cv`
      (None if Cross-Validation was skipped), `rmse`, `model`, `varname`,
      `arrayname`, `fit_intercept`, `normalize`, `groupnames`, and
      `keywords` (the keywords used to create the model).

    Raises
    ------
      ImportError if scikit-learn is not installed
      ValueError  if a group does not have the attribute `varname`

    Notes
    -----
     1. The group members for the components must match each other
        in data content and array names.
     2. all groups must have an attribute (scalar value) for `varname`
     3. arrayname can be one of `norm` or `dmude`
     4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used
        (rounded to integer).  if cv_repeats is None, sqrt(len(groups))-1
        will be used (rounded).
     5. alpha is the regularization parameter. if alpha is None it will
        be set using LassoLarsCV.  With skip_cv=True, LassoLarsCV is run
        with scikit-learn's default cross-validation splitting.
     6. NOTE(review): recent scikit-learn releases appear to have removed
        the `normalize` keyword from these models -- confirm against the
        installed version.
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn not installed")
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso

    # scikit-learn validates these limits as integers, so do not use 1e7
    max_steps = 10000000

    # cv stays None when Cross-Validation is skipped
    cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)

    if alpha is None:
        # plain Lasso/LassoLars models do not determine alpha themselves:
        # only the cross-validated estimator provides `alpha_`.
        lcvmod = LassoLarsCV(cv=cv, max_n_alphas=max_steps,
                             max_iter=max_steps, eps=1.e-12, **kws)
        lcvmod.fit(spectra, ydat)
        alpha = lcvmod.alpha_

    model = creator(alpha=alpha, **kws)

    rmse_cv = None
    if cv is not None:
        # collect the out-of-sample residuals over all folds and repeats
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    # final fit without cross-validation
    model.fit(spectra, ydat)

    ypred = model.predict(spectra)

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    # Lasso (use_lars=False) has no `active_` attribute: fall back to
    # the indices of the non-zero coefficients.
    active = getattr(model, 'active_', None)
    if active is None:
        active = np.flatnonzero(model.coef_).tolist()

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 alpha=alpha, active=active, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)
def _predict(group, model):
    """internal use: predict the modeled value for one group

    Arguments
    ---------
      group    group with data to predict from
      model    training group, with attributes `arrayname`, `x`, and
               `model` (an object with a predict() method)

    Returns
    -------
      first element of the prediction made by `model.model`

    Raises
    ------
      ValueError  if the arrays for `model.arrayname` cannot be found
                  in `group`
    """
    # get the arrays for the unknown group
    xdat, ydat = get_arrays(group, model.arrayname)
    if xdat is None or ydat is None:
        raise ValueError("cannot get arrays for arrayname='%s'" % model.arrayname)

    # interpolate the unknown data onto the x array used for training
    spectra = interp(xdat, ydat, model.x, kind='cubic')
    # predict() is given a 2-D array with one row: a single sample
    spectra.shape = (1, len(spectra))
    return model.model.predict(spectra)[0]
def lasso_predict(group, model):
    """
    Use a trained Lasso model to predict the external value for a group

    Arguments
    ---------
      group     group with data to fit
      model     Lasso/LassoLars model as found from lasso_train()

    Returns
    -------
      predicted value of the external variable for the group

    Raises
    ------
      ValueError  if `model` is not a group with `model`, `x`, and
                  `arrayname` attributes, or if the repr of its `model`
                  attribute does not start with 'Lasso'
    """
    needed = ('model', 'x', 'arrayname')
    is_lasso = (isgroup(model)
                and all(hasattr(model, attr) for attr in needed)
                and repr(model.model).startswith('Lasso'))
    if is_lasso:
        return _predict(group, model)
    raise ValueError("lasso_predict needs a Lasso training model")
def pls_predict(group, model):
    """
    Use a trained PLS model to predict the external value for a group

    Arguments
    ---------
      group     group with data to fit
      model     PLS model as found from pls_train()

    Returns
    -------
      predicted value of the external variable for the group

    Raises
    ------
      ValueError  if `model` is not a group with `model`, `x`, and
                  `arrayname` attributes, or if the repr of its `model`
                  attribute does not start with 'PLS'
    """
    needed = ('model', 'x', 'arrayname')
    is_pls = (isgroup(model)
              and all(hasattr(model, attr) for attr in needed)
              and repr(model.model).startswith('PLS'))
    if is_pls:
        return _predict(group, model)
    raise ValueError("pls_predict needs a PLS training model")