Coverage for /Users/Newville/Codes/xraylarch/larch/math/learn_regress.py: 14%

#!/usr/bin/env python
"""
generalized linear models and feature selection using machine learning methods,
including Partial Least Squares (PLS) and L1-Regularized Linear Modeling (Lasso).

These methods are built on the methods from scikit-learn.
"""

import numpy as np

try:
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.model_selection import RepeatedKFold
    from sklearn.linear_model import LassoLarsCV, LassoLars, Lasso
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False

from .. import Group, isgroup

from .utils import interp
from .lincombo_fitting import get_arrays, groups2matrix

def pls_train(groups, varname='valence', arrayname='norm', scale=True,
              ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False,
              xmin=-np.inf, xmax=np.inf, **kws):

    """use a list of data groups to train a Partial Least Squares model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      scale       bool to scale data [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]
      ncomps      number of independent components (see Note 5) [2]

    Returns
    -------
      group with trained PLSRegression model, to be used with pls_predict

    Notes
    -----
     1. The group members for the components must match each other
        in data content and array names.
     2. all groups must have an attribute (scalar value) for `varname`
     3. arrayname can be one of `norm` or `dmude`
     4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used
        (rounded to integer). if cv_repeats is None, sqrt(len(groups))-1
        will be used (rounded).
     5. The optimal number of components may be best found from PCA. If set to
        None, a search will be done for the ncomps that gives the lowest RMSE_CV.
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn not installed")

    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws['scale'] = scale
    kws['n_components'] = ncomps

    model = PLSRegression(**kws)

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        resid = []
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])[:, 0]
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    # final fit without cross-validation
    model = PLSRegression(**kws)
    out = model.fit(spectra, ydat)

    ypred = model.predict(spectra)[:, 0]

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 coefs=model.x_weights_, loadings=model.x_loadings_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, scale=scale, groupnames=groupnames,
                 keywords=kws)
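
# Example usage (an illustrative sketch, shown as a comment so it is not executed
# at import time): given a list `groups` of Larch groups that each have a `norm`
# array and a scalar `valence` attribute, a PLS model can be trained and its fit
# statistics inspected. The names `groups` and `pls_model` are placeholders, not
# part of this module:
#
#     pls_model = pls_train(groups, varname='valence', arrayname='norm', ncomps=2)
#     print(pls_model.rmse, pls_model.rmse_cv)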


def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, **kws):

    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (see Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict

    Notes
    -----
     1. The group members for the components must match each other
        in data content and array names.
     2. all groups must have an attribute (scalar value) for `varname`
     3. arrayname can be one of `norm` or `dmude`
     4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used
        (rounded to integer). if cv_repeats is None, sqrt(len(groups))-1
        will be used (rounded).
     5. alpha is the regularization parameter. if alpha is None it will
        be set using LassoLarsCV
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn not installed")
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws.update(dict(fit_intercept=fit_intercept, normalize=normalize))
    creator = LassoLars if use_lars else Lasso
    model = None

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        if alpha is None:
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=1e7,
                                 max_iter=1e7, eps=1.e-12, **kws)
            lcvmod.fit(spectra, ydat)
            alpha = lcvmod.alpha_

        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    # with CV skipped and no alpha given, use LassoLarsCV to select alpha
    if alpha is None:
        cvmod = LassoLarsCV(**kws)
        cvmod.fit(spectra, ydat)
        alpha = cvmod.alpha_

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit without cross-validation
    out = model.fit(spectra, ydat)

    ypred = model.predict(spectra)

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 alpha=alpha, active=model.active_, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)
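
# Example usage (an illustrative sketch): training a LassoLars model on the same
# kind of group list; with alpha=None, the regularization strength is chosen with
# LassoLarsCV during cross-validation. `groups` and `lasso_model` are placeholder
# names, not part of this module:
#
#     lasso_model = lasso_train(groups, varname='valence', arrayname='norm',
#                               alpha=None, use_lars=True)
#     print(lasso_model.alpha, lasso_model.rmse_cv)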


def _predict(group, model):
    """internal use"""
    # generate arrays and interpolate components onto the unknown x array
    xdat, ydat = get_arrays(group, model.arrayname)
    if xdat is None or ydat is None:
        raise ValueError("cannot get arrays for arrayname='%s'" % model.arrayname)

    spectra = interp(xdat, ydat, model.x, kind='cubic')
    spectra.shape = (1, len(spectra))
    return model.model.predict(spectra)[0]


def lasso_predict(group, model):
    """
    Predict the external value for a group based on a Lasso model

    Arguments
    ---------
      group   group with data to fit
      model   Lasso/LassoLars model as found from lasso_train()

    Returns
    -------
      predicted value of the external variable for the group
    """
    valid = (isgroup(model) and hasattr(model, 'model') and
             hasattr(model, 'x') and hasattr(model, 'arrayname') and
             model.model.__repr__().startswith('Lasso'))
    if not valid:
        raise ValueError("lasso_predict needs a Lasso training model")
    return _predict(group, model)
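
# Example usage (an illustrative sketch): predicting the external value (here,
# `valence`) for an unknown spectrum from the trained model; `unknown` and
# `lasso_model` are placeholder names:
#
#     value = lasso_predict(unknown, lasso_model)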


def pls_predict(group, model):
    """
    Predict the external value for a group based on a PLS model

    Arguments
    ---------
      group   group with data to fit
      model   PLS model as found from pls_train()

    Returns
    -------
      predicted value of the external variable for the group
    """
    valid = (isgroup(model) and hasattr(model, 'model') and
             hasattr(model, 'x') and hasattr(model, 'arrayname') and
             model.model.__repr__().startswith('PLS'))
    if not valid:
        raise ValueError("pls_predict needs a PLS training model")
    return _predict(group, model)
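
# Example usage (an illustrative sketch): the PLS analog of lasso_predict, using
# the model group returned by pls_train; `unknown` and `pls_model` are placeholder
# names:
#
#     value = pls_predict(unknown, pls_model)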