Coverage for /Users/Newville/Codes/xraylarch/larch/math/pca.py: 19%

130 statements  

coverage.py v7.3.2, created at 2023-11-09 10:08 -0600

#!/usr/bin/env python
"""
Principal Component Analysis (and NMF) training and fitting of spectra
"""
import os
import sys
import time
import json
from gzip import GzipFile
from itertools import combinations

import numpy as np
from numpy.random import randint

try:
    from sklearn.decomposition import NMF, PCA
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False

from lmfit import minimize, Parameters

from .. import Group
from .utils import interp, index_of
from larch.utils import str2bytes, bytes2str, read_textfile

from .lincombo_fitting import get_arrays, get_label, groups2matrix

def nmf_train(groups, arrayname='norm', xmin=-np.inf, xmax=np.inf,
              solver='cd', beta_loss=2):
    """use a list of data groups to train a Non-negative Matrix Factorization (NMF) model

    Arguments
    ---------
      groups      list of groups to use as components
      arrayname   string of array name to be fit (see Note 2) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      solver      NMF solver, one of 'cd' or 'mu' ['cd']
      beta_loss   beta parameter for NMF, used only with the 'mu' solver [2]

    Returns
    -------
      group with trained NMF model, to be used with pca_fit

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  arrayname can be one of `norm` or `dmude`
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn not installed")

    xdat, ydat = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)

    # NMF requires non-negative data
    ydat[np.where(ydat < 0)] = 0
    opts = dict(n_components=len(groups), solver=solver)
    if solver == 'mu':
        opts.update(dict(beta_loss=beta_loss))
    ret = NMF(**opts).fit(ydat)
    labels = [get_label(g) for g in groups]

    return Group(x=xdat, arrayname=arrayname, labels=labels, ydat=ydat,
                 components=ret.components_,
                 xmin=xmin, xmax=xmax, model=ret)
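# A hypothetical usage sketch for nmf_train() (the list `groups` of normalized
# XAS data groups is assumed, not defined here); beta_loss only takes effect
# with the 'mu' solver:
#
#     nmf_model = nmf_train(groups, arrayname='norm', solver='mu', beta_loss=1)
#     print(nmf_model.labels)            # one label per input group
#     print(nmf_model.components.shape)  # (n_components, n_energies)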

def pca_train_sklearn(groups, arrayname='norm', xmin=-np.inf, xmax=np.inf):
    """use a list of data groups to train a Principal Component Analysis model

    Arguments
    ---------
      groups      list of groups to use as components
      arrayname   string of array name to be fit (see Note 2) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]

    Returns
    -------
      group with trained PCA model, to be used with pca_fit

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  arrayname can be one of `norm` or `dmude`
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn not installed")

    xdat, ydat = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)

    ret = PCA().fit(ydat)
    labels = [get_label(g) for g in groups]

    return Group(x=xdat, arrayname=arrayname, labels=labels, ydat=ydat,
                 xmin=xmin, xmax=xmax, model=ret, mean=ret.mean_,
                 components=ret.components_,
                 variances=ret.explained_variance_ratio_)
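# For reference, a minimal sketch (assuming scikit-learn is available) of the
# sklearn attributes that pca_train_sklearn() stores on the returned Group:
#
#     from sklearn.decomposition import PCA
#     ret = PCA().fit(ydat)              # ydat from groups2matrix()
#     ret.mean_                          # -> stored as .mean
#     ret.components_                    # -> stored as .components
#     ret.explained_variance_ratio_      # -> stored as .variances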

def pca_athena(groups, arrayname='norm', subtract_mean=True,
               normalize=True, xmin=-np.inf, xmax=np.inf):
    """Athena-style PCA: eigen-decomposition of the correlation matrix of the
    spectra, returning (eigenvectors, eigenvalues) sorted by decreasing eigenvalue.
    """
    xdat, data = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    if subtract_mean:
        data = data - data.mean(axis=0)

    data = data.T
    data = data - data.mean(axis=0)
    if normalize:
        data = data / data.std(axis=0)

    cor = np.dot(data.T, data) / data.shape[0]
    evals, var = np.linalg.eigh(cor)
    iorder = np.argsort(evals)[::-1]
    evals = evals[iorder]
    evec = np.dot(data, var)[:, iorder]
    return evec, evals
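# Hypothetical usage sketch for pca_athena(): unlike the other trainers it
# returns plain arrays rather than a Group -- eigenvectors of the correlation
# matrix (one column per input group) and eigenvalues in decreasing order:
#
#     evec, evals = pca_athena(groups, arrayname='norm')
#     # evals[0] is the largest eigenvalue; evec[:, 0] the leading component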

def pca_train(groups, arrayname='norm', xmin=-np.inf, xmax=np.inf):
    """use a list of data groups to train a Principal Component Analysis model

    Arguments
    ---------
      groups      list of groups to use as components
      arrayname   string of array name to be fit (see Note 2) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]

    Returns
    -------
      group with trained PCA model, to be used with pca_fit

    Notes
    -----
     1.  The group members for the components must match each other
         in data content and array names.
     2.  arrayname can be one of `norm` or `dmude`
    """
    xdat, ydat = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    labels = [get_label(g) for g in groups]
    narr, nfreq = ydat.shape

    ymean = ydat.mean(axis=0)
    ynorm = ydat - ymean

    # normalize data to be centered at 0 with unit standard deviation
    ynorm = (ynorm.T - ynorm.mean(axis=1)) / ynorm.std(axis=1)
    eigval, eigvec_ = np.linalg.eigh(np.dot(ynorm.T, ynorm) / narr)
    eigvec = (np.dot(ynorm, -eigvec_)/narr).T
    eigvec, eigval = eigvec[::-1, :], eigval[::-1]

    variances = eigval/eigval.sum()

    # calculate IND statistic
    ind = None
    for r in range(narr-1):
        nr = narr - r - 1
        indval = np.sqrt(nfreq*eigval[r:].sum()/nr)/nr**2
        if ind is None:
            ind = [indval]
        ind.append(indval)
    ind = np.array(ind)

    nsig = int(np.argmin(ind))
    return Group(x=xdat, arrayname=arrayname, labels=labels, ydat=ydat,
                 xmin=xmin, xmax=xmax, mean=ymean, components=eigvec,
                 eigenvalues=eigval, variances=variances, ind=ind, nsig=nsig)
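# Hypothetical usage sketch for pca_train() (the group list and energy range
# are assumptions for illustration):
#
#     model = pca_train(groups, arrayname='norm', xmin=2100, xmax=2500)
#     print(model.nsig)            # index minimizing the IND statistic
#     print(model.variances[:4])   # fraction of total variance per component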

def save_pca_model(pca_model, filename):
    """save a PCA model to a file"""
    from larch.utils.jsonutils import encode4js
    buff = ['##Larch PCA Model: 1.0 : %s' % time.strftime('%Y-%m-%d %H:%M:%S')]
    buff.append('%s' % json.dumps(encode4js(pca_model)))

    fh = GzipFile(filename, "w")
    fh.write(str2bytes("\n".join(buff)))
    fh.close()

def read_pca_model(filename):
    """read a PCA model from a file"""
    from larch.utils.jsonutils import decode4js
    text = read_textfile(filename)
    lines = text.split('\n')
    if not lines[0].startswith('##Larch PCA Model'):
        raise ValueError(f"Invalid Larch PCA Model: '{filename:s}'")
    return decode4js(json.loads(lines[1]))
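# Roundtrip sketch with a hypothetical filename: save_pca_model() writes a
# one-line header plus the JSON-encoded model, gzip-compressed; read_pca_model()
# reads it back (read_textfile() is assumed to handle the compression):
#
#     save_pca_model(model, 'mydata_pca.lar')
#     model2 = read_pca_model('mydata_pca.lar')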

def pca_statistics(pca_model):
    """return PCA arrays of statistics IND and F

    For data of shape (p, n) (that is, p frequencies/energies, n spectra),
    index r, and eigv = eigenvalues:

       IND(r) = sqrt(eigv[r:].sum() / (p*(n-r))) / (n-r)**2

       F1R(r) = (eigv[r] / ((p+1-r)*(n+1-r))) / sum_{i=r}^{n-1} (eigv[i] / ((p+1-i)*(n+1-i)))
    """
    p, n = pca_model.ydat.shape
    eigv = pca_model.eigenvalues
    ind, f1r = [], []
    for r in range(n-1):
        nr = n-r-1
        ind.append(np.sqrt(eigv[r:].sum()/(p*nr))/nr**2)
        f1sum = 0
        for i in range(r, n):
            f1sum += eigv[i]/((p+1-i)*(n+1-i))
        f1sum = max(1.e-10, f1sum)
        f1r.append(eigv[r] / (max(1, (p+1-r)*(n-r+1)) * f1sum))

    pca_model.ind = np.array(ind)
    pca_model.f1r = np.array(f1r)

    return pca_model.ind, pca_model.f1r
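# Hypothetical usage sketch for pca_statistics(): given a model whose .ydat is
# shaped (p energies, n spectra) as the docstring assumes, both statistics are
# returned and also attached to the model:
#
#     ind, f1r = pca_statistics(model)
#     best_r = int(np.argmin(ind))   # candidate number of significant components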

def _pca_scale_resid(params, ydat=None, pca_model=None, comps=None):
    """residual for refining the data scale factor in pca_fit()"""
    scale = params['scale'].value
    weights, chi2, rank, s = np.linalg.lstsq(comps, ydat*scale-pca_model.mean)
    yfit = (weights * comps).sum(axis=1) + pca_model.mean
    return (scale*ydat - yfit)

def pca_fit(group, pca_model, ncomps=None, rescale=True):
    """
    fit a spectrum from a group to a PCA training model from pca_train()

    Arguments
    ---------
      group       group with data to fit
      pca_model   PCA model as found from pca_train()
      ncomps      number of components to include [None: use all]
      rescale     whether to allow the data to be rescaled (True)

    Returns
    -------
      None, the group will have a subgroup named `pca_result` created
            with the following members:

          x           x or energy value from model
          ydat        input data interpolated onto `x`
          yfit        linear least-squares fit using the model components
          weights     weights for the PCA components
          chi_square  goodness-of-fit measure
          pca_model   the input PCA model

    """
    # generate arrays and interpolate the data onto the model's x array
    xdat, ydat = groups2matrix([group], pca_model.arrayname,
                               xmin=pca_model.xmin, xmax=pca_model.xmax)

    if xdat is None or ydat is None:
        raise ValueError("cannot get arrays for arrayname='%s'" % pca_model.arrayname)

    xshape = xdat.shape
    if len(xshape) == 2:
        xdat = xdat[0]

    ydat = ydat[0]
    ydat = interp(xdat, ydat, pca_model.x, kind='cubic')

    params = Parameters()
    params.add('scale', value=1.0, vary=True, min=0)

    if ncomps is None:
        ncomps = len(pca_model.components)
    comps = pca_model.components[:ncomps].transpose()

    if rescale:
        weights, chi2, rank, s = np.linalg.lstsq(comps, ydat-pca_model.mean)
        yfit = (weights * comps).sum(axis=1) + pca_model.mean

        result = minimize(_pca_scale_resid, params, method='leastsq',
                          gtol=1.e-5, ftol=1.e-5, xtol=1.e-5, epsfcn=1.e-5,
                          kws=dict(ydat=ydat, comps=comps, pca_model=pca_model))
        scale = result.params['scale'].value
        ydat *= scale
        weights, chi2, rank, s = np.linalg.lstsq(comps, ydat-pca_model.mean)
        yfit = (weights * comps).sum(axis=1) + pca_model.mean

    else:
        weights, chi2, rank, s = np.linalg.lstsq(comps, ydat-pca_model.mean)
        yfit = (weights * comps).sum(axis=1) + pca_model.mean
        scale = 1.0

    group.pca_result = Group(x=pca_model.x, ydat=ydat, yfit=yfit,
                             pca_model=pca_model, chi_square=chi2[0],
                             data_scale=scale, weights=weights)
    return
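# Hypothetical usage sketch for pca_fit(): fit an unknown spectrum against a
# trained model and inspect the `pca_result` subgroup attached to the group:
#
#     pca_fit(unknown_group, model, ncomps=3)
#     res = unknown_group.pca_result
#     print(res.data_scale, res.chi_square)
#     print(res.weights)              # one weight per PCA component used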