Coverage for /Users/Newville/Codes/xraylarch/larch/math/pca.py: 19%
130 statements
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-09 10:08 -0600
« prev ^ index » next coverage.py v7.3.2, created at 2023-11-09 10:08 -0600
1#!/usr/bin/env python
2"""
3linear combination fitting
4"""
5import os
6import sys
7import time
8import json
9from itertools import combinations
11import numpy as np
12from numpy.random import randint
14try:
15 from sklearn.decomposition import PCA
16 HAS_SKLEARN = True
17except ImportError:
18 HAS_SKLEARN = False
20from lmfit import minimize, Parameters
22from .. import Group
23from .utils import interp, index_of
24from larch.utils import str2bytes, bytes2str, read_textfile
26from .lincombo_fitting import get_arrays, get_label, groups2matrix
29def nmf_train(groups, arrayname='norm', xmin=-np.inf, xmax=np.inf,
30 solver='cd', beta_loss=2):
31 """use a list of data groups to train a Non-negative model
33 Arguments
34 ---------
35 groups list of groups to use as components
36 arrayname string of array name to be fit (see Note 2) ['norm']
37 xmin x-value for start of fit range [-inf]
38 xmax x-value for end of fit range [+inf]
39 beta_loss beta parameter for NMF [2]
41 Returns
42 -------
43 group with trained NMF model, to be used with pca_fit
45 Notes
46 -----
47 1. The group members for the components must match each other
48 in data content and array names.
49 2. arrayname can be one of `norm` or `dmude`
50 """
51 xdat, ydat = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
53 ydat[np.where(ydat<0)] = 0
54 opts = dict(n_components=len(groups), solver=solver)
55 if solver == 'mu':
56 opts.update(dict(beta_loss=beta_loss))
57 ret = NMF(**opts).fit(ydat)
58 labels = [get_label(g) for g in groups]
60 return Group(x=xdat, arrayname=arrayname, labels=labels, ydat=ydat,
61 components=ret.components_,
62 xmin=xmin, xmax=xmax, model=ret)
65def pca_train_sklearn(groups, arrayname='norm', xmin=-np.inf, xmax=np.inf):
66 """use a list of data groups to train a Principal Component Analysis
68 Arguments
69 ---------
70 groups list of groups to use as components
71 arrayname string of array name to be fit (see Note 2) ['norm']
72 xmin x-value for start of fit range [-inf]
73 xmax x-value for end of fit range [+inf]
75 Returns
76 -------
77 group with trained PCA or N model, to be used with pca_fit
79 Notes
80 -----
81 1. The group members for the components must match each other
82 in data content and array names.
83 2. arrayname can be one of `norm` or `dmude`
84 """
85 xdat, ydat = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
86 if not HAS_SKLEARN:
87 raise ImportError("scikit-learn not installed")
89 ret = PCA().fit(ydat)
90 labels = [get_label(g) for g in groups]
92 return Group(x=xdat, arrayname=arrayname, labels=labels, ydat=ydat,
93 xmin=xmin, xmax=xmax, model=ret, mean=ret.mean_,
94 components=ret.components_,
95 variances=ret.explained_variance_ratio_)
98def pca_athena(groups, arrayname='norm', subtract_mean=True,
99 normalize=True, xmin=-np.inf, xmax=np.inf):
100 xdat, data = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
101 if subtract_mean:
102 data = data - data.mean(axis=0)
104 data = data.T
105 data = data - data.mean(axis=0)
106 if normalize:
107 data = data / data.std(axis=0)
109 cor = np.dot(data.T, data) / data.shape[0]
110 evals, var = np.linalg.eigh(cor)
111 iorder = np.argsort(evals)[::-1]
112 evals = evals[iorder]
113 evec = np.dot(data, var)[:, iorder]
114 return evec, evals
116def pca_train(groups, arrayname='norm', xmin=-np.inf, xmax=np.inf):
117 """use a list of data groups to train a Principal Component Analysis
119 Arguments
120 ---------
121 groups list of groups to use as components
122 arrayname string of array name to be fit (see Note 2) ['norm']
123 xmin x-value for start of fit range [-inf]
124 xmax x-value for end of fit range [+inf]
126 Returns
127 -------
128 group with trained PCA or N model, to be used with pca_fit
130 Notes
131 -----
132 1. The group members for the components must match each other
133 in data content and array names.
134 2. arrayname can be one of `norm` or `dmude`
135 """
136 xdat, ydat = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
137 labels = [get_label(g) for g in groups]
138 narr, nfreq = ydat.shape
140 ymean = ydat.mean(axis=0)
141 ynorm = ydat - ymean
143 # normalize data to be centered at 0 with unit standard deviation
144 ynorm = (ynorm.T - ynorm.mean(axis=1)) / ynorm.std(axis=1)
145 eigval, eigvec_ = np.linalg.eigh(np.dot(ynorm.T, ynorm) / narr)
146 eigvec = (np.dot(ynorm, -eigvec_)/narr).T
147 eigvec, eigval = eigvec[::-1, :], eigval[::-1]
149 variances = eigval/eigval.sum()
151 # calculate IND statistic
152 ind = None
153 for r in range(narr-1):
154 nr = narr - r - 1
155 indval = np.sqrt(nfreq*eigval[r:].sum()/nr)/nr**2
156 if ind is None:
157 ind = [indval]
158 ind.append(indval)
159 ind = np.array(ind)
161 nsig = int(np.argmin(ind))
162 return Group(x=xdat, arrayname=arrayname, labels=labels, ydat=ydat,
163 xmin=xmin, xmax=xmax, mean=ymean, components=eigvec,
164 eigenvalues=eigval, variances=variances, ind=ind, nsig=nsig)
167def save_pca_model(pca_model, filename):
168 """save a PCA model to a file"""
169 from larch.utils.jsonutils import encode4js
170 buff = ['##Larch PCA Model: 1.0 : %s' % time.strftime('%Y-%m-%d %H:%M:%S')]
171 buff.append('%s' % json.dumps(encode4js(pca_model)))
173 fh = GzipFile(filename, "w")
174 fh.write(str2bytes("\n".join(buff)))
175 fh.close()
177def read_pca_model(filename):
178 """read a PCA model from a file"""
179 from larch.utils.jsonutils import decode4js
180 text = read_textfile(filename)
181 lines = text.split('\n')
182 if not lines[0].startswith('##Larch PCA Model'):
183 raise ValueError(f"Invalid Larch PCA Model: '{fname:s}'")
184 return decode4js(json.loads(lines[1]))
187def pca_statistics(pca_model):
188 """return PCA arrays of statistics IND and F
190 For data of shape (p, n) (that is, p frequencies/energies, n spectra)
192 For index r, and eigv = eigenvalues
194 IND(r) = sqrt( eigv[r:].sum() / (p*(n-r))) / (n-r)**2
196 F1R(r) = eigv[r] / (p+1-r)*(n+1-r) / sum_i=r^n-1 (eigv[i] / ((p+1-i)*(n+1-i)))
197 """
198 p, n = pca_model.ydat.shape
199 eigv = pca_model.eigenvalues
200 ind, f1r = [], []
201 for r in range(n-1):
202 nr = n-r-1
203 ind.append( np.sqrt(eigv[r:].sum()/ (p*nr))/nr**2)
204 f1sum = 0
205 for i in range(r, n):
206 f1sum += eigv[i]/((p+1-i)*(n+1-i))
207 f1sum = max(1.e-10, f1sum)
208 f1r.append(eigv[r] / (max(1, (p+1-r)*(n-r+1)) * f1sum))
210 pca_model.ind = np.array(ind)
211 pca_model.f1r = np.array(f1r)
213 return pca_model.ind, pca_model.f1r
215def _pca_scale_resid(params, ydat=None, pca_model=None, comps=None):
216 scale = params['scale'].value
217 weights, chi2, rank, s = np.linalg.lstsq(comps, ydat*scale-pca_model.mean)
218 yfit = (weights * comps).sum(axis=1) + pca_model.mean
219 return (scale*ydat - yfit)
222def pca_fit(group, pca_model, ncomps=None, rescale=True):
223 """
224 fit a spectrum from a group to a PCA training model from pca_train()
226 Arguments
227 ---------
228 group group with data to fit
229 pca_model PCA model as found from pca_train()
230 ncomps number of components to included
231 rescale whether to allow data to be renormalized (True)
233 Returns
234 -------
235 None, the group will have a subgroup name `pca_result` created
236 with the following members:
238 x x or energy value from model
239 ydat input data interpolated onto `x`
240 yfit linear least-squares fit using model components
241 weights weights for PCA components
242 chi_square goodness-of-fit measure
243 pca_model the input PCA model
245 """
246 # get first nerate arrays and interpolate components onto the unknown x array
247 xdat, ydat = groups2matrix([group], pca_model.arrayname, xmin=pca_model.xmin, xmax=pca_model.xmax)
249 if xdat is None or ydat is None:
250 raise ValueError("cannot get arrays for arrayname='%s'" % arrayname)
252 xshape = xdat.shape
253 if len(xshape) == 2:
254 xdat = xdat[0]
256 ydat = ydat[0]
257 ydat = interp(xdat, ydat, pca_model.x, kind='cubic')
259 params = Parameters()
260 params.add('scale', value=1.0, vary=True, min=0)
262 if ncomps is None:
263 ncomps=len(pca_model.components)
264 comps = pca_model.components[:ncomps].transpose()
266 if rescale:
267 weights, chi2, rank, s = np.linalg.lstsq(comps, ydat-pca_model.mean)
268 yfit = (weights * comps).sum(axis=1) + pca_model.mean
270 result = minimize(_pca_scale_resid, params, method='leastsq',
271 gtol=1.e-5, ftol=1.e-5, xtol=1.e-5, epsfcn=1.e-5,
272 kws = dict(ydat=ydat, comps=comps, pca_model=pca_model))
273 scale = result.params['scale'].value
274 ydat *= scale
275 weights, chi2, rank, s = np.linalg.lstsq(comps, ydat-pca_model.mean)
276 yfit = (weights * comps).sum(axis=1) + pca_model.mean
278 else:
279 weights, chi2, rank, s = np.linalg.lstsq(comps, ydat-pca_model.mean)
280 yfit = (weights * comps).sum(axis=1) + pca_model.mean
281 scale = 1.0
283 group.pca_result = Group(x=pca_model.x, ydat=ydat, yfit=yfit,
284 pca_model=pca_model, chi_square=chi2[0],
285 data_scale=scale, weights=weights)
286 return