# -*- coding: utf-8 -*-
import warnings

import numpy as np
from numpy.linalg import eigh, inv, norm, matrix_rank
import pandas as pd
from scipy.optimize import minimize

from statsmodels.tools.decorators import cache_readonly
from statsmodels.base.model import Model
from statsmodels.iolib import summary2
from statsmodels.graphics.utils import _import_mpl

from .factor_rotation import rotate_factors, promax


_opt_defaults = {'gtol': 1e-7}


def _check_args_1(endog, n_factor, corr, nobs):

    msg = "Either endog or corr must be provided."
    if endog is None and corr is None:
        raise ValueError(msg)
    if endog is not None and corr is not None:
        warnings.warn('Both endog and corr are provided, ' +
                      'corr will be used for factor analysis.')

    if n_factor <= 0:
        raise ValueError('n_factor must be larger than 0! Got %d.' %
                         (n_factor))

    if nobs is not None and endog is not None:
        warnings.warn("nobs is ignored when endog is provided")


def _check_args_2(endog, n_factor, corr, nobs, k_endog):

    if n_factor > k_endog:
        raise ValueError('n_factor cannot be greater than the number'
                         ' of variables! %d > %d' %
                         (n_factor, k_endog))

    if np.max(np.abs(np.diag(corr) - 1)) > 1e-10:
        raise ValueError("corr must be a correlation matrix")

    if corr.shape[0] != corr.shape[1]:
        raise ValueError('Correlation matrix corr must be a square matrix '
                         '(rows %d != cols %d)' % corr.shape)


class Factor(Model):
    """
    Factor analysis

    Parameters
    ----------
    endog : array_like
        Variables in columns, observations in rows. May be `None` if
        `corr` is not `None`.
    n_factor : int
        The number of factors to extract
    corr : array_like
        Directly specify the correlation matrix instead of estimating
        it from `endog`. If provided, `endog` is not used for the
        factor analysis; it may be used in post-estimation.
    method : str
        The method to extract factors, currently must be either 'pa'
        for principal axis factor analysis or 'ml' for maximum
        likelihood estimation.
    smc : bool
        Whether or not to apply squared multiple correlations (method='pa')
    endog_names : str
        Names of endogenous variables. If specified, it will be used
        instead of the column names in endog.
    nobs : int
        The number of observations, not used if endog is present. Needs to
        be provided for inference if endog is None.
    missing : 'none', 'drop', or 'raise'
        Missing value handling for endog, default is row-wise deletion
        'drop'. If 'none', no nan checking is done. If 'drop', any
        observations with nans are dropped. If 'raise', an error is raised.

    Notes
    -----
    **Experimental**

    Supported rotations: 'varimax', 'quartimax', 'biquartimax',
    'equamax', 'oblimin', 'parsimax', 'parsimony', 'biquartimin',
    'promax'

    If method='ml', the factors are rotated to satisfy condition IC3
    of Bai and Li (2012). This means that the scores have covariance
    I, so the model for the covariance matrix is L * L' + diag(U),
    where L are the loadings and U are the uniquenesses. In addition,
    L' * diag(U)^{-1} L must be diagonal.

    References
    ----------
    .. [*] Hofacker, C. (2004). Exploratory Factor Analysis, Mathematical
       Marketing. http://www.openaccesstexts.org/pdf/Quant_Chapter_11_efa.pdf
    .. [*] J Bai, K Li (2012). Statistical analysis of factor models of high
       dimension. Annals of Statistics. https://arxiv.org/pdf/1205.6617.pdf
    """
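
    # A minimal usage sketch (not part of the original module; the variable
    # name ``df`` is hypothetical and assumed to be a pandas DataFrame with
    # observed variables in columns):
    #
    #     >>> from statsmodels.multivariate.factor import Factor
    #     >>> mod = Factor(df, n_factor=2, method='ml')
    #     >>> res = mod.fit()
    #     >>> print(res.summary())
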
    def __init__(self, endog=None, n_factor=1, corr=None, method='pa',
                 smc=True, endog_names=None, nobs=None, missing='drop'):

        _check_args_1(endog, n_factor, corr, nobs)

        if endog is not None:
            super(Factor, self).__init__(endog, exog=None, missing=missing)
            endog = self.endog   # after preprocessing like missing, asarray
            k_endog = endog.shape[1]
            nobs = endog.shape[0]
            corr = self.corr = np.corrcoef(endog, rowvar=0)
        elif corr is not None:
            corr = self.corr = np.asarray(corr)
            k_endog = self.corr.shape[0]
            self.endog = None
        else:
            msg = "Either endog or corr must be provided."
            raise ValueError(msg)

        _check_args_2(endog, n_factor, corr, nobs, k_endog)

        self.n_factor = n_factor
        self.loadings = None
        self.communality = None
        self.method = method
        self.smc = smc
        self.nobs = nobs
        self.corr = corr
        self.k_endog = k_endog

        if endog_names is None:
            if hasattr(corr, 'index'):
                endog_names = corr.index
            if hasattr(corr, 'columns'):
                endog_names = corr.columns
        self.endog_names = endog_names

    @property
    def endog_names(self):
        """Names of endogenous variables"""
        if self._endog_names is not None:
            return self._endog_names
        else:
            if self.endog is not None:
                return self.data.ynames
            else:
                d = 0
                n = self.corr.shape[0] - 1
                while n > 0:
                    d += 1
                    n //= 10
                return [('var%0' + str(d) + 'd') % i
                        for i in range(self.corr.shape[0])]

    @endog_names.setter
    def endog_names(self, value):
        # Check validity of endog_names:
        if value is not None:
            if len(value) != self.corr.shape[0]:
                raise ValueError('The length of `endog_names` must '
                                 'equal the number of variables.')
            self._endog_names = np.asarray(value)
        else:
            self._endog_names = None

    def fit(self, maxiter=50, tol=1e-8, start=None, opt_method='BFGS',
            opt=None, em_iter=3):
        """
        Estimate factor model parameters.

        Parameters
        ----------
        maxiter : int
            Maximum number of iterations for iterative estimation algorithms
        tol : float
            Stopping criteria (error tolerance) for iterative estimation
            algorithms
        start : array_like
            Starting values, currently only used for ML estimation
        opt_method : str
            Optimization method for ML estimation
        opt : dict-like
            Keyword arguments passed to optimizer, only used for ML estimation
        em_iter : int
            The number of EM iterations before starting gradient optimization,
            only used for ML estimation.

        Returns
        -------
        results : FactorResults
        """
        method = self.method.lower()
        if method == 'pa':
            return self._fit_pa(maxiter=maxiter, tol=tol)
        elif method == 'ml':
            return self._fit_ml(start, em_iter, opt_method, opt)
        else:
            msg = "Unknown factor extraction approach '%s'" % self.method
            raise ValueError(msg)
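
    # A hedged sketch of how the ML-specific ``fit`` arguments might be
    # combined (``mod`` is a hypothetical Factor instance with method='ml'):
    #
    #     >>> res = mod.fit(opt_method='BFGS', opt={'gtol': 1e-6}, em_iter=10)
    #
    # ``opt`` is forwarded to ``scipy.optimize.minimize`` as ``options``, so
    # the valid keys depend on the chosen ``opt_method``.
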
    def _fit_pa(self, maxiter=50, tol=1e-8):
        """
        Extract factors using the iterative principal axis method

        Parameters
        ----------
        maxiter : int
            Maximum number of iterations for communality estimation
        tol : float
            If `norm(communality - last_communality) < tolerance`,
            estimation stops

        Returns
        -------
        results : FactorResults instance
        """

        R = self.corr.copy()  # inplace modification below

        # Parameter validation
        self.n_comp = matrix_rank(R)
        if self.n_factor > self.n_comp:
            raise ValueError('n_factor must be smaller or equal to the rank'
                             ' of endog! %d > %d' %
                             (self.n_factor, self.n_comp))
        if maxiter <= 0:
            raise ValueError('maxiter must be larger than 0! Got %d.' %
                             (maxiter))
        if tol <= 0 or tol > 0.01:
            raise ValueError('tolerance must be larger than 0 and smaller '
                             'than 0.01! Got %f instead' % (tol))

        # Initial communality estimation
        if self.smc:
            c = 1 - 1 / np.diag(inv(R))
        else:
            c = np.ones(len(R))

        # Iterative communality estimation
        eigenvals = None
        for i in range(maxiter):
            # Get eigenvalues/eigenvectors of R with diag replaced by
            # communality
            for j in range(len(R)):
                R[j, j] = c[j]
            L, V = eigh(R, UPLO='U')
            c_last = np.array(c)
            ind = np.argsort(L)
            ind = ind[::-1]
            L = L[ind]
            n_pos = (L > 0).sum()
            V = V[:, ind]
            eigenvals = np.array(L)

            # Select eigenvectors with positive eigenvalues
            n = np.min([n_pos, self.n_factor])
            sL = np.diag(np.sqrt(L[:n]))
            V = V[:, :n]

            # Calculate new loadings and communality
            A = V.dot(sL)
            c = np.power(A, 2).sum(axis=1)
            if norm(c_last - c) < tol:
                break

        self.eigenvals = eigenvals
        self.communality = c
        self.uniqueness = 1 - c
        self.loadings = A
        return FactorResults(self)

    # Unpacks the model parameters from a flat vector, used for ML
    # estimation. The first k_endog elements of par are the square
    # roots of the uniquenesses. The remaining elements are the
    # factor loadings, packed one factor at a time.
    def _unpack(self, par):
        return (par[0:self.k_endog]**2,
                np.reshape(par[self.k_endog:], (-1, self.k_endog)).T)

    # Packs the model parameters into a flat parameter, used for ML
    # estimation.
    def _pack(self, load, uniq):
        return np.concatenate((np.sqrt(uniq), load.T.flat))
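
    # A hedged round-trip sketch of the packing convention (illustrative
    # values, not from the original source; ``mod`` is a hypothetical Factor
    # instance with k_endog=3):
    #
    #     >>> load = np.arange(6.0).reshape(3, 2)   # k_endog=3, n_factor=2
    #     >>> uniq = np.array([0.2, 0.3, 0.4])
    #     >>> par = mod._pack(load, uniq)
    #     >>> u2, l2 = mod._unpack(par)             # recovers (uniq, load)
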
    def loglike(self, par):
        """
        Evaluate the log-likelihood function.

        Parameters
        ----------
        par : ndarray or tuple of 2 ndarray's
            The model parameters, either a packed representation of
            the model parameters or a 2-tuple containing a `k_endog x
            n_factor` matrix of factor loadings and a `k_endog` vector
            of uniquenesses.

        Returns
        -------
        loglike : float
        """

        if type(par) is np.ndarray:
            uniq, load = self._unpack(par)
        else:
            load, uniq = par[0], par[1]

        loadu = load / uniq[:, None]
        lul = np.dot(load.T, loadu)

        # log|GG' + S|
        # Using matrix determinant lemma:
        # |GG' + S| = |I + G'S^{-1}G|*|S|
        lul.flat[::lul.shape[0]+1] += 1
        _, ld = np.linalg.slogdet(lul)
        v = np.sum(np.log(uniq)) + ld

        # tr((GG' + S)^{-1}C)
        # Using Sherman-Morrison-Woodbury
        w = np.sum(1 / uniq)
        b = np.dot(load.T, self.corr / uniq[:, None])
        b = np.linalg.solve(lul, b)
        b = np.dot(loadu, b)
        w -= np.trace(b)

        # Scaled log-likelihood
        return -(v + w) / (2*self.k_endog)
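
    # A hedged numerical cross-check of the scaled log-likelihood against a
    # direct dense computation (illustrative only; ``mod`` is a hypothetical
    # Factor instance, ``load`` a (k_endog, n_factor) array and ``uniq`` a
    # (k_endog,) array of candidate parameter values):
    #
    #     >>> sigma = load.dot(load.T) + np.diag(uniq)
    #     >>> _, ld = np.linalg.slogdet(sigma)
    #     >>> direct = -(ld + np.trace(np.linalg.solve(sigma, mod.corr)))
    #     >>> direct /= 2 * mod.k_endog
    #     >>> np.allclose(direct, mod.loglike((load, uniq)))  # should agree
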
    def score(self, par):
        """
        Evaluate the score function (first derivative of loglike).

        Parameters
        ----------
        par : ndarray or tuple of 2 ndarray's
            The model parameters, either a packed representation of
            the model parameters or a 2-tuple containing a `k_endog x
            n_factor` matrix of factor loadings and a `k_endog` vector
            of uniquenesses.

        Returns
        -------
        score : ndarray
        """

        if type(par) is np.ndarray:
            uniq, load = self._unpack(par)
        else:
            load, uniq = par[0], par[1]

        # Center term of SMW
        loadu = load / uniq[:, None]
        c = np.dot(load.T, loadu)
        c.flat[::c.shape[0]+1] += 1
        d = np.linalg.solve(c, load.T)

        # Precompute these terms
        lud = np.dot(loadu, d)
        cu = (self.corr / uniq) / uniq[:, None]
        r = np.dot(cu, load)
        lul = np.dot(lud.T, load)
        luz = np.dot(cu, lul)

        # First term
        du = 2*np.sqrt(uniq) * (1/uniq - (d * load.T).sum(0) / uniq**2)
        dl = 2*(loadu - np.dot(lud, loadu))

        # Second term
        h = np.dot(lud, cu)
        f = np.dot(h, lud.T)
        du -= 2*np.sqrt(uniq) * (np.diag(cu) - 2*np.diag(h) + np.diag(f))
        dl -= 2*r
        dl += 2*np.dot(lud, r)
        dl += 2*luz
        dl -= 2*np.dot(lud, luz)

        # Cannot use _pack because we are working with the square root
        # uniquenesses directly.
        return -np.concatenate((du, dl.T.flat)) / (2*self.k_endog)
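
    # A hedged sketch of checking the analytic score against a numerical
    # gradient (illustrative; ``mod`` is a hypothetical Factor instance and
    # ``par`` a packed parameter vector as produced by ``_pack``):
    #
    #     >>> from scipy.optimize import approx_fprime
    #     >>> num = approx_fprime(par, mod.loglike, 1e-7)
    #     >>> np.allclose(num, mod.score(par), atol=1e-5)  # should agree
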
    # Maximum likelihood factor analysis.
    def _fit_ml(self, start, em_iter, opt_method, opt):
        """estimate Factor model using Maximum Likelihood
        """

        # Starting values
        if start is None:
            load, uniq = self._fit_ml_em(em_iter)
            start = self._pack(load, uniq)
        elif len(start) == 2:
            if len(start[1]) != start[0].shape[0]:
                msg = "Starting values have incompatible dimensions"
                raise ValueError(msg)
            start = self._pack(start[0], start[1])
        else:
            raise ValueError("Invalid starting values")

        def nloglike(par):
            return -self.loglike(par)

        def nscore(par):
            return -self.score(par)

        # Do the optimization
        if opt is None:
            opt = _opt_defaults
        r = minimize(nloglike, start, jac=nscore, method=opt_method,
                     options=opt)
        if not r.success:
            warnings.warn("Fitting did not converge")
        par = r.x
        uniq, load = self._unpack(par)

        if uniq.min() < 1e-10:
            warnings.warn("Some uniquenesses are nearly zero")

        # Rotate solution to satisfy IC3 of Bai and Li
        load = self._rotate(load, uniq)

        self.uniqueness = uniq
        self.communality = 1 - uniq
        self.loadings = load
        self.mle_retvals = r

        return FactorResults(self)

    def _fit_ml_em(self, iter):
        """estimate Factor model using EM algorithm
        """
        # Starting values
        np.random.seed(3427)
        load = 0.1*np.random.normal(size=(self.k_endog, self.n_factor))
        uniq = 0.5 * np.ones(self.k_endog)

        for k in range(iter):
            loadu = load / uniq[:, None]

            f = np.dot(load.T, loadu)
            f.flat[::f.shape[0]+1] += 1

            r = np.linalg.solve(f, loadu.T)
            q = np.dot(loadu.T, load)
            h = np.dot(r, load)

            c = load - np.dot(load, h)
            c /= uniq[:, None]

            g = np.dot(q, r)
            e = np.dot(g, self.corr)
            d = np.dot(loadu.T, self.corr) - e

            a = np.dot(d, c)
            a -= np.dot(load.T, c)
            a.flat[::a.shape[0]+1] += 1

            b = np.dot(self.corr, c)

            load = np.linalg.solve(a, b.T).T
            uniq = np.diag(self.corr) - (load * d.T).sum(1)

        return load, uniq

    def _rotate(self, load, uniq):
        """rotate loadings for MLE
        """
        # Rotations used in ML estimation.
        load, s, _ = np.linalg.svd(load, 0)
        load *= s

        if self.nobs is None:
            nobs = 1
        else:
            nobs = self.nobs

        cm = np.dot(load.T, load / uniq[:, None]) / nobs
        _, f = np.linalg.eig(cm)
        load = np.dot(load, f)
        return load


class FactorResults(object):
    """
    Factor results class

    For result summary, scree/loading plots and factor rotations

    Parameters
    ----------
    factor : Factor
        Fitted Factor class

    Attributes
    ----------
    uniqueness : ndarray
        The uniqueness (variance of uncorrelated errors unique to
        each variable)
    communality : ndarray
        1 - uniqueness
    loadings : ndarray
        Each column is the loading vector for one factor
    loadings_no_rot : ndarray
        Unrotated loadings, not available under maximum likelihood
        analysis.
    eigenvalues : ndarray
        The eigenvalues for a factor analysis obtained using
        principal components; not available under ML estimation.
    n_comp : int
        Number of components (factors)
    nobs : int
        Number of observations
    fa_method : str
        The method used to obtain the decomposition, either 'pa' for
        'principal axes' or 'ml' for maximum likelihood.
    df : int
        Degrees of freedom of the factor model.

    Notes
    -----
    Under ML estimation, the default rotation (used for `loadings`) is
    condition IC3 of Bai and Li (2012). Under this rotation, the
    factor scores are iid and standardized. If `G` is the canonical
    loadings and `U` is the vector of uniquenesses, then the
    covariance matrix implied by the factor analysis is `GG' +
    diag(U)`.

    Status: experimental. Some refactoring will be necessary when new
    features are added.
    """
    def __init__(self, factor):
        self.model = factor
        self.endog_names = factor.endog_names
        self.loadings_no_rot = factor.loadings
        if hasattr(factor, "eigenvals"):
            self.eigenvals = factor.eigenvals

        self.communality = factor.communality
        self.uniqueness = factor.uniqueness
        self.rotation_method = None
        self.fa_method = factor.method
        self.n_comp = factor.loadings.shape[1]
        self.nobs = factor.nobs
        self._factor = factor
        if hasattr(factor, "mle_retvals"):
            self.mle_retvals = factor.mle_retvals

        p, k = self.loadings_no_rot.shape
        self.df = ((p - k)**2 - (p + k)) // 2

        # no rotation, overwritten in `rotate`
        self.loadings = factor.loadings
        self.rotation_matrix = np.eye(self.n_comp)

    def __str__(self):
        return self.summary().__str__()

    def rotate(self, method):
        """
        Apply rotation, inplace modification of this Results instance

        Parameters
        ----------
        method : str
            Rotation to be applied. Allowed methods are varimax,
            quartimax, biquartimax, equamax, oblimin, parsimax,
            parsimony, biquartimin, promax.

        Returns
        -------
        None : nothing returned, modifications are inplace

        Notes
        -----
        Warning: 'varimax', 'quartimax' and 'oblimin' are verified against R
        or Stata. Some rotation methods such as promax do not produce the same
        results as the R or Stata default functions.

        See Also
        --------
        factor_rotation : subpackage that implements rotation methods
        """
        self.rotation_method = method
        if method not in ['varimax', 'quartimax', 'biquartimax',
                          'equamax', 'oblimin', 'parsimax', 'parsimony',
                          'biquartimin', 'promax']:
            raise ValueError('Unknown rotation method %s' % (method))

        if method in ['varimax', 'quartimax', 'biquartimax', 'equamax',
                      'parsimax', 'parsimony', 'biquartimin']:
            self.loadings, T = rotate_factors(self.loadings_no_rot, method)
        elif method == 'oblimin':
            self.loadings, T = rotate_factors(self.loadings_no_rot,
                                              'quartimin')
        elif method == 'promax':
            self.loadings, T = promax(self.loadings_no_rot)
        else:
            raise ValueError('rotation method not recognized')

        self.rotation_matrix = T
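
    # A hedged usage sketch (``res`` is a hypothetical FactorResults
    # instance); ``rotate`` modifies ``res.loadings`` in place:
    #
    #     >>> res.rotate('varimax')
    #     >>> res.loadings           # now holds the varimax-rotated loadings
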
    def _corr_factors(self):
        """correlation of factors implied by rotation

        If the rotation is oblique, then the factors are correlated.

        currently not cached

        Returns
        -------
        corr_f : ndarray
            correlation matrix of rotated factors, assuming initial factors
            are orthogonal
        """
        T = self.rotation_matrix
        corr_f = T.T.dot(T)
        return corr_f

    def factor_score_params(self, method='bartlett'):
        """
        Compute factor scoring coefficient matrix

        The coefficient matrix is not cached.

        Parameters
        ----------
        method : 'bartlett' or 'regression'
            Method to use for factor scoring.
            'regression' can be abbreviated to `reg`

        Returns
        -------
        coeff_matrix : ndarray
            matrix s to compute factors f from a standardized endog ys.
            ``f = ys dot s``

        Notes
        -----
        The `regression` method follows the Stata definition.
        Methods 'bartlett' and 'regression' are verified against Stata.
        Two unofficial methods, 'ols' and 'gls', produce similar factor
        scores but are not verified.

        See Also
        --------
        statsmodels.multivariate.factor.FactorResults.factor_scoring
        """
        L = self.loadings
        T = self.rotation_matrix.T
        # TODO: check row versus column convention for T
        uni = 1 - self.communality  # self.uniqueness

        if method == 'bartlett':
            s_mat = np.linalg.inv(L.T.dot(L/(uni[:, None]))).dot((L.T / uni)).T
        elif method.startswith('reg'):
            corr = self.model.corr
            corr_f = self._corr_factors()
            # if orthogonal then corr_f is just eye
            s_mat = corr_f.dot(L.T.dot(np.linalg.inv(corr))).T
        elif method == 'ols':
            # not verified
            corr = self.model.corr
            corr_f = self._corr_factors()
            s_mat = corr_f.dot(np.linalg.pinv(L)).T
        elif method == 'gls':
            # not verified
            # s_mat = np.linalg.inv(1*np.eye(L.shape[1]) +
            #                       L.T.dot(L/(uni[:, None])))
            corr = self.model.corr
            corr_f = self._corr_factors()
            s_mat = np.linalg.inv(np.linalg.inv(corr_f) +
                                  L.T.dot(L/(uni[:, None])))
            s_mat = s_mat.dot(L.T / uni).T
        else:
            raise ValueError('method not available, use "bartlett" '
                             'or "regression"')
        return s_mat

    def factor_scoring(self, endog=None, method='bartlett', transform=True):
        """
        factor scoring: compute factors for endog

        If endog was not provided when creating the factor class, then
        a standardized endog needs to be provided here.

        Parameters
        ----------
        method : 'bartlett' or 'regression'
            Method to use for factor scoring.
            'regression' can be abbreviated to `reg`
        transform : bool
            If transform is true and endog is provided, then it will be
            standardized using mean and scale of original data, which has to
            be available in this case.
            If transform is False, then a provided endog will be used
            unchanged. The original endog in the Factor class will always be
            standardized if endog is None, independently of `transform`.

        Returns
        -------
        factor_score : ndarray
            estimated factors using scoring matrix s and standardized endog ys
            ``f = ys dot s``

        Notes
        -----
        Status: transform option is experimental and might change.

        See Also
        --------
        statsmodels.multivariate.factor.FactorResults.factor_score_params
        """

        if transform is False and endog is not None:
            # no transformation in this case
            endog = np.asarray(endog)
        else:
            # we need to standardize with the original mean and scale
            if self.model.endog is not None:
                m = self.model.endog.mean(0)
                s = self.model.endog.std(ddof=1, axis=0)
                if endog is None:
                    endog = self.model.endog
                else:
                    endog = np.asarray(endog)
            else:
                raise ValueError('If transform is True, then `endog` needs '
                                 'to be available in the Factor instance.')

            endog = (endog - m) / s

        s_mat = self.factor_score_params(method=method)
        factors = endog.dot(s_mat)
        return factors
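
    # A hedged usage sketch (``res`` is a hypothetical FactorResults from a
    # model fit with ``endog`` available in the Factor instance):
    #
    #     >>> scores = res.factor_scoring(method='regression')
    #     >>> scores.shape           # (nobs, n_factor)
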
    def summary(self):
        """Summary"""
        summ = summary2.Summary()
        summ.add_title('Factor analysis results')
        loadings_no_rot = pd.DataFrame(
            self.loadings_no_rot,
            columns=["factor %d" % (i)
                     for i in range(self.loadings_no_rot.shape[1])],
            index=self.endog_names
            )
        if hasattr(self, "eigenvals"):
            # eigenvals not available for ML method
            eigenvals = pd.DataFrame(
                [self.eigenvals], columns=self.endog_names, index=[''])
            summ.add_dict({'': 'Eigenvalues'})
            summ.add_df(eigenvals)
        communality = pd.DataFrame([self.communality],
                                   columns=self.endog_names, index=[''])
        summ.add_dict({'': ''})
        summ.add_dict({'': 'Communality'})
        summ.add_df(communality)
        summ.add_dict({'': ''})
        summ.add_dict({'': 'Pre-rotated loadings'})
        summ.add_df(loadings_no_rot)
        summ.add_dict({'': ''})
        if self.rotation_method is not None:
            loadings = pd.DataFrame(
                self.loadings,
                columns=["factor %d" % (i)
                         for i in range(self.loadings.shape[1])],
                index=self.endog_names
                )
            summ.add_dict({'': '%s rotated loadings' % (self.rotation_method)})
            summ.add_df(loadings)
        return summ

    def get_loadings_frame(self, style='display', sort_=True, threshold=0.3,
                           highlight_max=True, color_max='yellow',
                           decimals=None):
        """get loadings matrix as DataFrame or pandas Styler

        Parameters
        ----------
        style : 'display' (default), 'raw' or 'strings'
            Style to use for display

            * 'raw' returns just a DataFrame of the loadings matrix, no
              options are applied
            * 'display' adds sorting and styling as defined by other keywords
            * 'strings' returns a DataFrame with string elements with optional
              sorting and suppressing of small loading coefficients.

        sort_ : bool
            If True, then the rows of the DataFrame are sorted by contribution
            of each factor. Applies if style is either 'display' or 'strings'.
        threshold : float
            If the threshold is larger than zero, then loading coefficients
            are either colored white (if style is 'display') or replaced by an
            empty string (if style is 'strings').
        highlight_max : bool
            This adds a background color to the largest coefficient in each
            row.
        color_max : html color
            default is 'yellow'. color for background of row maximum
        decimals : None or int
            If None, then pandas default precision applies. Otherwise values
            are rounded to the specified decimals. If style is 'display', then
            the underlying dataframe is not changed. If style is 'strings',
            then values are rounded before conversion to strings.

        Returns
        -------
        loadings : DataFrame or pandas Styler instance
            The return is a pandas Styler instance, if style is 'display' and
            at least one of highlight_max, threshold or decimals is applied.
            Otherwise, the returned loadings is a DataFrame.

        Examples
        --------
        >>> mod = Factor(df, 3, smc=True)
        >>> res = mod.fit()
        >>> res.get_loadings_frame(style='display', decimals=3, threshold=0.2)

        To get a sorted DataFrame, all styling options need to be turned off:

        >>> df_sorted = res.get_loadings_frame(style='display',
        ...     highlight_max=False, decimals=None, threshold=0)

        Options except for highlighting are available for plain text or LaTeX
        usage:

        >>> lds = res_u.get_loadings_frame(style='strings', decimals=3,
        ...     threshold=0.3)
        >>> print(lds.to_latex())
        """

        loadings_df = pd.DataFrame(
            self.loadings,
            columns=["factor %d" % (i)
                     for i in range(self.loadings.shape[1])],
            index=self.endog_names
            )

        if style not in ['raw', 'display', 'strings']:
            msg = "style has to be one of 'raw', 'display', 'strings'"
            raise ValueError(msg)

        if style == 'raw':
            return loadings_df

        # add sorting and some formatting
        if sort_ is True:
            loadings_df2 = loadings_df.copy()
            n_f = len(loadings_df2)
            high = np.abs(loadings_df2.values).argmax(1)
            loadings_df2['high'] = high
            loadings_df2['largest'] = np.abs(
                loadings_df.values[np.arange(n_f), high])
            loadings_df2.sort_values(by=['high', 'largest'],
                                     ascending=[True, False], inplace=True)
            loadings_df = loadings_df2.drop(['high', 'largest'], axis=1)

        if style == 'display':
            sty = None
            if threshold > 0:
                def color_white_small(val):
                    """
                    Takes a scalar and returns a string with
                    the css property `'color: white'` for small values,
                    black otherwise.

                    takes threshold from outer scope
                    """
                    color = 'white' if np.abs(val) < threshold else 'black'
                    return 'color: %s' % color

                sty = loadings_df.style.applymap(color_white_small)

            if highlight_max is True:
                def highlight_max(s):
                    '''
                    highlight the maximum in a Series yellow.
                    '''
                    s = np.abs(s)
                    is_max = s == s.max()
                    return ['background-color: ' + color_max if v else ''
                            for v in is_max]

                if sty is None:
                    sty = loadings_df.style

                sty = sty.apply(highlight_max, axis=1)

            if decimals is not None:
                if sty is None:
                    sty = loadings_df.style

                sty = sty.format("{:.%sf}" % decimals)

            if sty is None:
                return loadings_df
            else:
                return sty

        if style == 'strings':
            ld = loadings_df
            if decimals is not None:
                ld = ld.round(decimals)
            ld = ld.astype(str)
            if threshold > 0:
                ld[loadings_df.abs() < threshold] = ''
            return ld

    def plot_scree(self, ncomp=None):
        """
        Plot of the ordered eigenvalues and variance explained for the loadings

        Parameters
        ----------
        ncomp : int, optional
            Number of loadings to include in the plot. If None, all
            possible loadings are included.

        Returns
        -------
        Figure
            Handle to the figure.
        """
        _import_mpl()
        from .plots import plot_scree
        return plot_scree(self.eigenvals, self.n_comp, ncomp)

    def plot_loadings(self, loading_pairs=None, plot_prerotated=False):
        """
        Plot factor loadings in 2-d plots

        Parameters
        ----------
        loading_pairs : None or a list of tuples
            Specify plots. Each tuple (i, j) represents one figure; i and j
            are the loading numbers for the x-axis and y-axis, respectively.
            If `None`, all combinations of the loadings will be plotted.
        plot_prerotated : True or False
            If True, the loadings before rotation applied will be plotted. If
            False, rotated loadings will be plotted.

        Returns
        -------
        figs : a list of figure handles
        """
        _import_mpl()
        from .plots import plot_loadings

        if self.rotation_method is None:
            plot_prerotated = True
        loadings = self.loadings_no_rot if plot_prerotated else self.loadings
        if plot_prerotated:
            title = 'Prerotated Factor Pattern'
        else:
            title = '%s Rotated Factor Pattern' % (self.rotation_method)
        var_explained = self.eigenvals / self.n_comp * 100

        return plot_loadings(loadings, loading_pairs=loading_pairs,
                             title=title, row_names=self.endog_names,
                             percent_variance=var_explained)

    @cache_readonly
    def fitted_cov(self):
        """
        Returns the fitted covariance matrix.
        """

        c = np.dot(self.loadings, self.loadings.T)
        c.flat[::c.shape[0]+1] += self.uniqueness
        return c

    @cache_readonly
    def uniq_stderr(self, kurt=0):
        """
        The standard errors of the uniquenesses.

        Parameters
        ----------
        kurt : float
            Excess kurtosis

        Notes
        -----
        If excess kurtosis is known, provide as `kurt`. Standard
        errors are only available if the model was fit using maximum
        likelihood. If `endog` is not provided, `nobs` must be
        provided to obtain standard errors.

        These are asymptotic standard errors. See Bai and Li (2012)
        for conditions under which the standard errors are valid.

        The standard errors are only applicable to the original,
        unrotated maximum likelihood solution.
        """

        if self.fa_method.lower() != "ml":
            msg = "Standard errors only available under ML estimation"
            raise ValueError(msg)

        if self.nobs is None:
            msg = "nobs is required to obtain standard errors."
            raise ValueError(msg)

        v = self.uniqueness**2 * (2 + kurt)
        return np.sqrt(v / self.nobs)

    @cache_readonly
    def load_stderr(self):
        """
        The standard errors of the loadings.

        Standard errors are only available if the model was fit using
        maximum likelihood. If `endog` is not provided, `nobs` must be
        provided to obtain standard errors.

        These are asymptotic standard errors. See Bai and Li (2012)
        for conditions under which the standard errors are valid.

        The standard errors are only applicable to the original,
        unrotated maximum likelihood solution.
        """

        if self.fa_method.lower() != "ml":
            msg = "Standard errors only available under ML estimation"
            raise ValueError(msg)

        if self.nobs is None:
            msg = "nobs is required to obtain standard errors."
            raise ValueError(msg)

        v = np.outer(self.uniqueness, np.ones(self.loadings.shape[1]))
        return np.sqrt(v / self.nobs)