# -*- coding: utf-8 -*-
"""

Created on Fri Aug 17 13:10:52 2012

Author: Josef Perktold
License: BSD-3
"""

import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import svds
from scipy.optimize import fminbound
import warnings

from statsmodels.tools.tools import Bunch
from statsmodels.tools.sm_exceptions import (
    IterationLimitWarning, iteration_limit_doc)


def clip_evals(x, value=0):  # threshold=0, value=0):
    # Clip the eigenvalues of the symmetric matrix `x` from below at
    # `value`, and report whether any eigenvalue was clipped.
    evals, evecs = np.linalg.eigh(x)
    clipped = np.any(evals < value)
    x_new = np.dot(evecs * np.maximum(evals, value), evecs.T)
    return x_new, clipped


def corr_nearest(corr, threshold=1e-15, n_fact=100):
    '''
    Find the nearest correlation matrix that is positive semi-definite.

    The function iteratively adjusts the correlation matrix by clipping the
    eigenvalues of a difference matrix. The diagonal elements are set to one.

    Parameters
    ----------
    corr : ndarray, (k, k)
        initial correlation matrix
    threshold : float
        clipping threshold for smallest eigenvalue, see Notes
    n_fact : int or float
        factor to determine the maximum number of iterations. The maximum
        number of iterations is the integer part of the number of columns in
        the correlation matrix times n_fact.

    Returns
    -------
    corr_new : ndarray, (k, k)
        corrected correlation matrix

    Notes
    -----
    The smallest eigenvalue of the corrected correlation matrix is
    approximately equal to the ``threshold``.
    If ``threshold=0``, then the smallest eigenvalue of the correlation
    matrix might be negative, but zero within numerical error, for example
    in the range of -1e-16.

    Assumes the input correlation matrix is symmetric.

    Stops after the first step if the correlation matrix is already positive
    semi-definite or positive definite, so that the smallest eigenvalue is
    above the threshold. In this case, the returned array is not the
    original, but is equal to it within numerical precision.

    See Also
    --------
    corr_clipped
    cov_nearest
    '''
    k_vars = corr.shape[0]
    if k_vars != corr.shape[1]:
        raise ValueError("matrix is not square")

    diff = np.zeros(corr.shape)
    x_new = corr.copy()
    diag_idx = np.arange(k_vars)

    for ii in range(int(len(corr) * n_fact)):
        x_adj = x_new - diff
        x_psd, clipped = clip_evals(x_adj, value=threshold)
        if not clipped:
            x_new = x_psd
            break
        diff = x_psd - x_adj
        x_new = x_psd.copy()
        x_new[diag_idx, diag_idx] = 1
    else:
        warnings.warn(iteration_limit_doc, IterationLimitWarning)

    return x_new
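

# Usage sketch for ``corr_nearest`` (editor's addition, not part of the
# original module). The matrix below is a small hypothetical example: it is
# symmetric with unit diagonal, but the strong positive correlations of the
# first variable with the other two are incompatible with their strong
# negative correlation, so the matrix is not positive semi-definite.
#
# >>> import numpy as np
# >>> from statsmodels.stats.correlation_tools import corr_nearest
# >>> x = np.array([[1.0, 0.9, 0.9],
# ...               [0.9, 1.0, -0.9],
# ...               [0.9, -0.9, 1.0]])
# >>> np.linalg.eigvalsh(x)[0] < 0  # smallest eigenvalue is negative
# True
# >>> y = corr_nearest(x, threshold=1e-15)
# >>> np.linalg.eigvalsh(y)[0] > -1e-12  # PSD up to numerical error
# True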


def corr_clipped(corr, threshold=1e-15):
    '''
    Find a near correlation matrix that is positive semi-definite

    This function clips the eigenvalues, replacing eigenvalues smaller than
    the threshold by the threshold. The new matrix is normalized, so that
    the diagonal elements are one.
    Compared to corr_nearest, the distance between the original correlation
    matrix and the positive definite correlation matrix is larger; however,
    it is much faster since it only computes eigenvalues once.

    Parameters
    ----------
    corr : ndarray, (k, k)
        initial correlation matrix
    threshold : float
        clipping threshold for smallest eigenvalue, see Notes

    Returns
    -------
    corr_new : ndarray, (k, k)
        corrected correlation matrix

    Notes
    -----
    The smallest eigenvalue of the corrected correlation matrix is
    approximately equal to the ``threshold``. In examples, the smallest
    eigenvalue can be smaller than the threshold by a factor of 10,
    e.g. a threshold of 1e-8 can result in a smallest eigenvalue in the
    range between 1e-9 and 1e-8.
    If ``threshold=0``, then the smallest eigenvalue of the correlation
    matrix might be negative, but zero within numerical error, for example
    in the range of -1e-16.

    Assumes the input correlation matrix is symmetric. The diagonal
    elements of the returned correlation matrix are set to ones.

    If the correlation matrix is already positive semi-definite given the
    threshold, then the original correlation matrix is returned.

    ``corr_clipped`` is 40 or more times faster than ``corr_nearest`` in a
    simple example, but has a slightly larger approximation error.

    See Also
    --------
    corr_nearest
    cov_nearest
    '''
    x_new, clipped = clip_evals(corr, value=threshold)
    if not clipped:
        return corr

    # cov2corr
    x_std = np.sqrt(np.diag(x_new))
    x_new = x_new / x_std / x_std[:, None]
    return x_new
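

# Usage sketch for ``corr_clipped`` (editor's addition): one eigenvalue
# clip plus renormalization repairs the same hypothetical indefinite matrix
# used in the ``corr_nearest`` sketch above, at lower cost.
#
# >>> y = corr_clipped(x, threshold=1e-15)
# >>> np.allclose(np.diag(y), 1.0)  # diagonal renormalized to ones
# True
# >>> np.linalg.eigvalsh(y)[0] > -1e-12  # PSD up to numerical error
# True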


def cov_nearest(cov, method='clipped', threshold=1e-15, n_fact=100,
                return_all=False):
    """
    Find the nearest covariance matrix that is positive (semi-) definite.

    This leaves the diagonal, i.e. the variance, unchanged.

    Parameters
    ----------
    cov : ndarray, (k, k)
        initial covariance matrix
    method : str
        if "clipped", then the faster but less accurate ``corr_clipped`` is
        used. If "nearest", then ``corr_nearest`` is used.
    threshold : float
        clipping threshold for smallest eigenvalue, see Notes
    n_fact : int or float
        factor to determine the maximum number of iterations in
        ``corr_nearest``. See its docstring.
    return_all : bool
        if False (default), then only the covariance matrix is returned.
        If True, then the correlation matrix and standard deviation are
        additionally returned.

    Returns
    -------
    cov_ : ndarray
        corrected covariance matrix
    corr_ : ndarray, (optional)
        corrected correlation matrix
    std_ : ndarray, (optional)
        standard deviation

    Notes
    -----
    This converts the covariance matrix to a correlation matrix. Then, it
    finds the nearest correlation matrix that is positive semidefinite and
    converts it back to a covariance matrix using the initial standard
    deviation.

    The smallest eigenvalue of the intermediate correlation matrix is
    approximately equal to the ``threshold``.
    If ``threshold=0``, then the smallest eigenvalue of the correlation
    matrix might be negative, but zero within numerical error, for example
    in the range of -1e-16.

    Assumes the input covariance matrix is symmetric.

    See Also
    --------
    corr_nearest
    corr_clipped
    """
    from statsmodels.stats.moment_helpers import cov2corr, corr2cov
    cov_, std_ = cov2corr(cov, return_std=True)
    if method == 'clipped':
        corr_ = corr_clipped(cov_, threshold=threshold)
    else:  # method == 'nearest'
        corr_ = corr_nearest(cov_, threshold=threshold, n_fact=n_fact)

    cov_ = corr2cov(corr_, std_)

    if return_all:
        return cov_, corr_, std_
    else:
        return cov_
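

# Usage sketch for ``cov_nearest`` (editor's addition): repair a
# hypothetical indefinite covariance matrix while keeping its variances.
#
# >>> c = np.array([[4.0, 1.8, 1.8],
# ...               [1.8, 1.0, -0.9],
# ...               [1.8, -0.9, 1.0]])
# >>> c_fixed = cov_nearest(c)  # default method='clipped'
# >>> np.allclose(np.diag(c_fixed), np.diag(c))  # variances unchanged
# True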


def _nmono_linesearch(obj, grad, x, d, obj_hist, M=10, sig1=0.1,
                      sig2=0.9, gam=1e-4, maxiter=100):
    """
    Implements the non-monotone line search of Grippo et al. (1986),
    as described in Birgin, Martinez and Raydan (2013).

    Parameters
    ----------
    obj : real-valued function
        The objective function, to be minimized
    grad : vector-valued function
        The gradient of the objective function
    x : array_like
        The starting point for the line search
    d : array_like
        The search direction
    obj_hist : array_like
        Objective function history (must contain at least one value)
    M : positive int
        Number of previous function points to consider (see references
        for details).
    sig1 : real
        Tuning parameter, see references for details.
    sig2 : real
        Tuning parameter, see references for details.
    gam : real
        Tuning parameter, see references for details.
    maxiter : int
        The maximum number of iterations; returns None values if
        convergence does not occur by this point

    Returns
    -------
    alpha : real
        The step value
    x : array_like
        The function argument at the final step
    obval : real
        The function value at the final step
    g : array_like
        The gradient at the final step

    Notes
    -----
    The basic idea is to take a big step in the direction of the
    gradient, even if the function value is not decreased (but there
    is a maximum allowed increase in terms of the recent history of
    the iterates).

    References
    ----------
    Grippo L, Lampariello F, Lucidi S (1986). A Nonmonotone Line
    Search Technique for Newton's Method. SIAM Journal on Numerical
    Analysis, 23, 707-716.

    E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected
    gradient methods: Review and perspectives. Journal of Statistical
    Software (preprint).
    """
    alpha = 1.
    last_obval = obj(x)
    obj_max = max(obj_hist[-M:])

    for itr in range(maxiter):

        obval = obj(x + alpha*d)
        # Evaluate the gradient at the trial point so that the caller
        # receives the gradient at the accepted iterate.
        g = grad(x + alpha*d)
        gtd = (g * d).sum()

        if obval <= obj_max + gam*alpha*gtd:
            return alpha, x + alpha*d, obval, g

        a1 = -0.5*alpha**2*gtd / (obval - last_obval - alpha*gtd)

        if (sig1 <= a1) and (a1 <= sig2*alpha):
            alpha = a1
        else:
            alpha /= 2.

        last_obval = obval

    return None, None, None, None
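

# Toy sketch of ``_nmono_linesearch`` (editor's addition): a 1-d quadratic,
# searched along the descent direction d = -1 from x = 1. The step
# alpha = 1 lands exactly at the minimum and is accepted immediately.
#
# >>> obj = lambda z: float((z**2).sum())
# >>> grad = lambda z: 2*z
# >>> alpha, z1, obval, g = _nmono_linesearch(
# ...     obj, grad, np.array([1.0]), np.array([-1.0]), [1.0])
# >>> alpha, z1, obval
# (1.0, array([0.]), 0.0)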


def _spg_optim(func, grad, start, project, maxiter=1e4, M=10,
               ctol=1e-3, maxiter_nmls=200, lam_min=1e-30,
               lam_max=1e30, sig1=0.1, sig2=0.9, gam=1e-4):
    """
    Implements the spectral projected gradient method for minimizing a
    differentiable function on a convex domain.

    Parameters
    ----------
    func : real valued function
        The objective function to be minimized.
    grad : real array-valued function
        The gradient of the objective function
    start : array_like
        The starting point
    project : function
        In-place projection of the argument to the domain
        of func.
    ...
        See Notes regarding additional arguments.

    Returns
    -------
    rslt : Bunch
        rslt.params is the final iterate; other fields describe the
        convergence status.

    Notes
    -----
    This can be an effective heuristic algorithm for problems where no
    guaranteed algorithm for computing a global minimizer is known.

    There are a number of tuning parameters, but these generally
    should not be changed except for `maxiter` (positive integer) and
    `ctol` (small positive real). See the Birgin et al. reference for
    more information about the tuning parameters.

    References
    ----------
    E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected
    gradient methods: Review and perspectives. Journal of Statistical
    Software (preprint). Available at:
    http://www.ime.usp.br/~egbirgin/publications/bmr5.pdf
    """
    lam = min(10*lam_min, lam_max)

    params = start.copy()
    gval = grad(params)

    obj_hist = [func(params), ]

    for itr in range(int(maxiter)):

        # Check convergence
        df = params - gval
        project(df)
        df -= params
        if np.max(np.abs(df)) < ctol:
            return Bunch(**{"Converged": True, "params": params,
                            "objective_values": obj_hist,
                            "Message": "Converged successfully"})

        # The line search direction
        d = params - lam*gval
        project(d)
        d -= params

        # Carry out the nonmonotone line search
        alpha, params1, fval, gval1 = _nmono_linesearch(
            func,
            grad,
            params,
            d,
            obj_hist,
            M=M,
            sig1=sig1,
            sig2=sig2,
            gam=gam,
            maxiter=maxiter_nmls)

        if alpha is None:
            return Bunch(**{"Converged": False, "params": params,
                            "objective_values": obj_hist,
                            "Message": "Failed in nmono_linesearch"})

        obj_hist.append(fval)
        s = params1 - params
        y = gval1 - gval

        sy = (s*y).sum()
        if sy <= 0:
            lam = lam_max
        else:
            ss = (s*s).sum()
            lam = max(lam_min, min(ss/sy, lam_max))

        params = params1
        gval = gval1

    return Bunch(**{"Converged": False, "params": params,
                    "objective_values": obj_hist,
                    "Message": "spg_optim did not converge"})
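

# Sketch of ``_spg_optim`` (editor's addition): minimize ||params - c||^2
# over the non-negative orthant. The projection must modify its argument
# in place, as the optimizer requires.
#
# >>> c = np.array([1.0, -1.0, 2.0])
# >>> func = lambda p: float(((p - c)**2).sum())
# >>> grad = lambda p: 2*(p - c)
# >>> def project(p):
# ...     np.clip(p, 0, None, out=p)
# >>> rslt = _spg_optim(func, grad, np.zeros(3), project)
# >>> rslt.Converged
# True
# >>> np.allclose(rslt.params, [1.0, 0.0, 2.0], atol=1e-3)
# True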


def _project_correlation_factors(X):
    """
    Project a matrix into the domain of matrices whose row-wise sums
    of squares are less than or equal to 1.

    The input matrix is modified in-place.
    """
    nm = np.sqrt((X*X).sum(1))
    ii = np.flatnonzero(nm > 1)
    if len(ii) > 0:
        X[ii, :] /= nm[ii][:, None]
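

# Sketch of ``_project_correlation_factors`` (editor's addition): rows with
# norm greater than one are rescaled to unit norm, in place; other rows are
# left untouched.
#
# >>> X = np.array([[3.0, 4.0], [0.1, 0.2]])
# >>> _project_correlation_factors(X)
# >>> X
# array([[0.6, 0.8],
#        [0.1, 0.2]])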


class FactoredPSDMatrix:
    """
    Representation of a positive semidefinite matrix in factored form.

    The representation is constructed based on a vector `diag` and
    rectangular matrix `root`, such that the PSD matrix represented by
    the class instance is Diag + root * root', where Diag is the
    square diagonal matrix with `diag` on its main diagonal.

    Parameters
    ----------
    diag : 1d array_like
        See above
    root : 2d array_like
        See above

    Notes
    -----
    The matrix is represented internally in the form Diag^{1/2}(I +
    factor * scales * factor')Diag^{1/2}, where `Diag` and `scales`
    are diagonal matrices, and `factor` is an orthogonal matrix.
    """

    def __init__(self, diag, root):
        self.diag = diag
        self.root = root
        root = root / np.sqrt(diag)[:, None]
        u, s, vt = np.linalg.svd(root, full_matrices=False)
        self.factor = u
        self.scales = s**2

    def to_matrix(self):
        """
        Returns the PSD matrix represented by this instance as a full
        (square) matrix.
        """
        return np.diag(self.diag) + np.dot(self.root, self.root.T)

    def decorrelate(self, rhs):
        """
        Decorrelate the columns of `rhs`.

        Parameters
        ----------
        rhs : array_like
            A 2 dimensional array with the same number of rows as the
            PSD matrix represented by the class instance.

        Returns
        -------
        C^{-1/2} * rhs, where C is the covariance matrix represented
        by this class instance.

        Notes
        -----
        The returned matrix has the identity matrix as its row-wise
        population covariance matrix.

        This function exploits the factor structure for efficiency.
        """
        # I + factor * qval * factor' is the inverse square root of
        # the covariance matrix in the homogeneous case where diag =
        # 1.
        qval = -1 + 1 / np.sqrt(1 + self.scales)

        # Decorrelate in the general case.
        rhs = rhs / np.sqrt(self.diag)[:, None]
        rhs1 = np.dot(self.factor.T, rhs)
        rhs1 *= qval[:, None]
        rhs1 = np.dot(self.factor, rhs1)
        rhs += rhs1

        return rhs

    def solve(self, rhs):
        """
        Solve a linear system of equations with factor-structured
        coefficients.

        Parameters
        ----------
        rhs : array_like
            A 2 dimensional array with the same number of rows as the
            PSD matrix represented by the class instance.

        Returns
        -------
        C^{-1} * rhs, where C is the covariance matrix represented
        by this class instance.

        Notes
        -----
        This function exploits the factor structure for efficiency.
        """
        qval = -self.scales / (1 + self.scales)
        dr = np.sqrt(self.diag)
        rhs = rhs / dr[:, None]
        mat = qval[:, None] * np.dot(self.factor.T, rhs)
        rhs = rhs + np.dot(self.factor, mat)
        return rhs / dr[:, None]

    def logdet(self):
        """
        Returns the logarithm of the determinant of a
        factor-structured matrix.
        """
        logdet = np.sum(np.log(self.diag))
        logdet += np.sum(np.log(self.scales))
        logdet += np.sum(np.log(1 + 1 / self.scales))

        return logdet
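

# Sketch for ``FactoredPSDMatrix`` (editor's addition): build a small
# factored matrix and check the structured solve and log-determinant
# against dense linear algebra. The numbers are hypothetical.
#
# >>> diag = np.array([1.0, 2.0, 3.0])
# >>> root = np.array([[0.5], [0.3], [0.2]])
# >>> fm = FactoredPSDMatrix(diag, root)
# >>> mat = fm.to_matrix()  # np.diag(diag) + root @ root.T
# >>> np.allclose(fm.solve(np.eye(3)), np.linalg.inv(mat))
# True
# >>> np.allclose(fm.logdet(), np.log(np.linalg.det(mat)))
# True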


def corr_nearest_factor(corr, rank, ctol=1e-6, lam_min=1e-30,
                        lam_max=1e30, maxiter=1000):
    """
    Find the nearest correlation matrix with factor structure to a
    given square matrix.

    Parameters
    ----------
    corr : square array
        The target matrix (to which the nearest correlation matrix is
        sought). Must be square, but need not be positive
        semidefinite.
    rank : int
        The rank of the factor structure of the solution, i.e., the
        number of linearly independent columns of X.
    ctol : positive real
        Convergence criterion.
    lam_min : float
        Tuning parameter for spectral projected gradient optimization
        (smallest allowed step in the search direction).
    lam_max : float
        Tuning parameter for spectral projected gradient optimization
        (largest allowed step in the search direction).
    maxiter : int
        Maximum number of iterations in spectral projected gradient
        optimization.

    Returns
    -------
    rslt : Bunch
        rslt.corr is a FactoredPSDMatrix defining the estimated
        correlation structure. Other fields of `rslt` contain
        returned values from spg_optim.

    Notes
    -----
    A correlation matrix has factor structure if it can be written in
    the form I + XX' - diag(XX'), where X is n x k with linearly
    independent columns, and with each row having sum of squares at
    most equal to 1. The approximation is made in terms of the
    Frobenius norm.

    This routine is useful when one has an approximate correlation
    matrix that is not positive semidefinite, and there is a need to
    estimate the inverse, square root, or inverse square root of the
    population correlation matrix. The factor structure allows these
    tasks to be done without constructing any n x n matrices.

    This is a non-convex problem with no known guaranteed globally
    convergent algorithm for computing the solution. Borsdorf, Higham
    and Raydan (2010) compared several methods for this problem and
    found the spectral projected gradient (SPG) method (used here) to
    perform best.

    The input matrix `corr` can be a dense numpy array or any scipy
    sparse matrix. The latter is useful if the input matrix is
    obtained by thresholding a very large sample correlation matrix.
    If `corr` is sparse, the calculations are optimized to save
    memory, so no working matrix with more than 10^6 elements is
    constructed.

    References
    ----------
    .. [*] R Borsdorf, N Higham, M Raydan (2010). Computing a nearest
       correlation matrix with factor structure. SIAM J Matrix Anal Appl,
       31:5, 2603-2622.
       http://eprints.ma.man.ac.uk/1523/01/covered/MIMS_ep2009_87.pdf

    Examples
    --------
    Hard thresholding a correlation matrix may result in a matrix that
    is not positive semidefinite. We can approximate a hard
    thresholded correlation matrix with a PSD matrix as follows, where
    `corr` is the input correlation matrix.

    >>> import numpy as np
    >>> from statsmodels.stats.correlation_tools import corr_nearest_factor
    >>> np.random.seed(1234)
    >>> b = 1.5 - np.random.rand(10, 1)
    >>> x = np.random.randn(100, 1).dot(b.T) + np.random.randn(100, 10)
    >>> corr = np.corrcoef(x.T)
    >>> corr = corr * (np.abs(corr) >= 0.3)
    >>> rslt = corr_nearest_factor(corr, 3)
    """
    p, _ = corr.shape

    # Starting values (following the PCA method in BHR).
    u, s, vt = svds(corr, rank)
    X = u * np.sqrt(s)
    nm = np.sqrt((X**2).sum(1))
    ii = np.flatnonzero(nm > 1e-5)
    X[ii, :] /= nm[ii][:, None]

    # Zero the diagonal
    corr1 = corr.copy()
    if isinstance(corr1, np.ndarray):
        np.fill_diagonal(corr1, 0)
    elif sparse.issparse(corr1):
        corr1.setdiag(np.zeros(corr1.shape[0]))
        corr1.eliminate_zeros()
        corr1.sort_indices()
    else:
        raise ValueError("Matrix type not supported")

    # The gradient, from lemma 4.1 of BHR.
    def grad(X):
        gr = np.dot(X, np.dot(X.T, X))
        if isinstance(corr1, np.ndarray):
            gr -= np.dot(corr1, X)
        else:
            gr -= corr1.dot(X)
        gr -= (X*X).sum(1)[:, None] * X
        return 4*gr

    # The objective function (sum of squared deviations between fitted
    # and observed arrays).
    def func(X):
        if isinstance(corr1, np.ndarray):
            M = np.dot(X, X.T)
            np.fill_diagonal(M, 0)
            M -= corr1
            fval = (M*M).sum()
            return fval
        else:
            fval = 0.
            # Control the size of intermediates
            max_ws = 1e6
            bs = int(max_ws / X.shape[0])
            ir = 0
            while ir < X.shape[0]:
                ir2 = min(ir+bs, X.shape[0])
                u = np.dot(X[ir:ir2, :], X.T)
                ii = np.arange(u.shape[0])
                u[ii, ir+ii] = 0
                u -= np.asarray(corr1[ir:ir2, :].todense())
                fval += (u*u).sum()
                ir += bs
            return fval

    rslt = _spg_optim(func, grad, X, _project_correlation_factors, ctol=ctol,
                      lam_min=lam_min, lam_max=lam_max, maxiter=maxiter)
    root = rslt.params
    diag = 1 - (root**2).sum(1)
    soln = FactoredPSDMatrix(diag, root)
    rslt.corr = soln
    del rslt.params
    return rslt
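

# Editor's note (not in the original module): the fitted structure from the
# docstring example above is returned as a FactoredPSDMatrix, so the dense
# correlation matrix, or solutions of linear systems in it, can be obtained
# without forming large intermediate arrays until explicitly requested:
#
# >>> corr_psd = rslt.corr.to_matrix()  # dense k x k array, if needed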


def cov_nearest_factor_homog(cov, rank):
    """
    Approximate an arbitrary square matrix with a factor-structured
    matrix of the form k*I + XX'.

    Parameters
    ----------
    cov : array_like
        The input array, must be square but need not be positive
        semidefinite
    rank : int
        The rank of the fitted factor structure

    Returns
    -------
    A FactoredPSDMatrix instance containing the fitted matrix

    Notes
    -----
    This routine is useful if one has an estimated covariance matrix
    that is not SPD, and the ultimate goal is to estimate the inverse,
    square root, or inverse square root of the true covariance
    matrix. The factor structure allows these tasks to be performed
    without constructing any n x n matrices.

    The calculations use the fact that if k is known, then X can be
    determined from the eigen-decomposition of cov - k*I, which can
    in turn be easily obtained from the eigen-decomposition of `cov`.
    Thus the problem can be reduced to a 1-dimensional search for k
    that does not require repeated eigen-decompositions.

    If the input matrix is sparse, then cov - k*I is also sparse, so
    the eigen-decomposition can be done efficiently using sparse
    routines.

    The one-dimensional search for the optimal value of k is not
    convex, so a local minimum could be obtained.

    Examples
    --------
    Hard thresholding a covariance matrix may result in a matrix that
    is not positive semidefinite. We can approximate a hard
    thresholded covariance matrix with a PSD matrix as follows:

    >>> import numpy as np
    >>> np.random.seed(1234)
    >>> b = 1.5 - np.random.rand(10, 1)
    >>> x = np.random.randn(100, 1).dot(b.T) + np.random.randn(100, 10)
    >>> cov = np.cov(x)
    >>> cov = cov * (np.abs(cov) >= 0.3)
    >>> rslt = cov_nearest_factor_homog(cov, 3)
    """
    m, n = cov.shape

    Q, Lambda, _ = svds(cov, rank)

    if sparse.issparse(cov):
        QSQ = np.dot(Q.T, cov.dot(Q))
        ts = cov.diagonal().sum()
        tss = cov.dot(cov).diagonal().sum()
    else:
        QSQ = np.dot(Q.T, np.dot(cov, Q))
        ts = np.trace(cov)
        tss = np.trace(np.dot(cov, cov))

    def fun(k):
        Lambda_t = Lambda - k
        v = tss + m*(k**2) + np.sum(Lambda_t**2) - 2*k*ts
        v += 2*k*np.sum(Lambda_t) - 2*np.sum(np.diag(QSQ) * Lambda_t)
        return v

    # Get the optimal decomposition
    k_opt = fminbound(fun, 0, 1e5)
    Lambda_opt = Lambda - k_opt
    fac_opt = Q * np.sqrt(Lambda_opt)

    diag = k_opt * np.ones(m, dtype=np.float64)  # - (fac_opt**2).sum(1)
    return FactoredPSDMatrix(diag, fac_opt)
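

# Editor's note (not in the original module): continuing the docstring
# example, the returned FactoredPSDMatrix exposes the fitted matrix and
# factor-aware operations such as ``solve`` and ``logdet``:
#
# >>> dense = rslt.to_matrix()  # dense approximation k*I + XX', if needed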


def corr_thresholded(data, minabs=None, max_elt=1e7):
    r"""
    Construct a sparse matrix containing the thresholded row-wise
    correlation matrix from a data array.

    Parameters
    ----------
    data : array_like
        The data from which the row-wise thresholded correlation
        matrix is to be computed.
    minabs : non-negative real
        The threshold value; correlation coefficients smaller in
        magnitude than minabs are set to zero. If None, defaults
        to 1 / sqrt(n), see Notes for more information.
    max_elt : int
        The maximum number of elements in any intermediate matrix
        constructed during the calculation (see Notes).

    Returns
    -------
    cormat : sparse.coo_matrix
        The thresholded correlation matrix, in COO format.

    Notes
    -----
    This is an alternative to C = np.corrcoef(data); C \*= (np.abs(C)
    >= minabs), suitable for very tall data matrices.

    If the data are jointly Gaussian, the marginal sampling
    distributions of the elements of the sample correlation matrix are
    approximately Gaussian with standard deviation 1 / sqrt(n). The
    default value of ``minabs`` is thus equal to 1 standard error, which
    will set to zero approximately 68% of the estimated correlation
    coefficients for which the population value is zero.

    No intermediate matrix with more than ``max_elt`` values will be
    constructed. However, memory use could still be high if a large
    number of correlation values exceed `minabs` in magnitude.

    The thresholded matrix is returned in COO format, which can easily
    be converted to other sparse formats.

    Examples
    --------
    Here X is a tall data matrix (e.g. with 100,000 rows and 50
    columns). The row-wise correlation matrix of X is calculated
    and stored in sparse form, with all entries smaller than 0.3
    treated as 0.

    >>> import numpy as np
    >>> np.random.seed(1234)
    >>> b = 1.5 - np.random.rand(10, 1)
    >>> x = np.random.randn(100, 1).dot(b.T) + np.random.randn(100, 10)
    >>> cmat = corr_thresholded(x, 0.3)
    """
    nrow, ncol = data.shape

    if minabs is None:
        minabs = 1. / np.sqrt(ncol)  # one standard error; see Notes

    # Row-standardize the data
    data = data.copy()
    data -= data.mean(1)[:, None]
    sd = data.std(1, ddof=1)
    ii = np.flatnonzero(sd > 1e-5)
    data[ii, :] /= sd[ii][:, None]
    ii = np.flatnonzero(sd <= 1e-5)
    data[ii, :] = 0

    # Number of rows to process in one pass
    bs = int(np.floor(max_elt / nrow))

    ipos_all, jpos_all, cor_values = [], [], []

    ir = 0
    while ir < nrow:
        ir2 = min(data.shape[0], ir + bs)
        cm = np.dot(data[ir:ir2, :], data.T) / (ncol - 1)
        cma = np.abs(cm)
        ipos, jpos = np.nonzero(cma >= minabs)
        ipos_all.append(ipos + ir)
        jpos_all.append(jpos)
        cor_values.append(cm[ipos, jpos])
        ir += bs

    ipos = np.concatenate(ipos_all)
    jpos = np.concatenate(jpos_all)
    cor_values = np.concatenate(cor_values)

    cmat = sparse.coo_matrix((cor_values, (ipos, jpos)), (nrow, nrow))

    return cmat
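

# Editor's note (not in the original module): the COO result converts
# cheaply to other sparse formats when slicing or arithmetic is needed:
#
# >>> cmat_csr = cmat.tocsr()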


class MultivariateKernel(object):
    """
    Base class for multivariate kernels.

    An instance of MultivariateKernel implements a `call` method having
    signature `call(x, loc)`, returning the kernel weights comparing `x`
    (a 1d ndarray) to each row of `loc` (a 2d ndarray).
    """

    def call(self, x, loc):
        raise NotImplementedError

    def set_bandwidth(self, bw):
        """
        Set the bandwidth to the given vector.

        Parameters
        ----------
        bw : array_like
            A vector of non-negative bandwidth values.
        """
        self.bw = bw
        self._setup()

    def _setup(self):

        # Precompute the bandwidth product and the squared bandwidths.
        self.bwk = np.prod(self.bw)
        self.bw2 = self.bw * self.bw

    def set_default_bw(self, loc, bwm=None):
        """
        Set default bandwidths based on domain values.

        Parameters
        ----------
        loc : array_like
            Values from the domain to which the kernel will
            be applied.
        bwm : scalar, optional
            A non-negative scalar that is used to multiply
            the default bandwidth.
        """
        # Rule-of-thumb bandwidth per dimension:
        # 0.9 * min(sd, IQR/1.349) * n^(-1/5).
        sd = loc.std(0)
        q25, q75 = np.percentile(loc, [25, 75], axis=0)
        iqr = (q75 - q25) / 1.349
        bw = np.where(iqr < sd, iqr, sd)
        bw *= 0.9 / loc.shape[0] ** 0.2

        if bwm is not None:
            bw *= bwm

        # The final bandwidths
        self.bw = np.asarray(bw, dtype=np.float64)

        self._setup()


class GaussianMultivariateKernel(MultivariateKernel):
    """
    The Gaussian (squared exponential) multivariate kernel.
    """

    def call(self, x, loc):
        # Product-form Gaussian kernel: sum the squared standardized
        # distances across components, then exponentiate.
        return np.exp(-((x - loc)**2 / self.bw2).sum(1) / 2) / self.bwk


def kernel_covariance(exog, loc, groups, kernel=None, bw=None):
    """
    Use kernel averaging to estimate a multivariate covariance function.

    The goal is to estimate a covariance function C(x, y) =
    cov(Z(x), Z(y)) where x, y are vectors in R^p (e.g. representing
    locations in time or space), and Z(.) represents a multivariate
    process on R^p.

    The data used for estimation can be observed at arbitrary values of
    the position vector, and there can be multiple independent
    observations from the process.

    Parameters
    ----------
    exog : array_like
        The rows of exog are realizations of the process obtained at
        specified points.
    loc : array_like
        The rows of loc are the locations (e.g. in space or time) at
        which the rows of exog are observed.
    groups : array_like
        The values of groups are labels for distinct independent copies
        of the process.
    kernel : MultivariateKernel instance, optional
        An instance of MultivariateKernel, defaults to
        GaussianMultivariateKernel.
    bw : array_like or scalar
        A bandwidth vector, or bandwidth multiplier. If a 1d array, it
        contains kernel bandwidths for each component of the process, and
        must have length equal to the number of columns of exog. If a
        scalar, bw is a bandwidth multiplier used to adjust the default
        bandwidth; if None, a default bandwidth is used.

    Returns
    -------
    A real-valued function C(x, y) that returns an estimate of the
    covariance between values of the process located at x and y.

    References
    ----------
    .. [1] Genton M, W Kleiber (2015). Cross covariance functions for
       multivariate geostatistics. Statistical Science 30(2).
       https://arxiv.org/pdf/1507.08017.pdf
    """
    exog = np.asarray(exog)
    loc = np.asarray(loc)
    groups = np.asarray(groups)

    if loc.ndim == 1:
        loc = loc[:, None]

    v = [exog.shape[0], loc.shape[0], len(groups)]
    if min(v) != max(v):
        msg = "exog, loc, and groups must have the same number of rows"
        raise ValueError(msg)

    # Map from group labels to the row indices in each group.
    ix = {}
    for i, g in enumerate(groups):
        if g not in ix:
            ix[g] = []
        ix[g].append(i)
    for g in ix.keys():
        ix[g] = np.sort(ix[g])

    if kernel is None:
        kernel = GaussianMultivariateKernel()

    if bw is None:
        kernel.set_default_bw(loc)
    elif np.isscalar(bw):
        kernel.set_default_bw(loc, bwm=bw)
    else:
        kernel.set_bandwidth(bw)

    def cov(x, y):

        kx = kernel.call(x, loc)
        ky = kernel.call(y, loc)

        cm, cw = 0., 0.

        for g, ii in ix.items():

            m = len(ii)
            j1, j2 = np.indices((m, m))
            j1 = ii[j1.flat]
            j2 = ii[j2.flat]
            w = kx[j1] * ky[j2]

            # TODO: some other form of broadcasting may be faster than
            # einsum here
            cm += np.einsum("ij,ik,i->jk", exog[j1, :], exog[j2, :], w)
            cw += w.sum()

        if cw < 1e-10:
            msg = ("Effective sample size is 0. The bandwidth may be too "
                   "small, or you are outside the range of your data.")
            warnings.warn(msg)
            return np.nan * np.ones_like(cm)

        return cm / cw

    return cov
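

# Usage sketch for ``kernel_covariance`` (editor's addition): a process
# observed at random 1-d locations, with 20 hypothetical independent
# replications of 5 observations each.
#
# >>> np.random.seed(1234)
# >>> groups = np.repeat(np.arange(20), 5)
# >>> loc = np.random.uniform(size=(100, 1))
# >>> exog = np.random.normal(size=(100, 3))
# >>> cov_fn = kernel_covariance(exog, loc, groups)
# >>> cov_fn(np.r_[0.3], np.r_[0.5])  # estimated 3 x 3 covariance matrix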