1"""
2Covariance models and estimators for GEE.
4Some details for the covariance calculations can be found in the Stata
5docs:
7http://www.stata.com/manuals13/xtxtgee.pdf
8"""
9from statsmodels.compat.python import iterkeys, itervalues
10from statsmodels.compat.pandas import Appender
12from statsmodels.stats.correlation_tools import cov_nearest
13import numpy as np
14import pandas as pd
15from scipy import linalg as spl
16from collections import defaultdict
17from statsmodels.tools.sm_exceptions import (ConvergenceWarning, OutputWarning,
18 NotImplementedWarning)
19import warnings


class CovStruct(object):
    """
    Base class for correlation and covariance structures.

    An implementation of this class takes the residuals from a
    regression model that has been fit to grouped data, and uses
    them to estimate the within-group dependence structure of the
    random errors in the model.

    The current state of the covariance structure is represented
    through the value of the `dep_params` attribute.

    The default state of a newly-created instance should always be
    the identity correlation matrix.
    """

    def __init__(self, cov_nearest_method="clipped"):

        # Parameters describing the dependency structure
        self.dep_params = None

        # Keep track of the number of times that the covariance was
        # adjusted.
        self.cov_adjust = []

        # Method for projecting the covariance matrix if it is not
        # PSD.
        self.cov_nearest_method = cov_nearest_method

    def initialize(self, model):
        """
        Called by GEE, used by implementations that need additional
        setup prior to running `fit`.

        Parameters
        ----------
        model : GEE class
            A reference to the parent GEE class instance.
        """
        self.model = model

    def update(self, params):
        """
        Update the association parameter values based on the current
        regression coefficients.

        Parameters
        ----------
        params : array_like
            Working values for the regression parameters.
        """
        raise NotImplementedError

    def covariance_matrix(self, endog_expval, index):
        """
        Returns the working covariance or correlation matrix for a
        given cluster of data.

        Parameters
        ----------
        endog_expval : array_like
            The expected values of endog for the cluster for which the
            covariance or correlation matrix will be returned
        index : int
            The index of the cluster for which the covariance or
            correlation matrix will be returned

        Returns
        -------
        M : matrix
            The covariance or correlation matrix of endog
        is_cor : bool
            True if M is a correlation matrix, False if M is a
            covariance matrix
        """
        raise NotImplementedError

    def covariance_matrix_solve(self, expval, index, stdev, rhs):
        """
        Solves matrix equations of the form `covmat * soln = rhs` and
        returns the values of `soln`, where `covmat` is the covariance
        matrix represented by this class.

        Parameters
        ----------
        expval : array_like
            The expected value of endog for each observed value in the
            group.
        index : int
            The group index.
        stdev : array_like
            The standard deviation of endog for each observation in
            the group.
        rhs : list/tuple of array_like
            A set of right-hand sides; each defines a matrix equation
            to be solved.

        Returns
        -------
        soln : list/tuple of array_like
            The solutions to the matrix equations.

        Notes
        -----
        Returns None if the solver fails.

        Some dependence structures do not use `expval` and/or `index`
        to determine the correlation matrix.  Some families
        (e.g. binomial) do not use the `stdev` parameter when forming
        the covariance matrix.

        If the covariance matrix is singular or not SPD, it is
        projected to the nearest such matrix.  These projection events
        are recorded in the fit_history attribute of the GEE model.

        Systems of linear equations with the covariance matrix as the
        left hand side (LHS) are solved for different right hand sides
        (RHS); the LHS is only factorized once to save time.

        This is a default implementation; it can be reimplemented in
        subclasses to optimize the linear algebra according to the
        structure of the covariance matrix.
        """

        vmat, is_cor = self.covariance_matrix(expval, index)
        if is_cor:
            vmat *= np.outer(stdev, stdev)

        # Factor the covariance matrix.  If the factorization fails,
        # attempt to condition it into a factorizable matrix.
        threshold = 1e-2
        success = False
        cov_adjust = 0
        for itr in range(20):
            try:
                vco = spl.cho_factor(vmat)
                success = True
                break
            except np.linalg.LinAlgError:
                vmat = cov_nearest(vmat, method=self.cov_nearest_method,
                                   threshold=threshold)
                threshold *= 2
                cov_adjust += 1
                msg = "At least one covariance matrix was not PSD "
                msg += "and required projection."
                warnings.warn(msg)

        self.cov_adjust.append(cov_adjust)

        # Last resort if we still cannot factor the covariance matrix.
        if not success:
            warnings.warn(
                "Unable to condition covariance matrix to an SPD "
                "matrix using cov_nearest", ConvergenceWarning)
            vmat = np.diag(np.diag(vmat))
            vco = spl.cho_factor(vmat)

        soln = [spl.cho_solve(vco, x) for x in rhs]
        return soln

    def summary(self):
        """
        Returns a text summary of the current estimate of the
        dependence structure.
        """
        raise NotImplementedError
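

# Illustrative sketch (not part of statsmodels): the minimal interface a
# custom dependence structure must provide is `update`, `covariance_matrix`
# and `summary`; `initialize` and `covariance_matrix_solve` can usually be
# inherited from CovStruct.  The fixed correlation value used here is an
# arbitrary choice made purely for demonstration.
class _FixedCorrelationSketch(CovStruct):

    def __init__(self, corr=0.3):
        super(_FixedCorrelationSketch, self).__init__()
        self.dep_params = corr

    def update(self, params):
        # A fixed structure has nothing to estimate.
        pass

    def covariance_matrix(self, endog_expval, index):
        dim = len(endog_expval)
        mat = self.dep_params * np.ones((dim, dim), dtype=np.float64)
        np.fill_diagonal(mat, 1.0)
        # The second return value indicates that this is a correlation matrix.
        return mat, True

    def summary(self):
        return "Fixed working correlation: %.3f" % self.dep_params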


class Independence(CovStruct):
    """
    An independence working dependence structure.
    """

    @Appender(CovStruct.update.__doc__)
    def update(self, params):
        # Nothing to update
        return

    @Appender(CovStruct.covariance_matrix.__doc__)
    def covariance_matrix(self, expval, index):
        dim = len(expval)
        return np.eye(dim, dtype=np.float64), True

    @Appender(CovStruct.covariance_matrix_solve.__doc__)
    def covariance_matrix_solve(self, expval, index, stdev, rhs):
        v = stdev ** 2
        rslt = []
        for x in rhs:
            if x.ndim == 1:
                rslt.append(x / v)
            else:
                rslt.append(x / v[:, None])
        return rslt

    def summary(self):
        return ("Observations within a cluster are modeled "
                "as being independent.")


class Exchangeable(CovStruct):
    """
    An exchangeable working dependence structure.
    """

    def __init__(self):

        super(Exchangeable, self).__init__()

        # The correlation between any two values in the same cluster
        self.dep_params = 0.

    @Appender(CovStruct.update.__doc__)
    def update(self, params):

        endog = self.model.endog_li

        nobs = self.model.nobs

        varfunc = self.model.family.variance

        cached_means = self.model.cached_means

        has_weights = self.model.weights is not None
        weights_li = self.model.weights

        residsq_sum, scale = 0, 0
        fsum1, fsum2, n_pairs = 0., 0., 0.
        for i in range(self.model.num_group):
            expval, _ = cached_means[i]
            stdev = np.sqrt(varfunc(expval))
            resid = (endog[i] - expval) / stdev
            f = weights_li[i] if has_weights else 1.

            ssr = np.sum(resid * resid)
            scale += f * ssr
            fsum1 += f * len(endog[i])

            residsq_sum += f * (resid.sum() ** 2 - ssr) / 2
            ngrp = len(resid)
            npr = 0.5 * ngrp * (ngrp - 1)
            fsum2 += f * npr
            n_pairs += npr

        ddof = self.model.ddof_scale
        scale /= (fsum1 * (nobs - ddof) / float(nobs))
        residsq_sum /= scale
        self.dep_params = residsq_sum / \
            (fsum2 * (n_pairs - ddof) / float(n_pairs))

    @Appender(CovStruct.covariance_matrix.__doc__)
    def covariance_matrix(self, expval, index):
        dim = len(expval)
        dp = self.dep_params * np.ones((dim, dim), dtype=np.float64)
        np.fill_diagonal(dp, 1)
        return dp, True

    @Appender(CovStruct.covariance_matrix_solve.__doc__)
    def covariance_matrix_solve(self, expval, index, stdev, rhs):

        k = len(expval)
        c = self.dep_params / (1. - self.dep_params)
        c /= 1. + self.dep_params * (k - 1)

        rslt = []
        for x in rhs:
            if x.ndim == 1:
                x1 = x / stdev
                y = x1 / (1. - self.dep_params)
                y -= c * sum(x1)
                y /= stdev
            else:
                x1 = x / stdev[:, None]
                y = x1 / (1. - self.dep_params)
                y -= c * x1.sum(0)
                y /= stdev[:, None]
            rslt.append(y)

        return rslt

    def summary(self):
        return ("The correlation between two observations in the " +
                "same cluster is %.3f" % self.dep_params)


class Nested(CovStruct):
    """
    A nested working dependence structure.

    A nested working dependence structure captures unique variance
    associated with each level in a hierarchy of partitions of the
    cases.  For each level of the hierarchy, there is a set of iid
    random effects with mean zero, and with variance that is specific
    to the level.  These variance parameters are estimated from the
    data using the method of moments.

    The top level of the hierarchy is always defined by the required
    `groups` argument to GEE.

    The `dep_data` argument used to create the GEE defines the
    remaining levels of the hierarchy.  It should be either an array,
    or if using the formula interface, a string that contains a
    formula.  If an array, it should contain a `n_obs x k` matrix of
    labels, corresponding to the k levels of partitioning that are
    nested under the top-level `groups` of the GEE instance.  These
    subgroups should be nested from left to right, so that two
    observations with the same label for column j of `dep_data` should
    also have the same label for all columns j' < j (this only applies
    to observations in the same top-level cluster given by the
    `groups` argument to GEE).

    If `dep_data` is a formula, it should usually be of the form `0 +
    a + b + ...`, where `a`, `b`, etc. contain labels defining group
    membership.  The `0 + ` should be included to prevent creation of
    an intercept.  The variable values are interpreted as labels for
    group membership, but the variables should not be explicitly coded
    as categorical, i.e. use `0 + a` not `0 + C(a)`.

    Notes
    -----
    The calculations for the nested structure involve all pairs of
    observations within the top level `group` passed to GEE.  Large
    group sizes will result in slow iterations.
    """

    def initialize(self, model):
        """
        Called on the first call to update.

        `ilabels` is a list of n_i x n_i matrices containing integer
        labels that correspond to specific correlation parameters.
        Two elements of ilabels[i] with the same label share identical
        variance components.

        `designx` is a matrix, with each row containing dummy
        variables indicating which variance components are associated
        with the corresponding element of QY.
        """

        super(Nested, self).initialize(model)

        if self.model.weights is not None:
            warnings.warn("weights not implemented for nested cov_struct, "
                          "using unweighted covariance estimate",
                          NotImplementedWarning)

        # A bit of processing of the nest data
        id_matrix = np.asarray(self.model.dep_data)
        if id_matrix.ndim == 1:
            id_matrix = id_matrix[:, None]
        self.id_matrix = id_matrix

        endog = self.model.endog_li
        designx, ilabels = [], []

        # The number of layers of nesting
        n_nest = self.id_matrix.shape[1]

        for i in range(self.model.num_group):
            ngrp = len(endog[i])
            glab = self.model.group_labels[i]
            rix = self.model.group_indices[glab]

            # Determine the number of common variance components
            # shared by each pair of observations.
            ix1, ix2 = np.tril_indices(ngrp, -1)
            ncm = (self.id_matrix[rix[ix1], :] ==
                   self.id_matrix[rix[ix2], :]).sum(1)

            # This is used to construct the working correlation
            # matrix.
            ilabel = np.zeros((ngrp, ngrp), dtype=np.int32)
            ilabel[(ix1, ix2)] = ncm + 1
            ilabel[(ix2, ix1)] = ncm + 1
            ilabels.append(ilabel)

            # This is used to estimate the variance components.
            dsx = np.zeros((len(ix1), n_nest + 1), dtype=np.float64)
            dsx[:, 0] = 1
            for k in np.unique(ncm):
                ii = np.flatnonzero(ncm == k)
                dsx[ii, 1:k + 1] = 1
            designx.append(dsx)

        self.designx = np.concatenate(designx, axis=0)
        self.ilabels = ilabels

        svd = np.linalg.svd(self.designx, 0)
        self.designx_u = svd[0]
        self.designx_s = svd[1]
        self.designx_v = svd[2].T

    @Appender(CovStruct.update.__doc__)
    def update(self, params):

        endog = self.model.endog_li

        nobs = self.model.nobs
        dim = len(params)

        if self.designx is None:
            self._compute_design(self.model)

        cached_means = self.model.cached_means

        varfunc = self.model.family.variance

        dvmat = []
        scale = 0.
        for i in range(self.model.num_group):

            expval, _ = cached_means[i]

            stdev = np.sqrt(varfunc(expval))
            resid = (endog[i] - expval) / stdev

            ix1, ix2 = np.tril_indices(len(resid), -1)
            dvmat.append(resid[ix1] * resid[ix2])

            scale += np.sum(resid ** 2)

        dvmat = np.concatenate(dvmat)
        scale /= (nobs - dim)

        # Use least squares regression to estimate the variance
        # components
        vcomp_coeff = np.dot(self.designx_v, np.dot(self.designx_u.T,
                                                    dvmat) / self.designx_s)

        self.vcomp_coeff = np.clip(vcomp_coeff, 0, np.inf)
        self.scale = scale

        self.dep_params = self.vcomp_coeff.copy()

    @Appender(CovStruct.covariance_matrix.__doc__)
    def covariance_matrix(self, expval, index):

        dim = len(expval)

        # First iteration
        if self.dep_params is None:
            return np.eye(dim, dtype=np.float64), True

        ilabel = self.ilabels[index]

        c = np.r_[self.scale, np.cumsum(self.vcomp_coeff)]
        vmat = c[ilabel]
        vmat /= self.scale
        return vmat, True

    def summary(self):
        """
        Returns a summary string describing the state of the
        dependence structure.
        """

        dep_names = ["Groups"]
        if hasattr(self.model, "_dep_data_names"):
            dep_names.extend(self.model._dep_data_names)
        else:
            dep_names.extend(["Component %d:" % (k + 1)
                              for k in range(len(self.vcomp_coeff) - 1)])
        if hasattr(self.model, "_groups_name"):
            dep_names[0] = self.model._groups_name
        dep_names.append("Residual")

        vc = self.vcomp_coeff.tolist()
        vc.append(self.scale - np.sum(vc))

        smry = pd.DataFrame({"Variance": vc}, index=dep_names)

        return smry
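

# Usage sketch (illustrative): one level of subgrouping (`sub`) nested inside
# the top-level clusters (`grp`).  Assumes the GEE class accepts `dep_data` as
# an array, per the class docstring above; the simulated data are made up.
def _example_nested():
    from statsmodels.genmod.generalized_estimating_equations import GEE

    rng = np.random.RandomState(2)
    n_groups, n_sub, sub_size = 20, 3, 4
    grp = np.repeat(np.arange(n_groups), n_sub * sub_size)
    sub = np.tile(np.repeat(np.arange(n_sub), sub_size), n_groups)
    x = rng.normal(size=len(grp))
    y = x + rng.normal(size=len(grp))

    cov = Nested()
    exog = np.column_stack([np.ones_like(x), x])
    model = GEE(y, exog, groups=grp, dep_data=sub[:, None], cov_struct=cov)
    result = model.fit()
    print(cov.summary())  # a DataFrame of estimated variance components
    return result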


class Stationary(CovStruct):
    """
    A stationary covariance structure.

    The correlation between two observations is an arbitrary function
    of the distance between them.  Distances up to a given maximum
    value are included in the covariance model.

    Parameters
    ----------
    max_lag : float
        The largest distance that is included in the covariance model.
    grid : bool
        If True, the index positions in the data (after dropping missing
        values) are used to define distances, and the `time` variable is
        ignored.
    """

    def __init__(self, max_lag=1, grid=False):

        super(Stationary, self).__init__()
        self.max_lag = max_lag
        self.grid = grid
        self.dep_params = np.zeros(max_lag + 1)

    def initialize(self, model):

        super(Stationary, self).initialize(model)

        # Time used as an index needs to be integer type.
        if not self.grid:
            time = self.model.time[:, 0].astype(np.int32)
            self.time = self.model.cluster_list(time)

    @Appender(CovStruct.update.__doc__)
    def update(self, params):

        if self.grid:
            self.update_grid(params)
        else:
            self.update_nogrid(params)

    def update_grid(self, params):

        endog = self.model.endog_li
        cached_means = self.model.cached_means
        varfunc = self.model.family.variance

        dep_params = np.zeros(self.max_lag + 1)
        for i in range(self.model.num_group):

            expval, _ = cached_means[i]
            stdev = np.sqrt(varfunc(expval))
            resid = (endog[i] - expval) / stdev

            dep_params[0] += np.sum(resid * resid) / len(resid)
            for j in range(1, self.max_lag + 1):
                v = resid[j:]
                dep_params[j] += np.sum(resid[0:-j] * v) / len(v)

        dep_params /= dep_params[0]
        self.dep_params = dep_params

    def update_nogrid(self, params):

        endog = self.model.endog_li
        cached_means = self.model.cached_means
        varfunc = self.model.family.variance

        dep_params = np.zeros(self.max_lag + 1)
        dn = np.zeros(self.max_lag + 1)
        resid_ssq = 0
        resid_ssq_n = 0
        for i in range(self.model.num_group):

            expval, _ = cached_means[i]
            stdev = np.sqrt(varfunc(expval))
            resid = (endog[i] - expval) / stdev

            j1, j2 = np.tril_indices(len(expval), -1)
            dx = np.abs(self.time[i][j1] - self.time[i][j2])
            ii = np.flatnonzero(dx <= self.max_lag)
            j1 = j1[ii]
            j2 = j2[ii]
            dx = dx[ii]

            vs = np.bincount(dx, weights=resid[j1] * resid[j2],
                             minlength=self.max_lag + 1)
            vd = np.bincount(dx, minlength=self.max_lag + 1)

            resid_ssq += np.sum(resid**2)
            resid_ssq_n += len(resid)

            ii = np.flatnonzero(vd > 0)
            if len(ii) > 0:
                dn[ii] += 1
                dep_params[ii] += vs[ii] / vd[ii]

        i0 = np.flatnonzero(dn > 0)
        dep_params[i0] /= dn[i0]
        resid_msq = resid_ssq / resid_ssq_n
        dep_params /= resid_msq
        self.dep_params = dep_params

    @Appender(CovStruct.covariance_matrix.__doc__)
    def covariance_matrix(self, endog_expval, index):

        if self.grid:
            return self.covariance_matrix_grid(endog_expval, index)

        j1, j2 = np.tril_indices(len(endog_expval), -1)
        dx = np.abs(self.time[index][j1] - self.time[index][j2])
        ii = np.flatnonzero(dx <= self.max_lag)
        j1 = j1[ii]
        j2 = j2[ii]
        dx = dx[ii]

        cmat = np.eye(len(endog_expval))
        cmat[j1, j2] = self.dep_params[dx]
        cmat[j2, j1] = self.dep_params[dx]

        return cmat, True

    def covariance_matrix_grid(self, endog_expval, index):

        from scipy.linalg import toeplitz
        r = np.zeros(len(endog_expval))
        r[0] = 1
        r[1:self.max_lag + 1] = self.dep_params[1:]
        return toeplitz(r), True

    @Appender(CovStruct.covariance_matrix_solve.__doc__)
    def covariance_matrix_solve(self, expval, index, stdev, rhs):

        if not self.grid:
            return super(Stationary, self).covariance_matrix_solve(
                expval, index, stdev, rhs)

        from statsmodels.tools.linalg import stationary_solve
        r = np.zeros(len(expval))
        r[0:self.max_lag] = self.dep_params[1:]
        return [stationary_solve(r, x) for x in rhs]

    def summary(self):

        lag = np.arange(self.max_lag + 1)
        return pd.DataFrame({"Lag": lag, "Cov": self.dep_params})
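

# Usage sketch (illustrative): with `grid=True` the within-cluster index
# positions define the lags, so no `time` argument is needed.  The simulated
# data and the choice of `max_lag` are arbitrary.
def _example_stationary():
    from statsmodels.genmod.generalized_estimating_equations import GEE

    rng = np.random.RandomState(3)
    n_groups, group_size = 100, 6
    groups = np.repeat(np.arange(n_groups), group_size)
    x = rng.normal(size=n_groups * group_size)
    y = x + rng.normal(size=x.shape)

    cov = Stationary(max_lag=2, grid=True)
    model = GEE(y, np.column_stack([np.ones_like(x), x]), groups,
                cov_struct=cov)
    result = model.fit()
    print(cov.summary())  # a DataFrame of the estimated dependence by lag
    return result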


class Autoregressive(CovStruct):
    """
    A first-order autoregressive working dependence structure.

    The dependence is defined in terms of the `time` component of the
    parent GEE class, which defaults to the index position of each
    value within its cluster, based on the order of values in the
    input data set.  Time represents a potentially multidimensional
    index from which distances between pairs of observations can be
    determined.

    The correlation between two observations in the same cluster is
    dep_params^distance, where `dep_params` contains the (scalar)
    autocorrelation parameter to be estimated, and `distance` is the
    distance between the two observations, calculated from their
    corresponding time values.  `time` is stored as an n_obs x k
    matrix, where `k` represents the number of dimensions in the time
    index.

    The autocorrelation parameter is estimated using weighted
    nonlinear least squares, regressing each value within a cluster on
    each preceding value in the same cluster.

    Parameters
    ----------
    dist_func : function from R^k x R^k to R^+, optional
        A function that computes the distance between the two
        observations based on their `time` values.

    References
    ----------
    B Rosner, A Munoz.  Autoregressive modeling for the analysis of
    longitudinal data with unequally spaced examinations.  Statistics
    in medicine.  Vol 7, 59-71, 1988.
    """

    def __init__(self, dist_func=None):

        super(Autoregressive, self).__init__()

        # The function for determining distances based on time
        if dist_func is None:
            self.dist_func = lambda x, y: np.abs(x - y).sum()
        else:
            self.dist_func = dist_func

        self.designx = None

        # The autocorrelation parameter
        self.dep_params = 0.

    @Appender(CovStruct.update.__doc__)
    def update(self, params):

        if self.model.weights is not None:
            warnings.warn("weights not implemented for autoregressive "
                          "cov_struct, using unweighted covariance estimate",
                          NotImplementedWarning)

        endog = self.model.endog_li
        time = self.model.time_li

        # Only need to compute this once
        if self.designx is not None:
            designx = self.designx
        else:
            designx = []
            for i in range(self.model.num_group):

                ngrp = len(endog[i])
                if ngrp == 0:
                    continue

                # Loop over pairs of observations within a cluster
                for j1 in range(ngrp):
                    for j2 in range(j1):
                        designx.append(self.dist_func(time[i][j1, :],
                                                      time[i][j2, :]))

            designx = np.array(designx)
            self.designx = designx

        scale = self.model.estimate_scale()
        varfunc = self.model.family.variance
        cached_means = self.model.cached_means

        # Weights
        var = 1. - self.dep_params ** (2 * designx)
        var /= 1. - self.dep_params ** 2
        wts = 1. / var
        wts /= wts.sum()

        residmat = []
        for i in range(self.model.num_group):

            expval, _ = cached_means[i]
            stdev = np.sqrt(scale * varfunc(expval))
            resid = (endog[i] - expval) / stdev

            ngrp = len(resid)
            for j1 in range(ngrp):
                for j2 in range(j1):
                    residmat.append([resid[j1], resid[j2]])

        residmat = np.array(residmat)

        # Need to minimize this
        def fitfunc(a):
            dif = residmat[:, 0] - (a ** designx) * residmat[:, 1]
            return np.dot(dif ** 2, wts)

        # Left bracket point
        b_lft, f_lft = 0., fitfunc(0.)

        # Center bracket point
        b_ctr, f_ctr = 0.5, fitfunc(0.5)
        while f_ctr > f_lft:
            b_ctr /= 2
            f_ctr = fitfunc(b_ctr)
            if b_ctr < 1e-8:
                self.dep_params = 0
                return

        # Right bracket point
        b_rgt, f_rgt = 0.75, fitfunc(0.75)
        while f_rgt < f_ctr:
            b_rgt = b_rgt + (1. - b_rgt) / 2
            f_rgt = fitfunc(b_rgt)
            if b_rgt > 1. - 1e-6:
                raise ValueError(
                    "Autoregressive: unable to find right bracket")

        from scipy.optimize import brent
        self.dep_params = brent(fitfunc, brack=[b_lft, b_ctr, b_rgt])

    @Appender(CovStruct.covariance_matrix.__doc__)
    def covariance_matrix(self, endog_expval, index):
        ngrp = len(endog_expval)
        if self.dep_params == 0:
            return np.eye(ngrp, dtype=np.float64), True
        idx = np.arange(ngrp)
        cmat = self.dep_params ** np.abs(idx[:, None] - idx[None, :])
        return cmat, True

    @Appender(CovStruct.covariance_matrix_solve.__doc__)
    def covariance_matrix_solve(self, expval, index, stdev, rhs):
        # The inverse of an AR(1) covariance matrix is tri-diagonal.

        k = len(expval)
        soln = []

        # LHS has 1 column
        if k == 1:
            return [x / stdev ** 2 for x in rhs]

        # LHS has 2 columns
        if k == 2:
            mat = np.array([[1, -self.dep_params], [-self.dep_params, 1]])
            mat /= (1. - self.dep_params ** 2)
            for x in rhs:
                if x.ndim == 1:
                    x1 = x / stdev
                else:
                    x1 = x / stdev[:, None]
                x1 = np.dot(mat, x1)
                if x.ndim == 1:
                    x1 /= stdev
                else:
                    x1 /= stdev[:, None]
                soln.append(x1)
            return soln

        # LHS has >= 3 columns: values c0, c1, c2 defined below give
        # the inverse.  c0 is on the diagonal, except for the first
        # and last position.  c1 is on the first and last position of
        # the diagonal.  c2 is on the sub/super diagonal.
        c0 = (1. + self.dep_params ** 2) / (1. - self.dep_params ** 2)
        c1 = 1. / (1. - self.dep_params ** 2)
        c2 = -self.dep_params / (1. - self.dep_params ** 2)
        soln = []
        for x in rhs:
            flatten = False
            if x.ndim == 1:
                x = x[:, None]
                flatten = True
            x1 = x / stdev[:, None]

            z0 = np.zeros((1, x1.shape[1]))
            rhs1 = np.concatenate((x1[1:, :], z0), axis=0)
            rhs2 = np.concatenate((z0, x1[0:-1, :]), axis=0)

            y = c0 * x1 + c2 * rhs1 + c2 * rhs2
            y[0, :] = c1 * x1[0, :] + c2 * x1[1, :]
            y[-1, :] = c1 * x1[-1, :] + c2 * x1[-2, :]

            y /= stdev[:, None]

            if flatten:
                y = np.squeeze(y)

            soln.append(y)

        return soln

    def summary(self):

        return ("Autoregressive(1) dependence parameter: %.3f\n" %
                self.dep_params)
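

# Usage sketch (illustrative): an AR(1) working correlation where `time`
# gives each observation's position within its cluster.  The data are
# simulated only to keep the example self-contained.
def _example_autoregressive():
    from statsmodels.genmod.generalized_estimating_equations import GEE

    rng = np.random.RandomState(4)
    n_groups, group_size = 100, 5
    groups = np.repeat(np.arange(n_groups), group_size)
    time = np.tile(np.arange(group_size), n_groups)
    x = rng.normal(size=n_groups * group_size)
    y = x + rng.normal(size=x.shape)

    cov = Autoregressive()
    model = GEE(y, np.column_stack([np.ones_like(x), x]), groups,
                time=time, cov_struct=cov)
    result = model.fit()
    print(cov.summary())
    return result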


class CategoricalCovStruct(CovStruct):
    """
    Parent class for covariance structure for categorical data models.

    Attributes
    ----------
    nlevel : int
        The number of distinct levels for the outcome variable.
    ibd : list
        A list whose i^th element ibd[i] is an array whose rows
        contain integer pairs (a,b), where endog_li[i][a:b] is the
        subvector of binary indicators derived from the same ordinal
        value.
    """

    def initialize(self, model):

        super(CategoricalCovStruct, self).initialize(model)

        self.nlevel = len(model.endog_values)
        self._ncut = self.nlevel - 1

        from numpy.lib.stride_tricks import as_strided
        b = np.dtype(np.int64).itemsize

        ibd = []
        for v in model.endog_li:
            jj = np.arange(0, len(v) + 1, self._ncut, dtype=np.int64)
            jj = as_strided(jj, shape=(len(jj) - 1, 2), strides=(b, b))
            ibd.append(jj)

        self.ibd = ibd


class GlobalOddsRatio(CategoricalCovStruct):
    """
    Estimate the global odds ratio for a GEE with ordinal or nominal
    data.

    References
    ----------
    PJ Heagerty and S Zeger. "Marginal Regression Models for Clustered
    Ordinal Measurements". Journal of the American Statistical
    Association Vol. 91, Issue 435 (1996).

    Thomas Lumley. Generalized Estimating Equations for Ordinal Data:
    A Note on Working Correlation Structures. Biometrics Vol. 52,
    No. 1 (Mar., 1996), pp. 354-361.
    http://www.jstor.org/stable/2533173

    Notes
    -----
    The following data structures are calculated in the class:

    `ibd` is a list whose i^th element ibd[i] is a sequence of integer
    pairs (a,b), where endog_li[i][a:b] is the subvector of binary
    indicators derived from the same ordinal value.

    `cpp` is a dictionary where cpp[group] is a map from cut-point
    pairs (c,c') to the indices of all between-subject pairs derived
    from the given cut points.
    """

    def __init__(self, endog_type):
        super(GlobalOddsRatio, self).__init__()
        self.endog_type = endog_type
        self.dep_params = 0.

    def initialize(self, model):

        super(GlobalOddsRatio, self).initialize(model)

        if self.model.weights is not None:
            warnings.warn("weights not implemented for GlobalOddsRatio "
                          "cov_struct, using unweighted covariance estimate",
                          NotImplementedWarning)

        # Need to restrict to between-subject pairs
        cpp = []
        for v in model.endog_li:

            # Number of subjects in this group
            m = int(len(v) / self._ncut)
            i1, i2 = np.tril_indices(m, -1)

            cpp1 = {}
            for k1 in range(self._ncut):
                for k2 in range(k1 + 1):
                    jj = np.zeros((len(i1), 2), dtype=np.int64)
                    jj[:, 0] = i1 * self._ncut + k1
                    jj[:, 1] = i2 * self._ncut + k2
                    cpp1[(k2, k1)] = jj

            cpp.append(cpp1)

        self.cpp = cpp

        # Initialize the dependence parameters
        self.crude_or = self.observed_crude_oddsratio()
        if self.model.update_dep:
            self.dep_params = self.crude_or

    def pooled_odds_ratio(self, tables):
        """
        Returns the pooled odds ratio for a list of 2x2 tables.

        The pooled odds ratio is the inverse variance weighted average
        of the sample odds ratios of the tables.
        """

        if len(tables) == 0:
            return 1.

        # Get the sampled odds ratios and variances
        log_oddsratio, var = [], []
        for table in tables:
            lor = np.log(table[1, 1]) + np.log(table[0, 0]) -\
                np.log(table[0, 1]) - np.log(table[1, 0])
            log_oddsratio.append(lor)
            var.append((1 / table.astype(np.float64)).sum())

        # Calculate the inverse variance weighted average
        wts = [1 / v for v in var]
        wtsum = sum(wts)
        wts = [w / wtsum for w in wts]
        log_pooled_or = sum([w * e for w, e in zip(wts, log_oddsratio)])

        return np.exp(log_pooled_or)

    @Appender(CovStruct.covariance_matrix.__doc__)
    def covariance_matrix(self, expected_value, index):

        vmat = self.get_eyy(expected_value, index)
        vmat -= np.outer(expected_value, expected_value)
        return vmat, False

    def observed_crude_oddsratio(self):
        """
        To obtain the crude (global) odds ratio, first pool all binary
        indicators corresponding to a given pair of cut points (c,c'),
        then calculate the odds ratio for this 2x2 table.  The crude
        odds ratio is the inverse variance weighted average of these
        odds ratios.  Since the covariate effects are ignored, this OR
        will generally be greater than the stratified OR.
        """

        cpp = self.cpp
        endog = self.model.endog_li

        # Storage for the contingency tables for each (c,c')
        tables = {}
        for ii in iterkeys(cpp[0]):
            tables[ii] = np.zeros((2, 2), dtype=np.float64)

        # Get the observed crude OR
        for i in range(len(endog)):

            # The observed joint values for the current cluster
            yvec = endog[i]
            endog_11 = np.outer(yvec, yvec)
            endog_10 = np.outer(yvec, 1. - yvec)
            endog_01 = np.outer(1. - yvec, yvec)
            endog_00 = np.outer(1. - yvec, 1. - yvec)

            cpp1 = cpp[i]
            for ky in iterkeys(cpp1):
                ix = cpp1[ky]
                tables[ky][1, 1] += endog_11[ix[:, 0], ix[:, 1]].sum()
                tables[ky][1, 0] += endog_10[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 1] += endog_01[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 0] += endog_00[ix[:, 0], ix[:, 1]].sum()

        return self.pooled_odds_ratio(list(itervalues(tables)))

    def get_eyy(self, endog_expval, index):
        """
        Returns a matrix V such that V[i,j] is the joint probability
        that endog[i] = 1 and endog[j] = 1, based on the marginal
        probabilities of endog and the global odds ratio `current_or`.
        """

        current_or = self.dep_params
        ibd = self.ibd[index]

        # The between-observation joint probabilities
        if current_or == 1.0:
            vmat = np.outer(endog_expval, endog_expval)
        else:
            psum = endog_expval[:, None] + endog_expval[None, :]
            pprod = endog_expval[:, None] * endog_expval[None, :]
            pfac = np.sqrt((1. + psum * (current_or - 1.)) ** 2 +
                           4 * current_or * (1. - current_or) * pprod)
            vmat = 1. + psum * (current_or - 1.) - pfac
            vmat /= 2. * (current_or - 1)

        # Fix E[YY'] for elements that belong to same observation
        for bdl in ibd:
            evy = endog_expval[bdl[0]:bdl[1]]
            if self.endog_type == "ordinal":
                vmat[bdl[0]:bdl[1], bdl[0]:bdl[1]] =\
                    np.minimum.outer(evy, evy)
            else:
                vmat[bdl[0]:bdl[1], bdl[0]:bdl[1]] = np.diag(evy)

        return vmat

    @Appender(CovStruct.update.__doc__)
    def update(self, params):
        """
        Update the global odds ratio based on the current value of
        params.
        """

        cpp = self.cpp
        cached_means = self.model.cached_means

        # This will happen if all the clusters have only
        # one observation
        if len(cpp[0]) == 0:
            return

        tables = {}
        for ii in cpp[0]:
            tables[ii] = np.zeros((2, 2), dtype=np.float64)

        for i in range(self.model.num_group):

            endog_expval, _ = cached_means[i]

            emat_11 = self.get_eyy(endog_expval, i)
            emat_10 = endog_expval[:, None] - emat_11
            emat_01 = -emat_11 + endog_expval
            emat_00 = 1. - (emat_11 + emat_10 + emat_01)

            cpp1 = cpp[i]
            for ky in iterkeys(cpp1):
                ix = cpp1[ky]
                tables[ky][1, 1] += emat_11[ix[:, 0], ix[:, 1]].sum()
                tables[ky][1, 0] += emat_10[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 1] += emat_01[ix[:, 0], ix[:, 1]].sum()
                tables[ky][0, 0] += emat_00[ix[:, 0], ix[:, 1]].sum()

        cor_expval = self.pooled_odds_ratio(list(itervalues(tables)))

        self.dep_params *= self.crude_or / cor_expval
        if not np.isfinite(self.dep_params):
            self.dep_params = 1.
            warnings.warn("dep_params became inf, resetting to 1",
                          ConvergenceWarning)

    def summary(self):

        return "Global odds ratio: %.3f\n" % self.dep_params


class OrdinalIndependence(CategoricalCovStruct):
    """
    An independence covariance structure for ordinal models.

    The working covariance between indicators derived from different
    observations is zero.  The working covariance between indicators
    derived from a common observation is determined from their current
    mean values.

    There are no parameters to estimate in this covariance structure.
    """

    def covariance_matrix(self, expected_value, index):

        ibd = self.ibd[index]
        n = len(expected_value)
        vmat = np.zeros((n, n))

        for bdl in ibd:
            ev = expected_value[bdl[0]:bdl[1]]
            vmat[bdl[0]:bdl[1], bdl[0]:bdl[1]] =\
                np.minimum.outer(ev, ev) - np.outer(ev, ev)

        return vmat, False

    # Nothing to update
    def update(self, params):
        pass


class NominalIndependence(CategoricalCovStruct):
    """
    An independence covariance structure for nominal models.

    The working covariance between indicators derived from different
    observations is zero.  The working covariance between indicators
    derived from a common observation is determined from their current
    mean values.

    There are no parameters to estimate in this covariance structure.
    """

    def covariance_matrix(self, expected_value, index):

        ibd = self.ibd[index]
        n = len(expected_value)
        vmat = np.zeros((n, n))

        for bdl in ibd:
            ev = expected_value[bdl[0]:bdl[1]]
            vmat[bdl[0]:bdl[1], bdl[0]:bdl[1]] =\
                np.diag(ev) - np.outer(ev, ev)

        return vmat, False

    # Nothing to update
    def update(self, params):
        pass
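

# Usage sketch (illustrative): the independence structures above can be used
# in place of GlobalOddsRatio when no within-cluster association between the
# derived binary indicators is to be modeled.  The simulated outcome here is
# unrelated to the covariate and serves only to make the example runnable.
def _example_nominal_independence():
    from statsmodels.genmod.generalized_estimating_equations import NominalGEE

    rng = np.random.RandomState(6)
    n_groups, group_size = 100, 4
    groups = np.repeat(np.arange(n_groups), group_size)
    x = rng.normal(size=n_groups * group_size)
    y = rng.randint(0, 3, size=x.shape[0])  # three unordered categories

    cov = NominalIndependence()
    model = NominalGEE(y, np.column_stack([np.ones_like(x), x]), groups,
                       cov_struct=cov)
    result = model.fit()
    return result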


class Equivalence(CovStruct):
    """
    A covariance structure defined in terms of equivalence classes.

    An 'equivalence class' is a set of pairs of observations such that
    the covariance of every pair within the equivalence class has a
    common value.

    Parameters
    ----------
    pairs : dict-like
        A dictionary of dictionaries, where `pairs[group][label]`
        provides the indices of all pairs of observations in the group
        that have the same covariance value.  Specifically,
        `pairs[group][label]` is a tuple `(j1, j2)`, where `j1` and `j2`
        are integer arrays of the same length.  `j1[i], j2[i]` is one
        index pair that belongs to the `label` equivalence class.  Only
        one triangle of each covariance matrix should be included.
        Positions where j1 and j2 have the same value are variance
        parameters.
    labels : array_like
        An array of labels such that every distinct pair of labels
        defines an equivalence class.  Either `labels` or `pairs` must
        be provided.  When the two labels in a pair are equal, two
        equivalence classes are defined: one for the diagonal elements
        (corresponding to variances) and one for the off-diagonal
        elements (corresponding to covariances).
    return_cov : bool
        If True, `covariance_matrix` returns an estimate of the
        covariance matrix, otherwise returns an estimate of the
        correlation matrix.

    Notes
    -----
    Using `labels` to define the class is much easier than using
    `pairs`, but is less general.

    Any pair of values not contained in `pairs` will be assigned zero
    covariance.

    The index values in `pairs` are row indices into the `exog`
    matrix.  They are not updated if missing data are present.  When
    using this covariance structure, missing data should be removed
    before constructing the model.

    If using `labels`, after a model is defined using the covariance
    structure it is possible to remove a label pair from the second
    level of the `pairs` dictionary to force the corresponding
    covariance to be zero.

    Examples
    --------
    The following sets up the `pairs` dictionary for a model with two
    groups, equal variance for all observations, and constant
    covariance for all pairs of observations within each group.

    >> pairs = {0: {}, 1: {}}
    >> pairs[0][0] = (np.r_[0, 1, 2], np.r_[0, 1, 2])
    >> pairs[0][1] = np.tril_indices(3, -1)
    >> pairs[1][0] = (np.r_[3, 4, 5], np.r_[3, 4, 5])
    >> pairs[1][2] = 3 + np.tril_indices(3, -1)
    """

    def __init__(self, pairs=None, labels=None, return_cov=False):

        super(Equivalence, self).__init__()

        if (pairs is None) and (labels is None):
            raise ValueError(
                "Equivalence cov_struct requires either `pairs` or `labels`")

        if (pairs is not None) and (labels is not None):
            raise ValueError(
                "Equivalence cov_struct accepts only one of `pairs` "
                "and `labels`")

        if pairs is not None:
            import copy
            self.pairs = copy.deepcopy(pairs)

        if labels is not None:
            self.labels = np.asarray(labels)

        self.return_cov = return_cov

    def _make_pairs(self, i, j):
        """
        Create arrays containing all unique ordered pairs of i, j.

        The arrays i and j must be one-dimensional containing non-negative
        integers.
        """

        mat = np.zeros((len(i) * len(j), 2), dtype=np.int32)

        # Create the pairs and order them
        f = np.ones(len(j))
        mat[:, 0] = np.kron(f, i).astype(np.int32)
        f = np.ones(len(i))
        mat[:, 1] = np.kron(j, f).astype(np.int32)
        mat.sort(1)

        # Remove repeated rows
        try:
            dtype = np.dtype((np.void, mat.dtype.itemsize * mat.shape[1]))
            bmat = np.ascontiguousarray(mat).view(dtype)
            _, idx = np.unique(bmat, return_index=True)
        except TypeError:
            # workaround for old numpy that cannot call unique with complex
            # dtypes
            rs = np.random.RandomState(4234)
            bmat = np.dot(mat, rs.uniform(size=mat.shape[1]))
            _, idx = np.unique(bmat, return_index=True)
        mat = mat[idx, :]

        return mat[:, 0], mat[:, 1]

    def _pairs_from_labels(self):

        from collections import defaultdict
        pairs = defaultdict(lambda: defaultdict(lambda: None))

        model = self.model

        df = pd.DataFrame({"labels": self.labels, "groups": model.groups})
        gb = df.groupby(["groups", "labels"])

        ulabels = np.unique(self.labels)

        for g_ix, g_lb in enumerate(model.group_labels):

            # Loop over label pairs
            for lx1 in range(len(ulabels)):
                for lx2 in range(lx1 + 1):

                    lb1 = ulabels[lx1]
                    lb2 = ulabels[lx2]

                    try:
                        i1 = gb.groups[(g_lb, lb1)]
                        i2 = gb.groups[(g_lb, lb2)]
                    except KeyError:
                        continue

                    i1, i2 = self._make_pairs(i1, i2)

                    clabel = str(lb1) + "/" + str(lb2)

                    # Variance parameters belong in their own equiv class.
                    jj = np.flatnonzero(i1 == i2)
                    if len(jj) > 0:
                        clabelv = clabel + "/v"
                        pairs[g_lb][clabelv] = (i1[jj], i2[jj])

                    # Covariance parameters
                    jj = np.flatnonzero(i1 != i2)
                    if len(jj) > 0:
                        i1 = i1[jj]
                        i2 = i2[jj]
                        pairs[g_lb][clabel] = (i1, i2)

        self.pairs = pairs

    def initialize(self, model):

        super(Equivalence, self).initialize(model)

        if self.model.weights is not None:
            warnings.warn("weights not implemented for equivalence "
                          "cov_struct, using unweighted covariance estimate",
                          NotImplementedWarning)

        if not hasattr(self, 'pairs'):
            self._pairs_from_labels()

        # Initialize so that any equivalence class containing a
        # variance parameter has value 1.
        self.dep_params = defaultdict(lambda: 0.)
        self._var_classes = set([])
        for gp in self.model.group_labels:
            for lb in self.pairs[gp]:
                j1, j2 = self.pairs[gp][lb]
                if np.any(j1 == j2):
                    if not np.all(j1 == j2):
                        warnings.warn(
                            "equivalence class contains both variance "
                            "and covariance parameters", OutputWarning)
                    self._var_classes.add(lb)
                    self.dep_params[lb] = 1

        # Need to start indexing at 0 within each group.
        # rx maps old indices to new indices
        rx = -1 * np.ones(len(self.model.endog), dtype=np.int32)
        for g_ix, g_lb in enumerate(self.model.group_labels):
            ii = self.model.group_indices[g_lb]
            rx[ii] = np.arange(len(ii), dtype=np.int32)

        # Reindex
        for gp in self.model.group_labels:
            for lb in self.pairs[gp].keys():
                a, b = self.pairs[gp][lb]
                self.pairs[gp][lb] = (rx[a], rx[b])

    @Appender(CovStruct.update.__doc__)
    def update(self, params):

        endog = self.model.endog_li
        varfunc = self.model.family.variance
        cached_means = self.model.cached_means
        dep_params = defaultdict(lambda: [0., 0., 0.])
        n_pairs = defaultdict(lambda: 0)
        dim = len(params)

        for k, gp in enumerate(self.model.group_labels):
            expval, _ = cached_means[k]
            stdev = np.sqrt(varfunc(expval))
            resid = (endog[k] - expval) / stdev
            for lb in self.pairs[gp].keys():
                if (not self.return_cov) and lb in self._var_classes:
                    continue
                jj = self.pairs[gp][lb]
                dep_params[lb][0] += np.sum(resid[jj[0]] * resid[jj[1]])
                if not self.return_cov:
                    dep_params[lb][1] += np.sum(resid[jj[0]] ** 2)
                    dep_params[lb][2] += np.sum(resid[jj[1]] ** 2)
                n_pairs[lb] += len(jj[0])

        if self.return_cov:
            for lb in dep_params.keys():
                dep_params[lb] = dep_params[lb][0] / (n_pairs[lb] - dim)
        else:
            for lb in dep_params.keys():
                den = np.sqrt(dep_params[lb][1] * dep_params[lb][2])
                dep_params[lb] = dep_params[lb][0] / den
            for lb in self._var_classes:
                dep_params[lb] = 1.

        self.dep_params = dep_params
        self.n_pairs = n_pairs

    @Appender(CovStruct.covariance_matrix.__doc__)
    def covariance_matrix(self, expval, index):

        dim = len(expval)
        cmat = np.zeros((dim, dim))
        g_lb = self.model.group_labels[index]

        for lb in self.pairs[g_lb].keys():
            j1, j2 = self.pairs[g_lb][lb]
            cmat[j1, j2] = self.dep_params[lb]

        cmat = cmat + cmat.T
        np.fill_diagonal(cmat, cmat.diagonal() / 2)

        return cmat, not self.return_cov
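

# Usage sketch (illustrative): using `labels` to define the equivalence
# classes, here two alternating measurement types within each cluster.  The
# simulated data and label layout are made up for the example.
def _example_equivalence():
    from statsmodels.genmod.generalized_estimating_equations import GEE

    rng = np.random.RandomState(7)
    n_groups, group_size = 50, 4
    groups = np.repeat(np.arange(n_groups), group_size)
    labels = np.tile([0, 0, 1, 1], n_groups)
    x = rng.normal(size=n_groups * group_size)
    y = x + rng.normal(size=x.shape)

    cov = Equivalence(labels=labels)
    model = GEE(y, np.column_stack([np.ones_like(x), x]), groups,
                cov_struct=cov)
    result = model.fit()
    print(dict(cov.dep_params))  # estimated value for each equivalence class
    return result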