Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/genmod/generalized_estimating_equations.py : 12%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2Procedures for fitting marginal regression models to dependent data
3using Generalized Estimating Equations.
5References
6----------
7KY Liang and S Zeger. "Longitudinal data analysis using
8generalized linear models". Biometrika (1986) 73 (1): 13-22.
10S Zeger and KY Liang. "Longitudinal Data Analysis for Discrete and
11Continuous Outcomes". Biometrics Vol. 42, No. 1 (Mar., 1986),
12pp. 121-130
14A Rotnitzky and NP Jewell (1990). "Hypothesis testing of regression
15parameters in semiparametric generalized linear models for cluster
16correlated data", Biometrika, 77, 485-497.
18Xu Guo and Wei Pan (2002). "Small sample performance of the score
19test in GEE".
20http://www.sph.umn.edu/faculty1/wp-content/uploads/2012/11/rr2002-013.pdf
22LA Mancl LA, TA DeRouen (2001). A covariance estimator for GEE with
23improved small-sample properties. Biometrics. 2001 Mar;57(1):126-34.
24"""
25from statsmodels.compat.python import lzip
26from statsmodels.compat.pandas import Appender
28import numpy as np
29from scipy import stats
30import pandas as pd
31import patsy
32from collections import defaultdict
33from statsmodels.tools.decorators import cache_readonly
34import statsmodels.base.model as base
35# used for wrapper:
36import statsmodels.regression.linear_model as lm
37import statsmodels.base.wrapper as wrap
39from statsmodels.genmod import families
40from statsmodels.genmod.generalized_linear_model import GLM
41from statsmodels.genmod import cov_struct as cov_structs
43import statsmodels.genmod.families.varfuncs as varfuncs
44from statsmodels.genmod.families.links import Link
46from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
47 DomainWarning,
48 IterationLimitWarning,
49 ValueWarning)
50import warnings
52from statsmodels.graphics._regressionplots_doc import (
53 _plot_added_variable_doc,
54 _plot_partial_residuals_doc,
55 _plot_ceres_residuals_doc)
56from statsmodels.discrete.discrete_margins import (
57 _get_margeff_exog, _check_margeff_args, _effects_at, margeff_cov_with_se,
58 _check_at_is_all, _transform_names, _check_discrete_args,
59 _get_dummy_index, _get_count_index)
62class ParameterConstraint(object):
63 """
64 A class for managing linear equality constraints for a parameter
65 vector.
66 """
68 def __init__(self, lhs, rhs, exog):
69 """
70 Parameters
71 ----------
72 lhs : ndarray
73 A q x p matrix which is the left hand side of the
74 constraint lhs * param = rhs. The number of constraints is
75 q >= 1 and p is the dimension of the parameter vector.
76 rhs : ndarray
77 A 1-dimensional vector of length q which is the right hand
78 side of the constraint equation.
79 exog : ndarray
80 The n x p exognenous data for the full model.
81 """
83 # In case a row or column vector is passed (patsy linear
84 # constraints passes a column vector).
85 rhs = np.atleast_1d(rhs.squeeze())
87 if rhs.ndim > 1:
88 raise ValueError("The right hand side of the constraint "
89 "must be a vector.")
91 if len(rhs) != lhs.shape[0]:
92 raise ValueError("The number of rows of the left hand "
93 "side constraint matrix L must equal "
94 "the length of the right hand side "
95 "constraint vector R.")
97 self.lhs = lhs
98 self.rhs = rhs
100 # The columns of lhs0 are an orthogonal basis for the
101 # orthogonal complement to row(lhs), the columns of lhs1 are
102 # an orthogonal basis for row(lhs). The columns of lhsf =
103 # [lhs0, lhs1] are mutually orthogonal.
104 lhs_u, lhs_s, lhs_vt = np.linalg.svd(lhs.T, full_matrices=1)
105 self.lhs0 = lhs_u[:, len(lhs_s):]
106 self.lhs1 = lhs_u[:, 0:len(lhs_s)]
107 self.lhsf = np.hstack((self.lhs0, self.lhs1))
109 # param0 is one solution to the underdetermined system
110 # L * param = R.
111 self.param0 = np.dot(self.lhs1, np.dot(lhs_vt, self.rhs) /
112 lhs_s)
114 self._offset_increment = np.dot(exog, self.param0)
116 self.orig_exog = exog
117 self.exog_fulltrans = np.dot(exog, self.lhsf)
119 def offset_increment(self):
120 """
121 Returns a vector that should be added to the offset vector to
122 accommodate the constraint.
124 Parameters
125 ----------
126 exog : array_like
127 The exogeneous data for the model.
128 """
130 return self._offset_increment
132 def reduced_exog(self):
133 """
134 Returns a linearly transformed exog matrix whose columns span
135 the constrained model space.
137 Parameters
138 ----------
139 exog : array_like
140 The exogeneous data for the model.
141 """
142 return self.exog_fulltrans[:, 0:self.lhs0.shape[1]]
144 def restore_exog(self):
145 """
146 Returns the full exog matrix before it was reduced to
147 satisfy the constraint.
148 """
149 return self.orig_exog
151 def unpack_param(self, params):
152 """
153 Converts the parameter vector `params` from reduced to full
154 coordinates.
155 """
157 return self.param0 + np.dot(self.lhs0, params)
159 def unpack_cov(self, bcov):
160 """
161 Converts the covariance matrix `bcov` from reduced to full
162 coordinates.
163 """
165 return np.dot(self.lhs0, np.dot(bcov, self.lhs0.T))
168_gee_init_doc = """
169 Marginal regression model fit using Generalized Estimating Equations.
171 GEE can be used to fit Generalized Linear Models (GLMs) when the
172 data have a grouped structure, and the observations are possibly
173 correlated within groups but not between groups.
175 Parameters
176 ----------
177 endog : array_like
178 1d array of endogenous values (i.e. responses, outcomes,
179 dependent variables, or 'Y' values).
180 exog : array_like
181 2d array of exogeneous values (i.e. covariates, predictors,
182 independent variables, regressors, or 'X' values). A `nobs x
183 k` array where `nobs` is the number of observations and `k` is
184 the number of regressors. An intercept is not included by
185 default and should be added by the user. See
186 `statsmodels.tools.add_constant`.
187 groups : array_like
188 A 1d array of length `nobs` containing the group labels.
189 time : array_like
190 A 2d array of time (or other index) values, used by some
191 dependence structures to define similarity relationships among
192 observations within a cluster.
193 family : family class instance
194%(family_doc)s
195 cov_struct : CovStruct class instance
196 The default is Independence. To specify an exchangeable
197 structure use cov_struct = Exchangeable(). See
198 statsmodels.genmod.cov_struct.CovStruct for more
199 information.
200 offset : array_like
201 An offset to be included in the fit. If provided, must be
202 an array whose length is the number of rows in exog.
203 dep_data : array_like
204 Additional data passed to the dependence structure.
205 constraint : (ndarray, ndarray)
206 If provided, the constraint is a tuple (L, R) such that the
207 model parameters are estimated under the constraint L *
208 param = R, where L is a q x p matrix and R is a
209 q-dimensional vector. If constraint is provided, a score
210 test is performed to compare the constrained model to the
211 unconstrained model.
212 update_dep : bool
213 If true, the dependence parameters are optimized, otherwise
214 they are held fixed at their starting values.
215 weights : array_like
216 An array of weights to use in the analysis. The weights must
217 be constant within each group. These correspond to
218 probability weights (pweights) in Stata.
219 %(extra_params)s
221 See Also
222 --------
223 statsmodels.genmod.families.family
224 :ref:`families`
225 :ref:`links`
227 Notes
228 -----
229 Only the following combinations make sense for family and link ::
231 + ident log logit probit cloglog pow opow nbinom loglog logc
232 Gaussian | x x x
233 inv Gaussian | x x x
234 binomial | x x x x x x x x x
235 Poisson | x x x
236 neg binomial | x x x x
237 gamma | x x x
239 Not all of these link functions are currently available.
241 Endog and exog are references so that if the data they refer
242 to are already arrays and these arrays are changed, endog and
243 exog will change.
245 The "robust" covariance type is the standard "sandwich estimator"
246 (e.g. Liang and Zeger (1986)). It is the default here and in most
247 other packages. The "naive" estimator gives smaller standard
248 errors, but is only correct if the working correlation structure
249 is correctly specified. The "bias reduced" estimator of Mancl and
250 DeRouen (Biometrics, 2001) reduces the downward bias of the robust
251 estimator.
253 The robust covariance provided here follows Liang and Zeger (1986)
254 and agrees with R's gee implementation. To obtain the robust
255 standard errors reported in Stata, multiply by sqrt(N / (N - g)),
256 where N is the total sample size, and g is the average group size.
258 Examples
259 --------
260 %(example)s
261"""
263_gee_family_doc = """\
264 The default is Gaussian. To specify the binomial
265 distribution use `family=sm.families.Binomial()`. Each family
266 can take a link instance as an argument. See
267 statsmodels.genmod.families.family for more information."""
269_gee_ordinal_family_doc = """\
270 The only family supported is `Binomial`. The default `Logit`
271 link may be replaced with `probit` if desired."""
273_gee_nominal_family_doc = """\
274 The default value `None` uses a multinomial logit family
275 specifically designed for use with GEE. Setting this
276 argument to a non-default value is not currently supported."""
278_gee_fit_doc = """
279 Fits a marginal regression model using generalized estimating
280 equations (GEE).
282 Parameters
283 ----------
284 maxiter : int
285 The maximum number of iterations
286 ctol : float
287 The convergence criterion for stopping the Gauss-Seidel
288 iterations
289 start_params : array_like
290 A vector of starting values for the regression
291 coefficients. If None, a default is chosen.
292 params_niter : int
293 The number of Gauss-Seidel updates of the mean structure
294 parameters that take place prior to each update of the
295 dependence structure.
296 first_dep_update : int
297 No dependence structure updates occur before this
298 iteration number.
299 cov_type : str
300 One of "robust", "naive", or "bias_reduced".
301 ddof_scale : scalar or None
302 The scale parameter is estimated as the sum of squared
303 Pearson residuals divided by `N - ddof_scale`, where N
304 is the total sample size. If `ddof_scale` is None, the
305 number of covariates (including an intercept if present)
306 is used.
307 scaling_factor : scalar
308 The estimated covariance of the parameter estimates is
309 scaled by this value. Default is 1, Stata uses N / (N - g),
310 where N is the total sample size and g is the average group
311 size.
312 scale : str or float, optional
313 `scale` can be None, 'X2', or a float
314 If a float, its value is used as the scale parameter.
315 The default value is None, which uses `X2` (Pearson's
316 chi-square) for Gamma, Gaussian, and Inverse Gaussian.
317 The default is 1 for the Binomial and Poisson families.
319 Returns
320 -------
321 An instance of the GEEResults class or subclass
323 Notes
324 -----
325 If convergence difficulties occur, increase the values of
326 `first_dep_update` and/or `params_niter`. Setting
327 `first_dep_update` to a greater value (e.g. ~10-20) causes the
328 algorithm to move close to the GLM solution before attempting
329 to identify the dependence structure.
331 For the Gaussian family, there is no benefit to setting
332 `params_niter` to a value greater than 1, since the mean
333 structure parameters converge in one step.
334"""
336_gee_results_doc = """
337 Attributes
338 ----------
340 cov_params_default : ndarray
341 default covariance of the parameter estimates. Is chosen among one
342 of the following three based on `cov_type`
343 cov_robust : ndarray
344 covariance of the parameter estimates that is robust
345 cov_naive : ndarray
346 covariance of the parameter estimates that is not robust to
347 correlation or variance misspecification
348 cov_robust_bc : ndarray
349 covariance of the parameter estimates that is robust and bias
350 reduced
351 converged : bool
352 indicator for convergence of the optimization.
353 True if the norm of the score is smaller than a threshold
354 cov_type : str
355 string indicating whether a "robust", "naive" or "bias_reduced"
356 covariance is used as default
357 fit_history : dict
358 Contains information about the iterations.
359 fittedvalues : ndarray
360 Linear predicted values for the fitted model.
361 dot(exog, params)
362 model : class instance
363 Pointer to GEE model instance that called `fit`.
364 normalized_cov_params : ndarray
365 See GEE docstring
366 params : ndarray
367 The coefficients of the fitted model. Note that
368 interpretation of the coefficients often depends on the
369 distribution family and the data.
370 scale : float
371 The estimate of the scale / dispersion for the model fit.
372 See GEE.fit for more information.
373 score_norm : float
374 norm of the score at the end of the iterative estimation.
375 bse : ndarray
376 The standard errors of the fitted GEE parameters.
377"""
379_gee_example = """
380 Logistic regression with autoregressive working dependence:
382 >>> import statsmodels.api as sm
383 >>> family = sm.families.Binomial()
384 >>> va = sm.cov_struct.Autoregressive()
385 >>> model = sm.GEE(endog, exog, group, family=family, cov_struct=va)
386 >>> result = model.fit()
387 >>> print(result.summary())
389 Use formulas to fit a Poisson GLM with independent working
390 dependence:
392 >>> import statsmodels.api as sm
393 >>> fam = sm.families.Poisson()
394 >>> ind = sm.cov_struct.Independence()
395 >>> model = sm.GEE.from_formula("y ~ age + trt + base", "subject", \
396 data, cov_struct=ind, family=fam)
397 >>> result = model.fit()
398 >>> print(result.summary())
400 Equivalent, using the formula API:
402 >>> import statsmodels.api as sm
403 >>> import statsmodels.formula.api as smf
404 >>> fam = sm.families.Poisson()
405 >>> ind = sm.cov_struct.Independence()
406 >>> model = smf.gee("y ~ age + trt + base", "subject", \
407 data, cov_struct=ind, family=fam)
408 >>> result = model.fit()
409 >>> print(result.summary())
410"""
412_gee_ordinal_example = """
413 Fit an ordinal regression model using GEE, with "global
414 odds ratio" dependence:
416 >>> import statsmodels.api as sm
417 >>> gor = sm.cov_struct.GlobalOddsRatio("ordinal")
418 >>> model = sm.OrdinalGEE(endog, exog, groups, cov_struct=gor)
419 >>> result = model.fit()
420 >>> print(result.summary())
422 Using formulas:
424 >>> import statsmodels.formula.api as smf
425 >>> model = smf.ordinal_gee("y ~ x1 + x2", groups, data,
426 cov_struct=gor)
427 >>> result = model.fit()
428 >>> print(result.summary())
429"""
431_gee_nominal_example = """
432 Fit a nominal regression model using GEE:
434 >>> import statsmodels.api as sm
435 >>> import statsmodels.formula.api as smf
436 >>> gor = sm.cov_struct.GlobalOddsRatio("nominal")
437 >>> model = sm.NominalGEE(endog, exog, groups, cov_struct=gor)
438 >>> result = model.fit()
439 >>> print(result.summary())
441 Using formulas:
443 >>> import statsmodels.api as sm
444 >>> model = sm.NominalGEE.from_formula("y ~ x1 + x2", groups,
445 data, cov_struct=gor)
446 >>> result = model.fit()
447 >>> print(result.summary())
449 Using the formula API:
451 >>> import statsmodels.formula.api as smf
452 >>> model = smf.nominal_gee("y ~ x1 + x2", groups, data,
453 cov_struct=gor)
454 >>> result = model.fit()
455 >>> print(result.summary())
456"""
459def _check_args(endog, exog, groups, time, offset, exposure):
461 if endog.size != exog.shape[0]:
462 raise ValueError("Leading dimension of 'exog' should match "
463 "length of 'endog'")
465 if groups.size != endog.size:
466 raise ValueError("'groups' and 'endog' should have the same size")
468 if time is not None and (time.size != endog.size):
469 raise ValueError("'time' and 'endog' should have the same size")
471 if offset is not None and (offset.size != endog.size):
472 raise ValueError("'offset and 'endog' should have the same size")
474 if exposure is not None and (exposure.size != endog.size):
475 raise ValueError("'exposure' and 'endog' should have the same size")
478class GEE(base.Model):
480 __doc__ = (
481 " Marginal Regression Model using Generalized Estimating "
482 "Equations.\n" + _gee_init_doc %
483 {'extra_params': base._missing_param_doc,
484 'family_doc': _gee_family_doc,
485 'example': _gee_example})
487 cached_means = None
489 def __init__(self, endog, exog, groups, time=None, family=None,
490 cov_struct=None, missing='none', offset=None,
491 exposure=None, dep_data=None, constraint=None,
492 update_dep=True, weights=None, **kwargs):
494 if family is not None:
495 if not isinstance(family.link, tuple(family.safe_links)):
496 import warnings
497 msg = ("The {0} link function does not respect the "
498 "domain of the {1} family.")
499 warnings.warn(msg.format(family.link.__class__.__name__,
500 family.__class__.__name__),
501 DomainWarning)
503 groups = np.asarray(groups) # in case groups is pandas
505 if "missing_idx" in kwargs and kwargs["missing_idx"] is not None:
506 # If here, we are entering from super.from_formula; missing
507 # has already been dropped from endog and exog, but not from
508 # the other variables.
509 ii = ~kwargs["missing_idx"]
510 groups = groups[ii]
511 if time is not None:
512 time = time[ii]
513 if offset is not None:
514 offset = offset[ii]
515 if exposure is not None:
516 exposure = exposure[ii]
517 del kwargs["missing_idx"]
519 _check_args(endog, exog, groups, time, offset, exposure)
521 self.missing = missing
522 self.dep_data = dep_data
523 self.constraint = constraint
524 self.update_dep = update_dep
526 self._fit_history = defaultdict(list)
528 # Pass groups, time, offset, and dep_data so they are
529 # processed for missing data along with endog and exog.
530 # Calling super creates self.exog, self.endog, etc. as
531 # ndarrays and the original exog, endog, etc. are
532 # self.data.endog, etc.
533 super(GEE, self).__init__(endog, exog, groups=groups,
534 time=time, offset=offset,
535 exposure=exposure, weights=weights,
536 dep_data=dep_data, missing=missing,
537 **kwargs)
539 self._init_keys.extend(["update_dep", "constraint", "family",
540 "cov_struct"])
542 # Handle the family argument
543 if family is None:
544 family = families.Gaussian()
545 else:
546 if not issubclass(family.__class__, families.Family):
547 raise ValueError("GEE: `family` must be a genmod "
548 "family instance")
549 self.family = family
551 # Handle the cov_struct argument
552 if cov_struct is None:
553 cov_struct = cov_structs.Independence()
554 else:
555 if not issubclass(cov_struct.__class__, cov_structs.CovStruct):
556 raise ValueError("GEE: `cov_struct` must be a genmod "
557 "cov_struct instance")
559 self.cov_struct = cov_struct
561 # Handle the offset and exposure
562 self._offset_exposure = None
563 if offset is not None:
564 self._offset_exposure = self.offset.copy()
565 self.offset = offset
566 if exposure is not None:
567 if not isinstance(self.family.link, families.links.Log):
568 raise ValueError(
569 "exposure can only be used with the log link function")
570 if self._offset_exposure is not None:
571 self._offset_exposure += np.log(exposure)
572 else:
573 self._offset_exposure = np.log(exposure)
574 self.exposure = exposure
576 # Handle the constraint
577 self.constraint = None
578 if constraint is not None:
579 if len(constraint) != 2:
580 raise ValueError("GEE: `constraint` must be a 2-tuple.")
581 if constraint[0].shape[1] != self.exog.shape[1]:
582 raise ValueError(
583 "GEE: the left hand side of the constraint must have "
584 "the same number of columns as the exog matrix.")
585 self.constraint = ParameterConstraint(constraint[0],
586 constraint[1],
587 self.exog)
589 if self._offset_exposure is not None:
590 self._offset_exposure += self.constraint.offset_increment()
591 else:
592 self._offset_exposure = (
593 self.constraint.offset_increment().copy())
594 self.exog = self.constraint.reduced_exog()
596 # Create list of row indices for each group
597 group_labels, ix = np.unique(self.groups, return_inverse=True)
598 se = pd.Series(index=np.arange(len(ix)), dtype="int")
599 gb = se.groupby(ix).groups
600 dk = [(lb, np.asarray(gb[k])) for k, lb in enumerate(group_labels)]
601 self.group_indices = dict(dk)
602 self.group_labels = group_labels
604 # Convert the data to the internal representation, which is a
605 # list of arrays, corresponding to the groups.
606 self.endog_li = self.cluster_list(self.endog)
607 self.exog_li = self.cluster_list(self.exog)
609 if self.weights is not None:
610 self.weights_li = self.cluster_list(self.weights)
611 self.weights_li = [x[0] for x in self.weights_li]
612 self.weights_li = np.asarray(self.weights_li)
614 self.num_group = len(self.endog_li)
616 # Time defaults to a 1d grid with equal spacing
617 if self.time is not None:
618 self.time = np.asarray(self.time, np.float64)
619 if self.time.ndim == 1:
620 self.time = self.time[:, None]
621 self.time_li = self.cluster_list(self.time)
622 else:
623 self.time_li = \
624 [np.arange(len(y), dtype=np.float64)[:, None]
625 for y in self.endog_li]
626 self.time = np.concatenate(self.time_li)
628 if self._offset_exposure is not None:
629 self.offset_li = self.cluster_list(self._offset_exposure)
630 else:
631 self.offset_li = None
632 if constraint is not None:
633 self.constraint.exog_fulltrans_li = \
634 self.cluster_list(self.constraint.exog_fulltrans)
636 self.family = family
638 self.cov_struct.initialize(self)
640 # Total sample size
641 group_ns = [len(y) for y in self.endog_li]
642 self.nobs = sum(group_ns)
643 # The following are column based, not on rank see #1928
644 self.df_model = self.exog.shape[1] - 1 # assumes constant
645 self.df_resid = self.nobs - self.exog.shape[1]
647 # Skip the covariance updates if all groups have a single
648 # observation (reduces to fitting a GLM).
649 maxgroup = max([len(x) for x in self.endog_li])
650 if maxgroup == 1:
651 self.update_dep = False
653 # Override to allow groups and time to be passed as variable
654 # names.
655 @classmethod
656 def from_formula(cls, formula, groups, data, subset=None,
657 time=None, offset=None, exposure=None,
658 *args, **kwargs):
659 """
660 Create a GEE model instance from a formula and dataframe.
662 Parameters
663 ----------
664 formula : str or generic Formula object
665 The formula specifying the model
666 groups : array_like or string
667 Array of grouping labels. If a string, this is the name
668 of a variable in `data` that contains the grouping labels.
669 data : array_like
670 The data for the model.
671 subset : array_like
672 An array-like object of booleans, integers, or index
673 values that indicate the subset of the data to used when
674 fitting the model.
675 time : array_like or string
676 The time values, used for dependence structures involving
677 distances between observations. If a string, this is the
678 name of a variable in `data` that contains the time
679 values.
680 offset : array_like or string
681 The offset values, added to the linear predictor. If a
682 string, this is the name of a variable in `data` that
683 contains the offset values.
684 exposure : array_like or string
685 The exposure values, only used if the link function is the
686 logarithm function, in which case the log of `exposure`
687 is added to the offset (if any). If a string, this is the
688 name of a variable in `data` that contains the offset
689 values.
690 %(missing_param_doc)s
691 args : extra arguments
692 These are passed to the model
693 kwargs : extra keyword arguments
694 These are passed to the model with two exceptions. `dep_data`
695 is processed as described below. The ``eval_env`` keyword is
696 passed to patsy. It can be either a
697 :class:`patsy:patsy.EvalEnvironment` object or an integer
698 indicating the depth of the namespace to use. For example, the
699 default ``eval_env=0`` uses the calling namespace.
700 If you wish to use a "clean" environment set ``eval_env=-1``.
702 Optional arguments
703 ------------------
704 dep_data : str or array_like
705 Data used for estimating the dependence structure. See
706 specific dependence structure classes (e.g. Nested) for
707 details. If `dep_data` is a string, it is interpreted as
708 a formula that is applied to `data`. If it is an array, it
709 must be an array of strings corresponding to column names in
710 `data`. Otherwise it must be an array-like with the same
711 number of rows as data.
713 Returns
714 -------
715 model : GEE model instance
717 Notes
718 -----
719 `data` must define __getitem__ with the keys in the formula
720 terms args and kwargs are passed on to the model
721 instantiation. E.g., a numpy structured or rec array, a
722 dictionary, or a pandas DataFrame.
723 """ % {'missing_param_doc': base._missing_param_doc}
725 groups_name = "Groups"
726 if isinstance(groups, str):
727 groups_name = groups
728 groups = data[groups]
730 if isinstance(time, str):
731 time = data[time]
733 if isinstance(offset, str):
734 offset = data[offset]
736 if isinstance(exposure, str):
737 exposure = data[exposure]
739 dep_data = kwargs.get("dep_data")
740 dep_data_names = None
741 if dep_data is not None:
742 if isinstance(dep_data, str):
743 dep_data = patsy.dmatrix(dep_data, data,
744 return_type='dataframe')
745 dep_data_names = dep_data.columns.tolist()
746 else:
747 dep_data_names = list(dep_data)
748 dep_data = data[dep_data]
749 kwargs["dep_data"] = np.asarray(dep_data)
751 model = super(GEE, cls).from_formula(formula, data=data, subset=subset,
752 groups=groups, time=time,
753 offset=offset,
754 exposure=exposure,
755 *args, **kwargs)
757 if dep_data_names is not None:
758 model._dep_data_names = dep_data_names
759 model._groups_name = groups_name
761 return model
763 def cluster_list(self, array):
764 """
765 Returns `array` split into subarrays corresponding to the
766 cluster structure.
767 """
769 if array.ndim == 1:
770 return [np.array(array[self.group_indices[k]])
771 for k in self.group_labels]
772 else:
773 return [np.array(array[self.group_indices[k], :])
774 for k in self.group_labels]
776 def compare_score_test(self, submodel):
777 """
778 Perform a score test for the given submodel against this model.
780 Parameters
781 ----------
782 submodel : GEEResults instance
783 A fitted GEE model that is a submodel of this model.
785 Returns
786 -------
787 A dictionary with keys "statistic", "p-value", and "df",
788 containing the score test statistic, its chi^2 p-value,
789 and the degrees of freedom used to compute the p-value.
791 Notes
792 -----
793 The score test can be performed without calling 'fit' on the
794 larger model. The provided submodel must be obtained from a
795 fitted GEE.
797 This method performs the same score test as can be obtained by
798 fitting the GEE with a linear constraint and calling `score_test`
799 on the results.
801 References
802 ----------
803 Xu Guo and Wei Pan (2002). "Small sample performance of the score
804 test in GEE".
805 http://www.sph.umn.edu/faculty1/wp-content/uploads/2012/11/rr2002-013.pdf
806 """
808 # Since the model has not been fit, its scaletype has not been
809 # set. So give it the scaletype of the submodel.
810 self.scaletype = submodel.model.scaletype
812 # Check consistency between model and submodel (not a comprehensive
813 # check)
814 submod = submodel.model
815 if self.exog.shape[0] != submod.exog.shape[0]:
816 msg = "Model and submodel have different numbers of cases."
817 raise ValueError(msg)
818 if self.exog.shape[1] == submod.exog.shape[1]:
819 msg = "Model and submodel have the same number of variables"
820 warnings.warn(msg)
821 if not isinstance(self.family, type(submod.family)):
822 msg = "Model and submodel have different GLM families."
823 warnings.warn(msg)
824 if not isinstance(self.cov_struct, type(submod.cov_struct)):
825 warnings.warn("Model and submodel have different GEE covariance "
826 "structures.")
827 if not np.equal(self.weights, submod.weights).all():
828 msg = "Model and submodel should have the same weights."
829 warnings.warn(msg)
831 # Get the positions of the submodel variables in the
832 # parent model
833 qm, qc = _score_test_submodel(self, submodel.model)
834 if qm is None:
835 msg = "The provided model is not a submodel."
836 raise ValueError(msg)
838 # Embed the submodel params into a params vector for the
839 # parent model
840 params_ex = np.dot(qm, submodel.params)
842 # Attempt to preserve the state of the parent model
843 cov_struct_save = self.cov_struct
844 import copy
845 cached_means_save = copy.deepcopy(self.cached_means)
847 # Get the score vector of the submodel params in
848 # the parent model
849 self.cov_struct = submodel.cov_struct
850 self.update_cached_means(params_ex)
851 _, score = self._update_mean_params()
852 if score is None:
853 msg = "Singular matrix encountered in GEE score test"
854 warnings.warn(msg, ConvergenceWarning)
855 return None
857 if not hasattr(self, "ddof_scale"):
858 self.ddof_scale = self.exog.shape[1]
860 if not hasattr(self, "scaling_factor"):
861 self.scaling_factor = 1
863 _, ncov1, cmat = self._covmat()
864 scale = self.estimate_scale()
865 cmat = cmat / scale ** 2
866 score2 = np.dot(qc.T, score) / scale
868 amat = np.linalg.inv(ncov1)
870 bmat_11 = np.dot(qm.T, np.dot(cmat, qm))
871 bmat_22 = np.dot(qc.T, np.dot(cmat, qc))
872 bmat_12 = np.dot(qm.T, np.dot(cmat, qc))
874 amat_11 = np.dot(qm.T, np.dot(amat, qm))
875 amat_12 = np.dot(qm.T, np.dot(amat, qc))
877 score_cov = bmat_22 - np.dot(amat_12.T,
878 np.linalg.solve(amat_11, bmat_12))
879 score_cov -= np.dot(bmat_12.T,
880 np.linalg.solve(amat_11, amat_12))
881 score_cov += np.dot(amat_12.T,
882 np.dot(np.linalg.solve(amat_11, bmat_11),
883 np.linalg.solve(amat_11, amat_12)))
885 # Attempt to restore state
886 self.cov_struct = cov_struct_save
887 self.cached_means = cached_means_save
889 from scipy.stats.distributions import chi2
890 score_statistic = np.dot(score2,
891 np.linalg.solve(score_cov, score2))
892 score_df = len(score2)
893 score_pvalue = 1 - chi2.cdf(score_statistic, score_df)
894 return {"statistic": score_statistic,
895 "df": score_df,
896 "p-value": score_pvalue}
898 def estimate_scale(self):
899 """
900 Estimate the dispersion/scale.
901 """
903 if self.scaletype is None:
904 if isinstance(self.family, (families.Binomial, families.Poisson,
905 families.NegativeBinomial,
906 _Multinomial)):
907 return 1.
908 elif isinstance(self.scaletype, float):
909 return np.array(self.scaletype)
911 endog = self.endog_li
912 cached_means = self.cached_means
913 nobs = self.nobs
914 varfunc = self.family.variance
916 scale = 0.
917 fsum = 0.
918 for i in range(self.num_group):
920 if len(endog[i]) == 0:
921 continue
923 expval, _ = cached_means[i]
925 f = self.weights_li[i] if self.weights is not None else 1.
927 sdev = np.sqrt(varfunc(expval))
928 resid = (endog[i] - expval) / sdev
930 scale += f * np.sum(resid ** 2)
931 fsum += f * len(endog[i])
933 scale /= (fsum * (nobs - self.ddof_scale) / float(nobs))
935 return scale
937 def mean_deriv(self, exog, lin_pred):
938 """
939 Derivative of the expected endog with respect to the parameters.
941 Parameters
942 ----------
943 exog : array_like
944 The exogeneous data at which the derivative is computed.
945 lin_pred : array_like
946 The values of the linear predictor.
948 Returns
949 -------
950 The value of the derivative of the expected endog with respect
951 to the parameter vector.
953 Notes
954 -----
955 If there is an offset or exposure, it should be added to
956 `lin_pred` prior to calling this function.
957 """
959 idl = self.family.link.inverse_deriv(lin_pred)
960 dmat = exog * idl[:, None]
961 return dmat
963 def mean_deriv_exog(self, exog, params, offset_exposure=None):
964 """
965 Derivative of the expected endog with respect to exog.
967 Parameters
968 ----------
969 exog : array_like
970 Values of the independent variables at which the derivative
971 is calculated.
972 params : array_like
973 Parameter values at which the derivative is calculated.
974 offset_exposure : array_like, optional
975 Combined offset and exposure.
977 Returns
978 -------
979 The derivative of the expected endog with respect to exog.
980 """
982 lin_pred = np.dot(exog, params)
983 if offset_exposure is not None:
984 lin_pred += offset_exposure
986 idl = self.family.link.inverse_deriv(lin_pred)
987 dmat = np.outer(idl, params)
988 return dmat
990 def _update_mean_params(self):
991 """
992 Returns
993 -------
994 update : array_like
995 The update vector such that params + update is the next
996 iterate when solving the score equations.
997 score : array_like
998 The current value of the score equations, not
999 incorporating the scale parameter. If desired,
1000 multiply this vector by the scale parameter to
1001 incorporate the scale.
1002 """
1004 endog = self.endog_li
1005 exog = self.exog_li
1007 cached_means = self.cached_means
1009 varfunc = self.family.variance
1011 bmat, score = 0, 0
1012 for i in range(self.num_group):
1014 expval, lpr = cached_means[i]
1015 resid = endog[i] - expval
1016 dmat = self.mean_deriv(exog[i], lpr)
1017 sdev = np.sqrt(varfunc(expval))
1019 rslt = self.cov_struct.covariance_matrix_solve(expval, i,
1020 sdev, (dmat, resid))
1021 if rslt is None:
1022 return None, None
1023 vinv_d, vinv_resid = tuple(rslt)
1025 f = self.weights_li[i] if self.weights is not None else 1.
1027 bmat += f * np.dot(dmat.T, vinv_d)
1028 score += f * np.dot(dmat.T, vinv_resid)
1030 update = np.linalg.solve(bmat, score)
1032 self._fit_history["cov_adjust"].append(
1033 self.cov_struct.cov_adjust)
1035 return update, score
1037 def update_cached_means(self, mean_params):
1038 """
1039 cached_means should always contain the most recent calculation
1040 of the group-wise mean vectors. This function should be
1041 called every time the regression parameters are changed, to
1042 keep the cached means up to date.
1043 """
1045 endog = self.endog_li
1046 exog = self.exog_li
1047 offset = self.offset_li
1049 linkinv = self.family.link.inverse
1051 self.cached_means = []
1053 for i in range(self.num_group):
1055 if len(endog[i]) == 0:
1056 continue
1058 lpr = np.dot(exog[i], mean_params)
1059 if offset is not None:
1060 lpr += offset[i]
1061 expval = linkinv(lpr)
1063 self.cached_means.append((expval, lpr))
1065 def _covmat(self):
1066 """
1067 Returns the sampling covariance matrix of the regression
1068 parameters and related quantities.
1070 Returns
1071 -------
1072 cov_robust : array_like
1073 The robust, or sandwich estimate of the covariance, which
1074 is meaningful even if the working covariance structure is
1075 incorrectly specified.
1076 cov_naive : array_like
1077 The model-based estimate of the covariance, which is
1078 meaningful if the covariance structure is correctly
1079 specified.
1080 cmat : array_like
1081 The center matrix of the sandwich expression, used in
1082 obtaining score test results.
1083 """
1085 endog = self.endog_li
1086 exog = self.exog_li
1087 varfunc = self.family.variance
1088 cached_means = self.cached_means
1090 # Calculate the naive (model-based) and robust (sandwich)
1091 # covariances.
1092 bmat, cmat = 0, 0
1093 for i in range(self.num_group):
1095 expval, lpr = cached_means[i]
1096 resid = endog[i] - expval
1097 dmat = self.mean_deriv(exog[i], lpr)
1098 sdev = np.sqrt(varfunc(expval))
1100 rslt = self.cov_struct.covariance_matrix_solve(
1101 expval, i, sdev, (dmat, resid))
1102 if rslt is None:
1103 return None, None, None, None
1104 vinv_d, vinv_resid = tuple(rslt)
1106 f = self.weights_li[i] if self.weights is not None else 1.
1108 bmat += f * np.dot(dmat.T, vinv_d)
1109 dvinv_resid = f * np.dot(dmat.T, vinv_resid)
1110 cmat += np.outer(dvinv_resid, dvinv_resid)
1112 scale = self.estimate_scale()
1114 bmati = np.linalg.inv(bmat)
1115 cov_naive = bmati * scale
1116 cov_robust = np.dot(bmati, np.dot(cmat, bmati))
1118 cov_naive *= self.scaling_factor
1119 cov_robust *= self.scaling_factor
1120 return cov_robust, cov_naive, cmat
1122 # Calculate the bias-corrected sandwich estimate of Mancl and
1123 # DeRouen.
1124 def _bc_covmat(self, cov_naive):
1126 cov_naive = cov_naive / self.scaling_factor
1127 endog = self.endog_li
1128 exog = self.exog_li
1129 varfunc = self.family.variance
1130 cached_means = self.cached_means
1131 scale = self.estimate_scale()
1133 bcm = 0
1134 for i in range(self.num_group):
1136 expval, lpr = cached_means[i]
1137 resid = endog[i] - expval
1138 dmat = self.mean_deriv(exog[i], lpr)
1139 sdev = np.sqrt(varfunc(expval))
1141 rslt = self.cov_struct.covariance_matrix_solve(
1142 expval, i, sdev, (dmat,))
1143 if rslt is None:
1144 return None
1145 vinv_d = rslt[0]
1146 vinv_d /= scale
1148 hmat = np.dot(vinv_d, cov_naive)
1149 hmat = np.dot(hmat, dmat.T).T
1151 f = self.weights_li[i] if self.weights is not None else 1.
1153 aresid = np.linalg.solve(np.eye(len(resid)) - hmat, resid)
1154 rslt = self.cov_struct.covariance_matrix_solve(
1155 expval, i, sdev, (aresid,))
1156 if rslt is None:
1157 return None
1158 srt = rslt[0]
1159 srt = f * np.dot(dmat.T, srt) / scale
1160 bcm += np.outer(srt, srt)
1162 cov_robust_bc = np.dot(cov_naive, np.dot(bcm, cov_naive))
1163 cov_robust_bc *= self.scaling_factor
1165 return cov_robust_bc
1167 def predict(self, params, exog=None, offset=None,
1168 exposure=None, linear=False):
1169 """
1170 Return predicted values for a marginal regression model fit
1171 using GEE.
1173 Parameters
1174 ----------
1175 params : array_like
1176 Parameters / coefficients of a marginal regression model.
1177 exog : array_like, optional
1178 Design / exogenous data. If exog is None, model exog is
1179 used.
1180 offset : array_like, optional
1181 Offset for exog if provided. If offset is None, model
1182 offset is used.
1183 exposure : array_like, optional
1184 Exposure for exog, if exposure is None, model exposure is
1185 used. Only allowed if link function is the logarithm.
1186 linear : bool
1187 If True, returns the linear predicted values. If False,
1188 returns the value of the inverse of the model's link
1189 function at the linear predicted values.
1191 Returns
1192 -------
1193 An array of fitted values
1195 Notes
1196 -----
1197 Using log(V) as the offset is equivalent to using V as the
1198 exposure. If exposure U and offset V are both provided, then
1199 log(U) + V is added to the linear predictor.
1200 """
1202 # TODO: many paths through this, not well covered in tests
1204 if exposure is not None:
1205 if not isinstance(self.family.link, families.links.Log):
1206 raise ValueError(
1207 "exposure can only be used with the log link function")
1209 # This is the combined offset and exposure
1210 _offset = 0.
1212 # Using model exog
1213 if exog is None:
1214 exog = self.exog
1216 if not isinstance(self.family.link, families.links.Log):
1217 # Do not need to worry about exposure
1218 if offset is None:
1219 if self._offset_exposure is not None:
1220 _offset = self._offset_exposure.copy()
1221 else:
1222 _offset = offset
1224 else:
1225 if offset is None and exposure is None:
1226 if self._offset_exposure is not None:
1227 _offset = self._offset_exposure
1228 elif offset is None and exposure is not None:
1229 _offset = np.log(exposure)
1230 if hasattr(self, "offset"):
1231 _offset = _offset + self.offset
1232 elif offset is not None and exposure is None:
1233 _offset = offset
1234 if hasattr(self, "exposure"):
1235 _offset = offset + np.log(self.exposure)
1236 else:
1237 _offset = offset + np.log(exposure)
1239 # exog is provided: this is simpler than above because we
1240 # never use model exog or exposure if exog is provided.
1241 else:
1242 if offset is not None:
1243 _offset = _offset + offset
1244 if exposure is not None:
1245 _offset += np.log(exposure)
1247 lin_pred = _offset + np.dot(exog, params)
1249 if not linear:
1250 return self.family.link.inverse(lin_pred)
1252 return lin_pred
1254 def _starting_params(self):
1256 model = GLM(self.endog, self.exog, family=self.family,
1257 offset=self._offset_exposure,
1258 freq_weights=self.weights)
1259 result = model.fit()
1260 return result.params
1262 @Appender(_gee_fit_doc)
1263 def fit(self, maxiter=60, ctol=1e-6, start_params=None,
1264 params_niter=1, first_dep_update=0,
1265 cov_type='robust', ddof_scale=None, scaling_factor=1.,
1266 scale=None):
1268 self.scaletype = scale
1270 # Subtract this number from the total sample size when
1271 # normalizing the scale parameter estimate.
1272 if ddof_scale is None:
1273 self.ddof_scale = self.exog.shape[1]
1274 else:
1275 if not ddof_scale >= 0:
1276 raise ValueError(
1277 "ddof_scale must be a non-negative number or None")
1278 self.ddof_scale = ddof_scale
1280 self.scaling_factor = scaling_factor
1282 self._fit_history = defaultdict(list)
1284 if self.weights is not None and cov_type == 'naive':
1285 raise ValueError("when using weights, cov_type may not be naive")
1287 if start_params is None:
1288 mean_params = self._starting_params()
1289 else:
1290 start_params = np.asarray(start_params)
1291 mean_params = start_params.copy()
1293 self.update_cached_means(mean_params)
1295 del_params = -1.
1296 num_assoc_updates = 0
1297 for itr in range(maxiter):
1299 update, score = self._update_mean_params()
1300 if update is None:
1301 warnings.warn("Singular matrix encountered in GEE update",
1302 ConvergenceWarning)
1303 break
1304 mean_params += update
1305 self.update_cached_means(mean_params)
1307 # L2 norm of the change in mean structure parameters at
1308 # this iteration.
1309 del_params = np.sqrt(np.sum(score ** 2))
1311 self._fit_history['params'].append(mean_params.copy())
1312 self._fit_history['score'].append(score)
1313 self._fit_history['dep_params'].append(
1314 self.cov_struct.dep_params)
1316 # Do not exit until the association parameters have been
1317 # updated at least once.
1318 if (del_params < ctol and
1319 (num_assoc_updates > 0 or self.update_dep is False)):
1320 break
1322 # Update the dependence structure
1323 if (self.update_dep and (itr % params_niter) == 0
1324 and (itr >= first_dep_update)):
1325 self._update_assoc(mean_params)
1326 num_assoc_updates += 1
1328 if del_params >= ctol:
1329 warnings.warn("Iteration limit reached prior to convergence",
1330 IterationLimitWarning)
1332 if mean_params is None:
1333 warnings.warn("Unable to estimate GEE parameters.",
1334 ConvergenceWarning)
1335 return None
1337 bcov, ncov, _ = self._covmat()
1338 if bcov is None:
1339 warnings.warn("Estimated covariance structure for GEE "
1340 "estimates is singular", ConvergenceWarning)
1341 return None
1342 bc_cov = None
1343 if cov_type == "bias_reduced":
1344 bc_cov = self._bc_covmat(ncov)
1346 if self.constraint is not None:
1347 x = mean_params.copy()
1348 mean_params, bcov = self._handle_constraint(mean_params, bcov)
1349 if mean_params is None:
1350 warnings.warn("Unable to estimate constrained GEE "
1351 "parameters.", ConvergenceWarning)
1352 return None
1354 y, ncov = self._handle_constraint(x, ncov)
1355 if y is None:
1356 warnings.warn("Unable to estimate constrained GEE "
1357 "parameters.", ConvergenceWarning)
1358 return None
1360 if bc_cov is not None:
1361 y, bc_cov = self._handle_constraint(x, bc_cov)
1362 if x is None:
1363 warnings.warn("Unable to estimate constrained GEE "
1364 "parameters.", ConvergenceWarning)
1365 return None
1367 scale = self.estimate_scale()
1369 # kwargs to add to results instance, need to be available in __init__
1370 res_kwds = dict(cov_type=cov_type,
1371 cov_robust=bcov,
1372 cov_naive=ncov,
1373 cov_robust_bc=bc_cov)
1375 # The superclass constructor will multiply the covariance
1376 # matrix argument bcov by scale, which we do not want, so we
1377 # divide bcov by the scale parameter here
1378 results = GEEResults(self, mean_params, bcov / scale, scale,
1379 cov_type=cov_type, use_t=False,
1380 attr_kwds=res_kwds)
1382 # attributes not needed during results__init__
1383 results.fit_history = self._fit_history
1384 self.fit_history = defaultdict(list)
1385 results.score_norm = del_params
1386 results.converged = (del_params < ctol)
1387 results.cov_struct = self.cov_struct
1388 results.params_niter = params_niter
1389 results.first_dep_update = first_dep_update
1390 results.ctol = ctol
1391 results.maxiter = maxiter
1393 # These will be copied over to subclasses when upgrading.
1394 results._props = ["cov_type", "use_t",
1395 "cov_params_default", "cov_robust",
1396 "cov_naive", "cov_robust_bc",
1397 "fit_history",
1398 "score_norm", "converged", "cov_struct",
1399 "params_niter", "first_dep_update", "ctol",
1400 "maxiter"]
1402 return GEEResultsWrapper(results)
1404 def _update_regularized(self, params, pen_wt, scad_param, eps):
1406 sn, hm = 0, 0
1408 for i in range(self.num_group):
1410 expval, _ = self.cached_means[i]
1411 resid = self.endog_li[i] - expval
1412 sdev = np.sqrt(self.family.variance(expval))
1414 ex = self.exog_li[i] * sdev[:, None]**2
1415 rslt = self.cov_struct.covariance_matrix_solve(
1416 expval, i, sdev, (resid, ex))
1417 sn0 = rslt[0]
1418 sn += np.dot(ex.T, sn0)
1419 hm0 = rslt[1]
1420 hm += np.dot(ex.T, hm0)
1422 # Wang et al. divide sn here by num_group, but that
1423 # seems to be incorrect
1425 ap = np.abs(params)
1426 clipped = np.clip(scad_param * pen_wt - ap, 0, np.inf)
1427 en = pen_wt * clipped * (ap > pen_wt)
1428 en /= (scad_param - 1) * pen_wt
1429 en += pen_wt * (ap <= pen_wt)
1430 en /= eps + ap
1432 hm.flat[::hm.shape[0] + 1] += self.num_group * en
1433 sn -= self.num_group * en * params
1434 update = np.linalg.solve(hm, sn)
1435 hm *= self.estimate_scale()
1437 return update, hm
1439 def _regularized_covmat(self, mean_params):
1441 self.update_cached_means(mean_params)
1443 ma = 0
1445 for i in range(self.num_group):
1447 expval, _ = self.cached_means[i]
1448 resid = self.endog_li[i] - expval
1449 sdev = np.sqrt(self.family.variance(expval))
1451 ex = self.exog_li[i] * sdev[:, None]**2
1452 rslt = self.cov_struct.covariance_matrix_solve(
1453 expval, i, sdev, (resid,))
1454 ma0 = np.dot(ex.T, rslt[0])
1455 ma += np.outer(ma0, ma0)
1457 return ma
1459 def fit_regularized(self, pen_wt, scad_param=3.7, maxiter=100,
1460 ddof_scale=None, update_assoc=5,
1461 ctol=1e-5, ztol=1e-3, eps=1e-6, scale=None):
1462 """
1463 Regularized estimation for GEE.
1465 Parameters
1466 ----------
1467 pen_wt : float
1468 The penalty weight (a non-negative scalar).
1469 scad_param : float
1470 Non-negative scalar determining the shape of the Scad
1471 penalty.
1472 maxiter : int
1473 The maximum number of iterations.
1474 ddof_scale : int
1475 Value to subtract from `nobs` when calculating the
1476 denominator degrees of freedom for t-statistics, defaults
1477 to the number of columns in `exog`.
1478 update_assoc : int
1479 The dependence parameters are updated every `update_assoc`
1480 iterations of the mean structure parameter updates.
1481 ctol : float
1482 Convergence criterion, default is one order of magnitude
1483 smaller than proposed in section 3.1 of Wang et al.
1484 ztol : float
1485 Coefficients smaller than this value are treated as
1486 being zero, default is based on section 5 of Wang et al.
1487 eps : non-negative scalar
1488 Numerical constant, see section 3.2 of Wang et al.
1489 scale : float or string
1490 If a float, this value is used as the scale parameter.
1491 If "X2", the scale parameter is always estimated using
1492 Pearson's chi-square method (e.g. as in a quasi-Poisson
1493 analysis). If None, the default approach for the family
1494 is used to estimate the scale parameter.
1496 Returns
1497 -------
1498 GEEResults instance. Note that not all methods of the results
1499 class make sense when the model has been fit with regularization.
1501 Notes
1502 -----
1503 This implementation assumes that the link is canonical.
1505 References
1506 ----------
1507 Wang L, Zhou J, Qu A. (2012). Penalized generalized estimating
1508 equations for high-dimensional longitudinal data analysis.
1509 Biometrics. 2012 Jun;68(2):353-60.
1510 doi: 10.1111/j.1541-0420.2011.01678.x.
1511 https://www.ncbi.nlm.nih.gov/pubmed/21955051
1512 http://users.stat.umn.edu/~wangx346/research/GEE_selection.pdf
1513 """
1515 self.scaletype = scale
1517 mean_params = np.zeros(self.exog.shape[1])
1518 self.update_cached_means(mean_params)
1519 converged = False
1520 fit_history = defaultdict(list)
1522 # Subtract this number from the total sample size when
1523 # normalizing the scale parameter estimate.
1524 if ddof_scale is None:
1525 self.ddof_scale = self.exog.shape[1]
1526 else:
1527 if not ddof_scale >= 0:
1528 raise ValueError(
1529 "ddof_scale must be a non-negative number or None")
1530 self.ddof_scale = ddof_scale
1532 # Keep this private for now. In some cases the early steps are
1533 # very small so it seems necessary to ensure a certain minimum
1534 # number of iterations before testing for convergence.
1535 miniter = 20
1537 for itr in range(maxiter):
1539 update, hm = self._update_regularized(
1540 mean_params, pen_wt, scad_param, eps)
1541 if update is None:
1542 msg = "Singular matrix encountered in regularized GEE update",
1543 warnings.warn(msg, ConvergenceWarning)
1544 break
1545 if itr > miniter and np.sqrt(np.sum(update**2)) < ctol:
1546 converged = True
1547 break
1548 mean_params += update
1549 fit_history['params'].append(mean_params.copy())
1550 self.update_cached_means(mean_params)
1552 if itr != 0 and (itr % update_assoc == 0):
1553 self._update_assoc(mean_params)
1555 if not converged:
1556 msg = "GEE.fit_regularized did not converge"
1557 warnings.warn(msg)
1559 mean_params[np.abs(mean_params) < ztol] = 0
1561 self._update_assoc(mean_params)
1562 ma = self._regularized_covmat(mean_params)
1563 cov = np.linalg.solve(hm, ma)
1564 cov = np.linalg.solve(hm, cov.T)
1566 # kwargs to add to results instance, need to be available in __init__
1567 res_kwds = dict(cov_type="robust", cov_robust=cov)
1569 scale = self.estimate_scale()
1570 rslt = GEEResults(self, mean_params, cov, scale,
1571 regularized=True, attr_kwds=res_kwds)
1572 rslt.fit_history = fit_history
1574 return GEEResultsWrapper(rslt)
1576 def _handle_constraint(self, mean_params, bcov):
1577 """
1578 Expand the parameter estimate `mean_params` and covariance matrix
1579 `bcov` to the coordinate system of the unconstrained model.
1581 Parameters
1582 ----------
1583 mean_params : array_like
1584 A parameter vector estimate for the reduced model.
1585 bcov : array_like
1586 The covariance matrix of mean_params.
1588 Returns
1589 -------
1590 mean_params : array_like
1591 The input parameter vector mean_params, expanded to the
1592 coordinate system of the full model
1593 bcov : array_like
1594 The input covariance matrix bcov, expanded to the
1595 coordinate system of the full model
1596 """
1598 # The number of variables in the full model
1599 red_p = len(mean_params)
1600 full_p = self.constraint.lhs.shape[1]
1601 mean_params0 = np.r_[mean_params, np.zeros(full_p - red_p)]
1603 # Get the score vector under the full model.
1604 save_exog_li = self.exog_li
1605 self.exog_li = self.constraint.exog_fulltrans_li
1606 import copy
1607 save_cached_means = copy.deepcopy(self.cached_means)
1608 self.update_cached_means(mean_params0)
1609 _, score = self._update_mean_params()
1611 if score is None:
1612 warnings.warn("Singular matrix encountered in GEE score test",
1613 ConvergenceWarning)
1614 return None, None
1616 _, ncov1, cmat = self._covmat()
1617 scale = self.estimate_scale()
1618 cmat = cmat / scale ** 2
1619 score2 = score[red_p:] / scale
1620 amat = np.linalg.inv(ncov1)
1622 bmat_11 = cmat[0:red_p, 0:red_p]
1623 bmat_22 = cmat[red_p:, red_p:]
1624 bmat_12 = cmat[0:red_p, red_p:]
1625 amat_11 = amat[0:red_p, 0:red_p]
1626 amat_12 = amat[0:red_p, red_p:]
1628 score_cov = bmat_22 - np.dot(amat_12.T,
1629 np.linalg.solve(amat_11, bmat_12))
1630 score_cov -= np.dot(bmat_12.T,
1631 np.linalg.solve(amat_11, amat_12))
1632 score_cov += np.dot(amat_12.T,
1633 np.dot(np.linalg.solve(amat_11, bmat_11),
1634 np.linalg.solve(amat_11, amat_12)))
1636 from scipy.stats.distributions import chi2
1637 score_statistic = np.dot(score2,
1638 np.linalg.solve(score_cov, score2))
1639 score_df = len(score2)
1640 score_pvalue = 1 - chi2.cdf(score_statistic, score_df)
1641 self.score_test_results = {"statistic": score_statistic,
1642 "df": score_df,
1643 "p-value": score_pvalue}
1645 mean_params = self.constraint.unpack_param(mean_params)
1646 bcov = self.constraint.unpack_cov(bcov)
1648 self.exog_li = save_exog_li
1649 self.cached_means = save_cached_means
1650 self.exog = self.constraint.restore_exog()
1652 return mean_params, bcov
1654 def _update_assoc(self, params):
1655 """
1656 Update the association parameters
1657 """
1659 self.cov_struct.update(params)
1661 def _derivative_exog(self, params, exog=None, transform='dydx',
1662 dummy_idx=None, count_idx=None):
1663 """
1664 For computing marginal effects, returns dF(XB) / dX where F(.)
1665 is the fitted mean.
1667 transform can be 'dydx', 'dyex', 'eydx', or 'eyex'.
1669 Not all of these make sense in the presence of discrete regressors,
1670 but checks are done in the results in get_margeff.
1671 """
1672 # This form should be appropriate for group 1 probit, logit,
1673 # logistic, cloglog, heckprob, xtprobit.
1674 offset_exposure = None
1675 if exog is None:
1676 exog = self.exog
1677 offset_exposure = self._offset_exposure
1679 margeff = self.mean_deriv_exog(exog, params, offset_exposure)
1681 if 'ex' in transform:
1682 margeff *= exog
1683 if 'ey' in transform:
1684 margeff /= self.predict(params, exog)[:, None]
1685 if count_idx is not None:
1686 from statsmodels.discrete.discrete_margins import (
1687 _get_count_effects)
1688 margeff = _get_count_effects(margeff, exog, count_idx, transform,
1689 self, params)
1690 if dummy_idx is not None:
1691 from statsmodels.discrete.discrete_margins import (
1692 _get_dummy_effects)
1693 margeff = _get_dummy_effects(margeff, exog, dummy_idx, transform,
1694 self, params)
1695 return margeff
1697 def qic(self, params, scale, cov_params):
1698 """
1699 Returns quasi-information criteria and quasi-likelihood values.
1701 Parameters
1702 ----------
1703 params : array_like
1704 The GEE estimates of the regression parameters.
1705 scale : scalar
1706 Estimated scale parameter
1707 cov_params : array_like
1708 An estimate of the covariance matrix for the
1709 model parameters. Conventionally this is the robust
1710 covariance matrix.
1712 Returns
1713 -------
1714 ql : scalar
1715 The quasi-likelihood value
1716 qic : scalar
1717 A QIC that can be used to compare the mean and covariance
1718 structures of the model.
1719 qicu : scalar
1720 A simplified QIC that can be used to compare mean structures
1721 but not covariance structures
1723 Notes
1724 -----
1725 The quasi-likelihood used here is obtained by numerically evaluating
1726 Wedderburn's integral representation of the quasi-likelihood function.
1727 This approach is valid for all families and links. Many other
1728 packages use analytical expressions for quasi-likelihoods that are
1729 valid in special cases where the link function is canonical. These
1730 analytical expressions may omit additive constants that only depend
1731 on the data. Therefore, the numerical values of our QL and QIC values
1732 will differ from the values reported by other packages. However only
1733 the differences between two QIC values calculated for different models
1734 using the same data are meaningful. Our QIC should produce the same
1735 QIC differences as other software.
1737 When using the QIC for models with unknown scale parameter, use a
1738 common estimate of the scale parameter for all models being compared.
1740 References
1741 ----------
1742 .. [*] W. Pan (2001). Akaike's information criterion in generalized
1743 estimating equations. Biometrics (57) 1.
1744 """
1746 varfunc = self.family.variance
1748 means = []
1749 omega = 0.0
1750 # omega^-1 is the model-based covariance assuming independence
1752 for i in range(self.num_group):
1753 expval, lpr = self.cached_means[i]
1754 means.append(expval)
1755 dmat = self.mean_deriv(self.exog_li[i], lpr)
1756 omega += np.dot(dmat.T, dmat) / scale
1758 means = np.concatenate(means)
1760 # The quasi-likelihood, use change of variables so the integration is
1761 # from -1 to 1.
1762 du = means - self.endog
1763 nstep = 10000
1764 qv = np.empty(nstep)
1765 xv = np.linspace(-0.99999, 1, nstep)
1766 for i, g in enumerate(xv):
1767 u = self.endog + (g + 1) * du / 2.0
1768 vu = varfunc(u)
1769 qv[i] = -np.sum(du**2 * (g + 1) / vu)
1770 qv /= (4 * scale)
1772 from scipy.integrate import trapz
1773 ql = trapz(qv, dx=xv[1] - xv[0])
1775 qicu = -2 * ql + 2 * self.exog.shape[1]
1776 qic = -2 * ql + 2 * np.trace(np.dot(omega, cov_params))
1778 return ql, qic, qicu
1781class GEEResults(base.LikelihoodModelResults):
1783 __doc__ = (
1784 "This class summarizes the fit of a marginal regression model "
1785 "using GEE.\n" + _gee_results_doc)
1787 def __init__(self, model, params, cov_params, scale,
1788 cov_type='robust', use_t=False, regularized=False,
1789 **kwds):
1791 super(GEEResults, self).__init__(
1792 model, params, normalized_cov_params=cov_params,
1793 scale=scale)
1795 # not added by super
1796 self.df_resid = model.df_resid
1797 self.df_model = model.df_model
1798 self.family = model.family
1800 attr_kwds = kwds.pop('attr_kwds', {})
1801 self.__dict__.update(attr_kwds)
1803 # we do not do this if the cov_type has already been set
1804 # subclasses can set it through attr_kwds
1805 if not (hasattr(self, 'cov_type') and
1806 hasattr(self, 'cov_params_default')):
1807 self.cov_type = cov_type # keep alias
1808 covariance_type = self.cov_type.lower()
1809 allowed_covariances = ["robust", "naive", "bias_reduced"]
1810 if covariance_type not in allowed_covariances:
1811 msg = ("GEE: `cov_type` must be one of " +
1812 ", ".join(allowed_covariances))
1813 raise ValueError(msg)
1815 if cov_type == "robust":
1816 cov = self.cov_robust
1817 elif cov_type == "naive":
1818 cov = self.cov_naive
1819 elif cov_type == "bias_reduced":
1820 cov = self.cov_robust_bc
1822 self.cov_params_default = cov
1823 else:
1824 if self.cov_type != cov_type:
1825 raise ValueError('cov_type in argument is different from '
1826 'already attached cov_type')
1828 def standard_errors(self, cov_type="robust"):
1829 """
1830 This is a convenience function that returns the standard
1831 errors for any covariance type. The value of `bse` is the
1832 standard errors for whichever covariance type is specified as
1833 an argument to `fit` (defaults to "robust").
1835 Parameters
1836 ----------
1837 cov_type : str
1838 One of "robust", "naive", or "bias_reduced". Determines
1839 the covariance used to compute standard errors. Defaults
1840 to "robust".
1841 """
1843 # Check covariance_type
1844 covariance_type = cov_type.lower()
1845 allowed_covariances = ["robust", "naive", "bias_reduced"]
1846 if covariance_type not in allowed_covariances:
1847 msg = ("GEE: `covariance_type` must be one of " +
1848 ", ".join(allowed_covariances))
1849 raise ValueError(msg)
1851 if covariance_type == "robust":
1852 return np.sqrt(np.diag(self.cov_robust))
1853 elif covariance_type == "naive":
1854 return np.sqrt(np.diag(self.cov_naive))
1855 elif covariance_type == "bias_reduced":
1856 if self.cov_robust_bc is None:
1857 raise ValueError(
1858 "GEE: `bias_reduced` covariance not available")
1859 return np.sqrt(np.diag(self.cov_robust_bc))
1861 # Need to override to allow for different covariance types.
1862 @cache_readonly
1863 def bse(self):
1864 return self.standard_errors(self.cov_type)
1866 @cache_readonly
1867 def resid(self):
1868 """
1869 Returns the residuals, the endogeneous data minus the fitted
1870 values from the model.
1871 """
1872 return self.model.endog - self.fittedvalues
1874 def score_test(self):
1875 """
1876 Return the results of a score test for a linear constraint.
1878 Returns
1879 -------
1880 Adictionary containing the p-value, the test statistic,
1881 and the degrees of freedom for the score test.
1883 Notes
1884 -----
1885 See also GEE.compare_score_test for an alternative way to perform
1886 a score test. GEEResults.score_test is more general, in that it
1887 supports testing arbitrary linear equality constraints. However
1888 GEE.compare_score_test might be easier to use when comparing
1889 two explicit models.
1891 References
1892 ----------
1893 Xu Guo and Wei Pan (2002). "Small sample performance of the score
1894 test in GEE".
1895 http://www.sph.umn.edu/faculty1/wp-content/uploads/2012/11/rr2002-013.pdf
1896 """
1898 if not hasattr(self.model, "score_test_results"):
1899 msg = "score_test on results instance only available when "
1900 msg += " model was fit with constraints"
1901 raise ValueError(msg)
1903 return self.model.score_test_results
1905 @cache_readonly
1906 def resid_split(self):
1907 """
1908 Returns the residuals, the endogeneous data minus the fitted
1909 values from the model. The residuals are returned as a list
1910 of arrays containing the residuals for each cluster.
1911 """
1912 sresid = []
1913 for v in self.model.group_labels:
1914 ii = self.model.group_indices[v]
1915 sresid.append(self.resid[ii])
1916 return sresid
1918 @cache_readonly
1919 def resid_centered(self):
1920 """
1921 Returns the residuals centered within each group.
1922 """
1923 cresid = self.resid.copy()
1924 for v in self.model.group_labels:
1925 ii = self.model.group_indices[v]
1926 cresid[ii] -= cresid[ii].mean()
1927 return cresid
1929 @cache_readonly
1930 def resid_centered_split(self):
1931 """
1932 Returns the residuals centered within each group. The
1933 residuals are returned as a list of arrays containing the
1934 centered residuals for each cluster.
1935 """
1936 sresid = []
1937 for v in self.model.group_labels:
1938 ii = self.model.group_indices[v]
1939 sresid.append(self.centered_resid[ii])
1940 return sresid
1942 def qic(self, scale=None):
1943 """
1944 Returns the QIC and QICu information criteria.
1946 For families with a scale parameter (e.g. Gaussian), provide
1947 as the scale argument the estimated scale from the largest
1948 model under consideration.
1950 If the scale parameter is not provided, the estimated scale
1951 parameter is used. Doing this does not allow comparisons of
1952 QIC values between models.
1953 """
1955 # It is easy to forget to set the scale parameter. Sometimes
1956 # this is intentional, so we warn.
1957 if scale is None:
1958 warnings.warn("QIC values obtained using scale=None are not "
1959 "appropriate for comparing models")
1961 if scale is None:
1962 scale = self.scale
1964 _, qic, qicu = self.model.qic(self.params, scale,
1965 self.cov_params())
1967 return qic, qicu
1969 # FIXME: alias to be removed, temporary backwards compatibility
1970 split_resid = resid_split
1971 centered_resid = resid_centered
1972 split_centered_resid = resid_centered_split
1974 @cache_readonly
1975 def resid_response(self):
1976 return self.model.endog - self.fittedvalues
1978 @cache_readonly
1979 def resid_pearson(self):
1980 val = self.model.endog - self.fittedvalues
1981 val = val / np.sqrt(self.family.variance(self.fittedvalues))
1982 return val
1984 @cache_readonly
1985 def resid_working(self):
1986 val = self.resid_response
1987 val = val * self.family.link.deriv(self.fittedvalues)
1988 return val
1990 @cache_readonly
1991 def resid_anscombe(self):
1992 return self.family.resid_anscombe(self.model.endog, self.fittedvalues)
1994 @cache_readonly
1995 def resid_deviance(self):
1996 return self.family.resid_dev(self.model.endog, self.fittedvalues)
1998 @cache_readonly
1999 def fittedvalues(self):
2000 """
2001 Returns the fitted values from the model.
2002 """
2003 return self.model.family.link.inverse(np.dot(self.model.exog,
2004 self.params))
2006 @Appender(_plot_added_variable_doc % {'extra_params_doc': ''})
2007 def plot_added_variable(self, focus_exog, resid_type=None,
2008 use_glm_weights=True, fit_kwargs=None,
2009 ax=None):
2011 from statsmodels.graphics.regressionplots import plot_added_variable
2013 fig = plot_added_variable(self, focus_exog,
2014 resid_type=resid_type,
2015 use_glm_weights=use_glm_weights,
2016 fit_kwargs=fit_kwargs, ax=ax)
2018 return fig
2020 @Appender(_plot_partial_residuals_doc % {'extra_params_doc': ''})
2021 def plot_partial_residuals(self, focus_exog, ax=None):
2023 from statsmodels.graphics.regressionplots import plot_partial_residuals
2025 return plot_partial_residuals(self, focus_exog, ax=ax)
2027 @Appender(_plot_ceres_residuals_doc % {'extra_params_doc': ''})
2028 def plot_ceres_residuals(self, focus_exog, frac=0.66, cond_means=None,
2029 ax=None):
2031 from statsmodels.graphics.regressionplots import plot_ceres_residuals
2033 return plot_ceres_residuals(self, focus_exog, frac,
2034 cond_means=cond_means, ax=ax)
2036 def conf_int(self, alpha=.05, cols=None, cov_type=None):
2037 """
2038 Returns confidence intervals for the fitted parameters.
2040 Parameters
2041 ----------
2042 alpha : float, optional
2043 The `alpha` level for the confidence interval. i.e., The
2044 default `alpha` = .05 returns a 95% confidence interval.
2045 cols : array_like, optional
2046 `cols` specifies which confidence intervals to return
2047 cov_type : str
2048 The covariance type used for computing standard errors;
2049 must be one of 'robust', 'naive', and 'bias reduced'.
2050 See `GEE` for details.
2052 Notes
2053 -----
2054 The confidence interval is based on the Gaussian distribution.
2055 """
2056 # super does not allow to specify cov_type and method is not
2057 # implemented,
2058 # FIXME: remove this method here
2059 if cov_type is None:
2060 bse = self.bse
2061 else:
2062 bse = self.standard_errors(cov_type=cov_type)
2063 params = self.params
2064 dist = stats.norm
2065 q = dist.ppf(1 - alpha / 2)
2067 if cols is None:
2068 lower = self.params - q * bse
2069 upper = self.params + q * bse
2070 else:
2071 cols = np.asarray(cols)
2072 lower = params[cols] - q * bse[cols]
2073 upper = params[cols] + q * bse[cols]
2074 return np.asarray(lzip(lower, upper))
2076 def summary(self, yname=None, xname=None, title=None, alpha=.05):
2077 """
2078 Summarize the GEE regression results
2080 Parameters
2081 ----------
2082 yname : str, optional
2083 Default is `y`
2084 xname : list[str], optional
2085 Names for the exogenous variables, default is `var_#` for ## in
2086 the number of regressors. Must match the number of parameters in
2087 the model
2088 title : str, optional
2089 Title for the top table. If not None, then this replaces
2090 the default title
2091 alpha : float
2092 significance level for the confidence intervals
2093 cov_type : str
2094 The covariance type used to compute the standard errors;
2095 one of 'robust' (the usual robust sandwich-type covariance
2096 estimate), 'naive' (ignores dependence), and 'bias
2097 reduced' (the Mancl/DeRouen estimate).
2099 Returns
2100 -------
2101 smry : Summary instance
2102 this holds the summary tables and text, which can be
2103 printed or converted to various output formats.
2105 See Also
2106 --------
2107 statsmodels.iolib.summary.Summary : class to hold summary results
2108 """
2110 top_left = [('Dep. Variable:', None),
2111 ('Model:', None),
2112 ('Method:', ['Generalized']),
2113 ('', ['Estimating Equations']),
2114 ('Family:', [self.model.family.__class__.__name__]),
2115 ('Dependence structure:',
2116 [self.model.cov_struct.__class__.__name__]),
2117 ('Date:', None),
2118 ('Covariance type: ', [self.cov_type, ])
2119 ]
2121 NY = [len(y) for y in self.model.endog_li]
2123 top_right = [('No. Observations:', [sum(NY)]),
2124 ('No. clusters:', [len(self.model.endog_li)]),
2125 ('Min. cluster size:', [min(NY)]),
2126 ('Max. cluster size:', [max(NY)]),
2127 ('Mean cluster size:', ["%.1f" % np.mean(NY)]),
2128 ('Num. iterations:', ['%d' %
2129 len(self.fit_history['params'])]),
2130 ('Scale:', ["%.3f" % self.scale]),
2131 ('Time:', None),
2132 ]
2134 # The skew of the residuals
2135 skew1 = stats.skew(self.resid)
2136 kurt1 = stats.kurtosis(self.resid)
2137 skew2 = stats.skew(self.centered_resid)
2138 kurt2 = stats.kurtosis(self.centered_resid)
2140 diagn_left = [('Skew:', ["%12.4f" % skew1]),
2141 ('Centered skew:', ["%12.4f" % skew2])]
2143 diagn_right = [('Kurtosis:', ["%12.4f" % kurt1]),
2144 ('Centered kurtosis:', ["%12.4f" % kurt2])
2145 ]
2147 if title is None:
2148 title = self.model.__class__.__name__ + ' ' +\
2149 "Regression Results"
2151 # Override the exog variable names if xname is provided as an
2152 # argument.
2153 if xname is None:
2154 xname = self.model.exog_names
2156 if yname is None:
2157 yname = self.model.endog_names
2159 # Create summary table instance
2160 from statsmodels.iolib.summary import Summary
2161 smry = Summary()
2162 smry.add_table_2cols(self, gleft=top_left, gright=top_right,
2163 yname=yname, xname=xname,
2164 title=title)
2165 smry.add_table_params(self, yname=yname, xname=xname,
2166 alpha=alpha, use_t=False)
2167 smry.add_table_2cols(self, gleft=diagn_left,
2168 gright=diagn_right, yname=yname,
2169 xname=xname, title="")
2171 return smry
2173 def get_margeff(self, at='overall', method='dydx', atexog=None,
2174 dummy=False, count=False):
2175 """Get marginal effects of the fitted model.
2177 Parameters
2178 ----------
2179 at : str, optional
2180 Options are:
2182 - 'overall', The average of the marginal effects at each
2183 observation.
2184 - 'mean', The marginal effects at the mean of each regressor.
2185 - 'median', The marginal effects at the median of each regressor.
2186 - 'zero', The marginal effects at zero for each regressor.
2187 - 'all', The marginal effects at each observation. If `at` is 'all'
2188 only margeff will be available.
2190 Note that if `exog` is specified, then marginal effects for all
2191 variables not specified by `exog` are calculated using the `at`
2192 option.
2193 method : str, optional
2194 Options are:
2196 - 'dydx' - dy/dx - No transformation is made and marginal effects
2197 are returned. This is the default.
2198 - 'eyex' - estimate elasticities of variables in `exog` --
2199 d(lny)/d(lnx)
2200 - 'dyex' - estimate semi-elasticity -- dy/d(lnx)
2201 - 'eydx' - estimate semi-elasticity -- d(lny)/dx
2203 Note that tranformations are done after each observation is
2204 calculated. Semi-elasticities for binary variables are computed
2205 using the midpoint method. 'dyex' and 'eyex' do not make sense
2206 for discrete variables.
2207 atexog : array_like, optional
2208 Optionally, you can provide the exogenous variables over which to
2209 get the marginal effects. This should be a dictionary with the key
2210 as the zero-indexed column number and the value of the dictionary.
2211 Default is None for all independent variables less the constant.
2212 dummy : bool, optional
2213 If False, treats binary variables (if present) as continuous. This
2214 is the default. Else if True, treats binary variables as
2215 changing from 0 to 1. Note that any variable that is either 0 or 1
2216 is treated as binary. Each binary variable is treated separately
2217 for now.
2218 count : bool, optional
2219 If False, treats count variables (if present) as continuous. This
2220 is the default. Else if True, the marginal effect is the
2221 change in probabilities when each observation is increased by one.
2223 Returns
2224 -------
2225 effects : ndarray
2226 the marginal effect corresponding to the input options
2228 Notes
2229 -----
2230 When using after Poisson, returns the expected number of events
2231 per period, assuming that the model is loglinear.
2232 """
2234 if self.model.constraint is not None:
2235 warnings.warn("marginal effects ignore constraints",
2236 ValueWarning)
2238 return GEEMargins(self, (at, method, atexog, dummy, count))
2240 def plot_isotropic_dependence(self, ax=None, xpoints=10,
2241 min_n=50):
2242 """
2243 Create a plot of the pairwise products of within-group
2244 residuals against the corresponding time differences. This
2245 plot can be used to assess the possible form of an isotropic
2246 covariance structure.
2248 Parameters
2249 ----------
2250 ax : AxesSubplot
2251 An axes on which to draw the graph. If None, new
2252 figure and axes objects are created
2253 xpoints : scalar or array_like
2254 If scalar, the number of points equally spaced points on
2255 the time difference axis used to define bins for
2256 calculating local means. If an array, the specific points
2257 that define the bins.
2258 min_n : int
2259 The minimum sample size in a bin for the mean residual
2260 product to be included on the plot.
2261 """
2263 from statsmodels.graphics import utils as gutils
2265 resid = self.model.cluster_list(self.resid)
2266 time = self.model.cluster_list(self.model.time)
2268 # All within-group pairwise time distances (xdt) and the
2269 # corresponding products of scaled residuals (xre).
2270 xre, xdt = [], []
2271 for re, ti in zip(resid, time):
2272 ix = np.tril_indices(re.shape[0], 0)
2273 re = re[ix[0]] * re[ix[1]] / self.scale ** 2
2274 xre.append(re)
2275 dists = np.sqrt(((ti[ix[0], :] - ti[ix[1], :]) ** 2).sum(1))
2276 xdt.append(dists)
2278 xre = np.concatenate(xre)
2279 xdt = np.concatenate(xdt)
2281 if ax is None:
2282 fig, ax = gutils.create_mpl_ax(ax)
2283 else:
2284 fig = ax.get_figure()
2286 # Convert to a correlation
2287 ii = np.flatnonzero(xdt == 0)
2288 v0 = np.mean(xre[ii])
2289 xre /= v0
2291 # Use the simple average to smooth, since fancier smoothers
2292 # that trim and downweight outliers give biased results (we
2293 # need the actual mean of a skewed distribution).
2294 if np.isscalar(xpoints):
2295 xpoints = np.linspace(0, max(xdt), xpoints)
2296 dg = np.digitize(xdt, xpoints)
2297 dgu = np.unique(dg)
2298 hist = np.asarray([np.sum(dg == k) for k in dgu])
2299 ii = np.flatnonzero(hist >= min_n)
2300 dgu = dgu[ii]
2301 dgy = np.asarray([np.mean(xre[dg == k]) for k in dgu])
2302 dgx = np.asarray([np.mean(xdt[dg == k]) for k in dgu])
2304 ax.plot(dgx, dgy, '-', color='orange', lw=5)
2305 ax.set_xlabel("Time difference")
2306 ax.set_ylabel("Product of scaled residuals")
2308 return fig
2310 def sensitivity_params(self, dep_params_first,
2311 dep_params_last, num_steps):
2312 """
2313 Refits the GEE model using a sequence of values for the
2314 dependence parameters.
2316 Parameters
2317 ----------
2318 dep_params_first : array_like
2319 The first dep_params in the sequence
2320 dep_params_last : array_like
2321 The last dep_params in the sequence
2322 num_steps : int
2323 The number of dep_params in the sequence
2325 Returns
2326 -------
2327 results : array_like
2328 The GEEResults objects resulting from the fits.
2329 """
2331 model = self.model
2333 import copy
2334 cov_struct = copy.deepcopy(self.model.cov_struct)
2336 # We are fixing the dependence structure in each run.
2337 update_dep = model.update_dep
2338 model.update_dep = False
2340 dep_params = []
2341 results = []
2342 for x in np.linspace(0, 1, num_steps):
2344 dp = x * dep_params_last + (1 - x) * dep_params_first
2345 dep_params.append(dp)
2347 model.cov_struct = copy.deepcopy(cov_struct)
2348 model.cov_struct.dep_params = dp
2349 rslt = model.fit(start_params=self.params,
2350 ctol=self.ctol,
2351 params_niter=self.params_niter,
2352 first_dep_update=self.first_dep_update,
2353 cov_type=self.cov_type)
2354 results.append(rslt)
2356 model.update_dep = update_dep
2358 return results
2360 # FIXME: alias to be removed, temporary backwards compatibility
2361 params_sensitivity = sensitivity_params
2364class GEEResultsWrapper(lm.RegressionResultsWrapper):
2365 _attrs = {
2366 'centered_resid': 'rows',
2367 }
2368 _wrap_attrs = wrap.union_dicts(lm.RegressionResultsWrapper._wrap_attrs,
2369 _attrs)
2370wrap.populate_wrapper(GEEResultsWrapper, GEEResults) # noqa:E305
2373class OrdinalGEE(GEE):
2375 __doc__ = (
2376 " Ordinal Response Marginal Regression Model using GEE\n" +
2377 _gee_init_doc % {'extra_params': base._missing_param_doc,
2378 'family_doc': _gee_ordinal_family_doc,
2379 'example': _gee_ordinal_example})
2381 def __init__(self, endog, exog, groups, time=None, family=None,
2382 cov_struct=None, missing='none', offset=None,
2383 dep_data=None, constraint=None, **kwargs):
2385 if family is None:
2386 family = families.Binomial()
2387 else:
2388 if not isinstance(family, families.Binomial):
2389 raise ValueError("ordinal GEE must use a Binomial family")
2391 if cov_struct is None:
2392 cov_struct = cov_structs.OrdinalIndependence()
2394 endog, exog, groups, time, offset = self.setup_ordinal(
2395 endog, exog, groups, time, offset)
2397 super(OrdinalGEE, self).__init__(endog, exog, groups, time,
2398 family, cov_struct, missing,
2399 offset, dep_data, constraint)
2401 def setup_ordinal(self, endog, exog, groups, time, offset):
2402 """
2403 Restructure ordinal data as binary indicators so that they can
2404 be analyzed using Generalized Estimating Equations.
2405 """
2407 self.endog_orig = endog.copy()
2408 self.exog_orig = exog.copy()
2409 self.groups_orig = groups.copy()
2410 if offset is not None:
2411 self.offset_orig = offset.copy()
2412 else:
2413 self.offset_orig = None
2414 offset = np.zeros(len(endog))
2415 if time is not None:
2416 self.time_orig = time.copy()
2417 else:
2418 self.time_orig = None
2419 time = np.zeros((len(endog), 1))
2421 exog = np.asarray(exog)
2422 endog = np.asarray(endog)
2423 groups = np.asarray(groups)
2424 time = np.asarray(time)
2425 offset = np.asarray(offset)
2427 # The unique outcomes, except the greatest one.
2428 self.endog_values = np.unique(endog)
2429 endog_cuts = self.endog_values[0:-1]
2430 ncut = len(endog_cuts)
2432 nrows = ncut * len(endog)
2433 exog_out = np.zeros((nrows, exog.shape[1]),
2434 dtype=np.float64)
2435 endog_out = np.zeros(nrows, dtype=np.float64)
2436 intercepts = np.zeros((nrows, ncut), dtype=np.float64)
2437 groups_out = np.zeros(nrows, dtype=groups.dtype)
2438 time_out = np.zeros((nrows, time.shape[1]),
2439 dtype=np.float64)
2440 offset_out = np.zeros(nrows, dtype=np.float64)
2442 jrow = 0
2443 zipper = zip(exog, endog, groups, time, offset)
2444 for (exog_row, endog_value, group_value, time_value,
2445 offset_value) in zipper:
2447 # Loop over thresholds for the indicators
2448 for thresh_ix, thresh in enumerate(endog_cuts):
2450 exog_out[jrow, :] = exog_row
2451 endog_out[jrow] = (int(endog_value > thresh))
2452 intercepts[jrow, thresh_ix] = 1
2453 groups_out[jrow] = group_value
2454 time_out[jrow] = time_value
2455 offset_out[jrow] = offset_value
2456 jrow += 1
2458 exog_out = np.concatenate((intercepts, exog_out), axis=1)
2460 # exog column names, including intercepts
2461 xnames = ["I(y>%.1f)" % v for v in endog_cuts]
2462 if type(self.exog_orig) == pd.DataFrame:
2463 xnames.extend(self.exog_orig.columns)
2464 else:
2465 xnames.extend(["x%d" % k for k in range(1, exog.shape[1] + 1)])
2466 exog_out = pd.DataFrame(exog_out, columns=xnames)
2468 # Preserve the endog name if there is one
2469 if type(self.endog_orig) == pd.Series:
2470 endog_out = pd.Series(endog_out, name=self.endog_orig.name)
2472 return endog_out, exog_out, groups_out, time_out, offset_out
2474 def _starting_params(self):
2475 model = GEE(self.endog, self.exog, self.groups,
2476 time=self.time, family=families.Binomial(),
2477 offset=self.offset, exposure=self.exposure)
2478 result = model.fit()
2479 return result.params
2481 @Appender(_gee_fit_doc)
2482 def fit(self, maxiter=60, ctol=1e-6, start_params=None,
2483 params_niter=1, first_dep_update=0,
2484 cov_type='robust'):
2486 rslt = super(OrdinalGEE, self).fit(maxiter, ctol, start_params,
2487 params_niter, first_dep_update,
2488 cov_type=cov_type)
2490 rslt = rslt._results # use unwrapped instance
2491 res_kwds = dict(((k, getattr(rslt, k)) for k in rslt._props))
2492 # Convert the GEEResults to an OrdinalGEEResults
2493 ord_rslt = OrdinalGEEResults(self, rslt.params,
2494 rslt.cov_params() / rslt.scale,
2495 rslt.scale,
2496 cov_type=cov_type,
2497 attr_kwds=res_kwds)
2498 # for k in rslt._props:
2499 # setattr(ord_rslt, k, getattr(rslt, k))
2500 # TODO: document or delete
2502 return OrdinalGEEResultsWrapper(ord_rslt)
2505class OrdinalGEEResults(GEEResults):
2507 __doc__ = (
2508 "This class summarizes the fit of a marginal regression model"
2509 "for an ordinal response using GEE.\n"
2510 + _gee_results_doc)
2512 def plot_distribution(self, ax=None, exog_values=None):
2513 """
2514 Plot the fitted probabilities of endog in an ordinal model,
2515 for specified values of the predictors.
2517 Parameters
2518 ----------
2519 ax : AxesSubplot
2520 An axes on which to draw the graph. If None, new
2521 figure and axes objects are created
2522 exog_values : array_like
2523 A list of dictionaries, with each dictionary mapping
2524 variable names to values at which the variable is held
2525 fixed. The values P(endog=y | exog) are plotted for all
2526 possible values of y, at the given exog value. Variables
2527 not included in a dictionary are held fixed at the mean
2528 value.
2530 Example:
2531 --------
2532 We have a model with covariates 'age' and 'sex', and wish to
2533 plot the probabilities P(endog=y | exog) for males (sex=0) and
2534 for females (sex=1), as separate paths on the plot. Since
2535 'age' is not included below in the map, it is held fixed at
2536 its mean value.
2538 >>> ev = [{"sex": 1}, {"sex": 0}]
2539 >>> rslt.distribution_plot(exog_values=ev)
2540 """
2542 from statsmodels.graphics import utils as gutils
2544 if ax is None:
2545 fig, ax = gutils.create_mpl_ax(ax)
2546 else:
2547 fig = ax.get_figure()
2549 # If no covariate patterns are specified, create one with all
2550 # variables set to their mean values.
2551 if exog_values is None:
2552 exog_values = [{}, ]
2554 exog_means = self.model.exog.mean(0)
2555 ix_icept = [i for i, x in enumerate(self.model.exog_names) if
2556 x.startswith("I(")]
2558 for ev in exog_values:
2560 for k in ev.keys():
2561 if k not in self.model.exog_names:
2562 raise ValueError("%s is not a variable in the model"
2563 % k)
2565 # Get the fitted probability for each level, at the given
2566 # covariate values.
2567 pr = []
2568 for j in ix_icept:
2570 xp = np.zeros_like(self.params)
2571 xp[j] = 1.
2572 for i, vn in enumerate(self.model.exog_names):
2573 if i in ix_icept:
2574 continue
2575 # User-specified value
2576 if vn in ev:
2577 xp[i] = ev[vn]
2578 # Mean value
2579 else:
2580 xp[i] = exog_means[i]
2582 p = 1 / (1 + np.exp(-np.dot(xp, self.params)))
2583 pr.append(p)
2585 pr.insert(0, 1)
2586 pr.append(0)
2587 pr = np.asarray(pr)
2588 prd = -np.diff(pr)
2590 ax.plot(self.model.endog_values, prd, 'o-')
2592 ax.set_xlabel("Response value")
2593 ax.set_ylabel("Probability")
2594 ax.set_ylim(0, 1)
2596 return fig
2599def _score_test_submodel(par, sub):
2600 """
2601 Return transformation matrices for design matrices.
2603 Parameters
2604 ----------
2605 par : instance
2606 The parent model
2607 sub : instance
2608 The sub-model
2610 Returns
2611 -------
2612 qm : array_like
2613 Matrix mapping the design matrix of the parent to the design matrix
2614 for the sub-model.
2615 qc : array_like
2616 Matrix mapping the design matrix of the parent to the orthogonal
2617 complement of the columnspace of the submodel in the columnspace
2618 of the parent.
2620 Notes
2621 -----
2622 Returns None, None if the provided submodel is not actually a submodel.
2623 """
2625 x1 = par.exog
2626 x2 = sub.exog
2628 u, s, vt = np.linalg.svd(x1, 0)
2630 # Get the orthogonal complement of col(x2) in col(x1).
2631 a, _, _ = np.linalg.svd(x2, 0)
2632 a = u - np.dot(a, np.dot(a.T, u))
2633 x2c, sb, _ = np.linalg.svd(a, 0)
2634 x2c = x2c[:, sb > 1e-12]
2636 # x1 * qm = x2
2637 qm = np.dot(vt.T, np.dot(u.T, x2) / s[:, None])
2639 e = np.max(np.abs(x2 - np.dot(x1, qm)))
2640 if e > 1e-8:
2641 return None, None
2643 # x1 * qc = x2c
2644 qc = np.dot(vt.T, np.dot(u.T, x2c) / s[:, None])
2646 return qm, qc
2649class OrdinalGEEResultsWrapper(GEEResultsWrapper):
2650 pass
2651wrap.populate_wrapper(OrdinalGEEResultsWrapper, OrdinalGEEResults) # noqa:E305
2654class NominalGEE(GEE):
2656 __doc__ = (
2657 " Nominal Response Marginal Regression Model using GEE.\n" +
2658 _gee_init_doc % {'extra_params': base._missing_param_doc,
2659 'family_doc': _gee_nominal_family_doc,
2660 'example': _gee_nominal_example})
2662 def __init__(self, endog, exog, groups, time=None, family=None,
2663 cov_struct=None, missing='none', offset=None,
2664 dep_data=None, constraint=None, **kwargs):
2666 endog, exog, groups, time, offset = self.setup_nominal(
2667 endog, exog, groups, time, offset)
2669 if family is None:
2670 family = _Multinomial(self.ncut + 1)
2672 if cov_struct is None:
2673 cov_struct = cov_structs.NominalIndependence()
2675 super(NominalGEE, self).__init__(
2676 endog, exog, groups, time, family, cov_struct, missing,
2677 offset, dep_data, constraint)
2679 def _starting_params(self):
2680 model = GEE(self.endog, self.exog, self.groups,
2681 time=self.time, family=families.Binomial(),
2682 offset=self.offset, exposure=self.exposure)
2683 result = model.fit()
2684 return result.params
2686 def setup_nominal(self, endog, exog, groups, time, offset):
2687 """
2688 Restructure nominal data as binary indicators so that they can
2689 be analyzed using Generalized Estimating Equations.
2690 """
2692 self.endog_orig = endog.copy()
2693 self.exog_orig = exog.copy()
2694 self.groups_orig = groups.copy()
2695 if offset is not None:
2696 self.offset_orig = offset.copy()
2697 else:
2698 self.offset_orig = None
2699 offset = np.zeros(len(endog))
2700 if time is not None:
2701 self.time_orig = time.copy()
2702 else:
2703 self.time_orig = None
2704 time = np.zeros((len(endog), 1))
2706 exog = np.asarray(exog)
2707 endog = np.asarray(endog)
2708 groups = np.asarray(groups)
2709 time = np.asarray(time)
2710 offset = np.asarray(offset)
2712 # The unique outcomes, except the greatest one.
2713 self.endog_values = np.unique(endog)
2714 endog_cuts = self.endog_values[0:-1]
2715 ncut = len(endog_cuts)
2716 self.ncut = ncut
2718 nrows = len(endog_cuts) * exog.shape[0]
2719 ncols = len(endog_cuts) * exog.shape[1]
2720 exog_out = np.zeros((nrows, ncols), dtype=np.float64)
2721 endog_out = np.zeros(nrows, dtype=np.float64)
2722 groups_out = np.zeros(nrows, dtype=np.float64)
2723 time_out = np.zeros((nrows, time.shape[1]),
2724 dtype=np.float64)
2725 offset_out = np.zeros(nrows, dtype=np.float64)
2727 jrow = 0
2728 zipper = zip(exog, endog, groups, time, offset)
2729 for (exog_row, endog_value, group_value, time_value,
2730 offset_value) in zipper:
2732 # Loop over thresholds for the indicators
2733 for thresh_ix, thresh in enumerate(endog_cuts):
2735 u = np.zeros(len(endog_cuts), dtype=np.float64)
2736 u[thresh_ix] = 1
2737 exog_out[jrow, :] = np.kron(u, exog_row)
2738 endog_out[jrow] = (int(endog_value == thresh))
2739 groups_out[jrow] = group_value
2740 time_out[jrow] = time_value
2741 offset_out[jrow] = offset_value
2742 jrow += 1
2744 # exog names
2745 if isinstance(self.exog_orig, pd.DataFrame):
2746 xnames_in = self.exog_orig.columns
2747 else:
2748 xnames_in = ["x%d" % k for k in range(1, exog.shape[1] + 1)]
2749 xnames = []
2750 for tr in endog_cuts:
2751 xnames.extend(["%s[%.1f]" % (v, tr) for v in xnames_in])
2752 exog_out = pd.DataFrame(exog_out, columns=xnames)
2753 exog_out = pd.DataFrame(exog_out, columns=xnames)
2755 # Preserve endog name if there is one
2756 if isinstance(self.endog_orig, pd.Series):
2757 endog_out = pd.Series(endog_out, name=self.endog_orig.name)
2759 return endog_out, exog_out, groups_out, time_out, offset_out
2761 def mean_deriv(self, exog, lin_pred):
2762 """
2763 Derivative of the expected endog with respect to the parameters.
2765 Parameters
2766 ----------
2767 exog : array_like
2768 The exogeneous data at which the derivative is computed,
2769 number of rows must be a multiple of `ncut`.
2770 lin_pred : array_like
2771 The values of the linear predictor, length must be multiple
2772 of `ncut`.
2774 Returns
2775 -------
2776 The derivative of the expected endog with respect to the
2777 parameters.
2778 """
2780 expval = np.exp(lin_pred)
2782 # Reshape so that each row contains all the indicators
2783 # corresponding to one multinomial observation.
2784 expval_m = np.reshape(expval, (len(expval) // self.ncut,
2785 self.ncut))
2787 # The normalizing constant for the multinomial probabilities.
2788 denom = 1 + expval_m.sum(1)
2789 denom = np.kron(denom, np.ones(self.ncut, dtype=np.float64))
2791 # The multinomial probabilities
2792 mprob = expval / denom
2794 # First term of the derivative: denom * expval' / denom^2 =
2795 # expval' / denom.
2796 dmat = mprob[:, None] * exog
2798 # Second term of the derivative: -expval * denom' / denom^2
2799 ddenom = expval[:, None] * exog
2800 dmat -= mprob[:, None] * ddenom / denom[:, None]
2802 return dmat
2804 def mean_deriv_exog(self, exog, params, offset_exposure=None):
2805 """
2806 Derivative of the expected endog with respect to exog for the
2807 multinomial model, used in analyzing marginal effects.
2809 Parameters
2810 ----------
2811 exog : array_like
2812 The exogeneous data at which the derivative is computed,
2813 number of rows must be a multiple of `ncut`.
2814 lpr : array_like
2815 The linear predictor values, length must be multiple of
2816 `ncut`.
2818 Returns
2819 -------
2820 The value of the derivative of the expected endog with respect
2821 to exog.
2823 Notes
2824 -----
2825 offset_exposure must be set at None for the multinomial family.
2826 """
2828 if offset_exposure is not None:
2829 warnings.warn("Offset/exposure ignored for the multinomial family",
2830 ValueWarning)
2832 lpr = np.dot(exog, params)
2833 expval = np.exp(lpr)
2835 expval_m = np.reshape(expval, (len(expval) // self.ncut,
2836 self.ncut))
2838 denom = 1 + expval_m.sum(1)
2839 denom = np.kron(denom, np.ones(self.ncut, dtype=np.float64))
2841 bmat0 = np.outer(np.ones(exog.shape[0]), params)
2843 # Masking matrix
2844 qmat = []
2845 for j in range(self.ncut):
2846 ee = np.zeros(self.ncut, dtype=np.float64)
2847 ee[j] = 1
2848 qmat.append(np.kron(ee, np.ones(len(params) // self.ncut)))
2849 qmat = np.array(qmat)
2850 qmat = np.kron(np.ones((exog.shape[0] // self.ncut, 1)), qmat)
2851 bmat = bmat0 * qmat
2853 dmat = expval[:, None] * bmat / denom[:, None]
2855 expval_mb = np.kron(expval_m, np.ones((self.ncut, 1)))
2856 expval_mb = np.kron(expval_mb, np.ones((1, self.ncut)))
2858 dmat -= expval[:, None] * (bmat * expval_mb) / denom[:, None] ** 2
2860 return dmat
2862 @Appender(_gee_fit_doc)
2863 def fit(self, maxiter=60, ctol=1e-6, start_params=None,
2864 params_niter=1, first_dep_update=0,
2865 cov_type='robust'):
2867 rslt = super(NominalGEE, self).fit(maxiter, ctol, start_params,
2868 params_niter, first_dep_update,
2869 cov_type=cov_type)
2870 if rslt is None:
2871 warnings.warn("GEE updates did not converge",
2872 ConvergenceWarning)
2873 return None
2875 rslt = rslt._results # use unwrapped instance
2876 res_kwds = dict(((k, getattr(rslt, k)) for k in rslt._props))
2877 # Convert the GEEResults to a NominalGEEResults
2878 nom_rslt = NominalGEEResults(self, rslt.params,
2879 rslt.cov_params() / rslt.scale,
2880 rslt.scale,
2881 cov_type=cov_type,
2882 attr_kwds=res_kwds)
2883 # TODO: document or delete
2884 # for k in rslt._props:
2885 # setattr(nom_rslt, k, getattr(rslt, k))
2887 return NominalGEEResultsWrapper(nom_rslt)
2890class NominalGEEResults(GEEResults):
2892 __doc__ = (
2893 "This class summarizes the fit of a marginal regression model"
2894 "for a nominal response using GEE.\n"
2895 + _gee_results_doc)
2897 def plot_distribution(self, ax=None, exog_values=None):
2898 """
2899 Plot the fitted probabilities of endog in an nominal model,
2900 for specified values of the predictors.
2902 Parameters
2903 ----------
2904 ax : AxesSubplot
2905 An axes on which to draw the graph. If None, new
2906 figure and axes objects are created
2907 exog_values : array_like
2908 A list of dictionaries, with each dictionary mapping
2909 variable names to values at which the variable is held
2910 fixed. The values P(endog=y | exog) are plotted for all
2911 possible values of y, at the given exog value. Variables
2912 not included in a dictionary are held fixed at the mean
2913 value.
2915 Example:
2916 --------
2917 We have a model with covariates 'age' and 'sex', and wish to
2918 plot the probabilities P(endog=y | exog) for males (sex=0) and
2919 for females (sex=1), as separate paths on the plot. Since
2920 'age' is not included below in the map, it is held fixed at
2921 its mean value.
2923 >>> ex = [{"sex": 1}, {"sex": 0}]
2924 >>> rslt.distribution_plot(exog_values=ex)
2925 """
2927 from statsmodels.graphics import utils as gutils
2929 if ax is None:
2930 fig, ax = gutils.create_mpl_ax(ax)
2931 else:
2932 fig = ax.get_figure()
2934 # If no covariate patterns are specified, create one with all
2935 # variables set to their mean values.
2936 if exog_values is None:
2937 exog_values = [{}, ]
2939 link = self.model.family.link.inverse
2940 ncut = self.model.family.ncut
2942 k = int(self.model.exog.shape[1] / ncut)
2943 exog_means = self.model.exog.mean(0)[0:k]
2944 exog_names = self.model.exog_names[0:k]
2945 exog_names = [x.split("[")[0] for x in exog_names]
2947 params = np.reshape(self.params,
2948 (ncut, len(self.params) // ncut))
2950 for ev in exog_values:
2952 exog = exog_means.copy()
2954 for k in ev.keys():
2955 if k not in exog_names:
2956 raise ValueError("%s is not a variable in the model"
2957 % k)
2959 ii = exog_names.index(k)
2960 exog[ii] = ev[k]
2962 lpr = np.dot(params, exog)
2963 pr = link(lpr)
2964 pr = np.r_[pr, 1 - pr.sum()]
2966 ax.plot(self.model.endog_values, pr, 'o-')
2968 ax.set_xlabel("Response value")
2969 ax.set_ylabel("Probability")
2970 ax.set_xticks(self.model.endog_values)
2971 ax.set_xticklabels(self.model.endog_values)
2972 ax.set_ylim(0, 1)
2974 return fig
2977class NominalGEEResultsWrapper(GEEResultsWrapper):
2978 pass
2979wrap.populate_wrapper(NominalGEEResultsWrapper, NominalGEEResults) # noqa:E305
2982class _MultinomialLogit(Link):
2983 """
2984 The multinomial logit transform, only for use with GEE.
2986 Notes
2987 -----
2988 The data are assumed coded as binary indicators, where each
2989 observed multinomial value y is coded as I(y == S[0]), ..., I(y ==
2990 S[-1]), where S is the set of possible response labels, excluding
2991 the largest one. Thererefore functions in this class should only
2992 be called using vector argument whose length is a multiple of |S|
2993 = ncut, which is an argument to be provided when initializing the
2994 class.
2996 call and derivative use a private method _clean to trim p by 1e-10
2997 so that p is in (0, 1)
2998 """
3000 def __init__(self, ncut):
3001 self.ncut = ncut
3003 def inverse(self, lpr):
3004 """
3005 Inverse of the multinomial logit transform, which gives the
3006 expected values of the data as a function of the linear
3007 predictors.
3009 Parameters
3010 ----------
3011 lpr : array_like (length must be divisible by `ncut`)
3012 The linear predictors
3014 Returns
3015 -------
3016 prob : ndarray
3017 Probabilities, or expected values
3018 """
3020 expval = np.exp(lpr)
3022 denom = 1 + np.reshape(expval, (len(expval) // self.ncut,
3023 self.ncut)).sum(1)
3024 denom = np.kron(denom, np.ones(self.ncut, dtype=np.float64))
3026 prob = expval / denom
3028 return prob
3031class _Multinomial(families.Family):
3032 """
3033 Pseudo-link function for fitting nominal multinomial models with
3034 GEE. Not for use outside the GEE class.
3035 """
3037 links = [_MultinomialLogit, ]
3038 variance = varfuncs.binary
3039 safe_links = [_MultinomialLogit, ]
3041 def __init__(self, nlevels):
3042 """
3043 Parameters
3044 ----------
3045 nlevels : int
3046 The number of distinct categories for the multinomial
3047 distribution.
3048 """
3049 self.initialize(nlevels)
3051 def initialize(self, nlevels):
3052 self.ncut = nlevels - 1
3053 self.link = _MultinomialLogit(self.ncut)
3056class GEEMargins(object):
3057 """
3058 Estimated marginal effects for a regression model fit with GEE.
3060 Parameters
3061 ----------
3062 results : GEEResults instance
3063 The results instance of a fitted discrete choice model
3064 args : tuple
3065 Args are passed to `get_margeff`. This is the same as
3066 results.get_margeff. See there for more information.
3067 kwargs : dict
3068 Keyword args are passed to `get_margeff`. This is the same as
3069 results.get_margeff. See there for more information.
3070 """
3072 def __init__(self, results, args, kwargs={}):
3073 self._cache = {}
3074 self.results = results
3075 self.get_margeff(*args, **kwargs)
3077 def _reset(self):
3078 self._cache = {}
3080 @cache_readonly
3081 def tvalues(self):
3082 _check_at_is_all(self.margeff_options)
3083 return self.margeff / self.margeff_se
3085 def summary_frame(self, alpha=.05):
3086 """
3087 Returns a DataFrame summarizing the marginal effects.
3089 Parameters
3090 ----------
3091 alpha : float
3092 Number between 0 and 1. The confidence intervals have the
3093 probability 1-alpha.
3095 Returns
3096 -------
3097 frame : DataFrames
3098 A DataFrame summarizing the marginal effects.
3099 """
3100 _check_at_is_all(self.margeff_options)
3101 from pandas import DataFrame
3102 names = [_transform_names[self.margeff_options['method']],
3103 'Std. Err.', 'z', 'Pr(>|z|)',
3104 'Conf. Int. Low', 'Cont. Int. Hi.']
3105 ind = self.results.model.exog.var(0) != 0 # True if not a constant
3106 exog_names = self.results.model.exog_names
3107 var_names = [name for i, name in enumerate(exog_names) if ind[i]]
3108 table = np.column_stack((self.margeff, self.margeff_se, self.tvalues,
3109 self.pvalues, self.conf_int(alpha)))
3110 return DataFrame(table, columns=names, index=var_names)
3112 @cache_readonly
3113 def pvalues(self):
3114 _check_at_is_all(self.margeff_options)
3115 return stats.norm.sf(np.abs(self.tvalues)) * 2
3117 def conf_int(self, alpha=.05):
3118 """
3119 Returns the confidence intervals of the marginal effects
3121 Parameters
3122 ----------
3123 alpha : float
3124 Number between 0 and 1. The confidence intervals have the
3125 probability 1-alpha.
3127 Returns
3128 -------
3129 conf_int : ndarray
3130 An array with lower, upper confidence intervals for the marginal
3131 effects.
3132 """
3133 _check_at_is_all(self.margeff_options)
3134 me_se = self.margeff_se
3135 q = stats.norm.ppf(1 - alpha / 2)
3136 lower = self.margeff - q * me_se
3137 upper = self.margeff + q * me_se
3138 return np.asarray(lzip(lower, upper))
3140 def summary(self, alpha=.05):
3141 """
3142 Returns a summary table for marginal effects
3144 Parameters
3145 ----------
3146 alpha : float
3147 Number between 0 and 1. The confidence intervals have the
3148 probability 1-alpha.
3150 Returns
3151 -------
3152 Summary : SummaryTable
3153 A SummaryTable instance
3154 """
3155 _check_at_is_all(self.margeff_options)
3156 results = self.results
3157 model = results.model
3158 title = model.__class__.__name__ + " Marginal Effects"
3159 method = self.margeff_options['method']
3160 top_left = [('Dep. Variable:', [model.endog_names]),
3161 ('Method:', [method]),
3162 ('At:', [self.margeff_options['at']]), ]
3164 from statsmodels.iolib.summary import (Summary, summary_params,
3165 table_extend)
3166 exog_names = model.exog_names[:] # copy
3167 smry = Summary()
3169 const_idx = model.data.const_idx
3170 if const_idx is not None:
3171 exog_names.pop(const_idx)
3173 J = int(getattr(model, "J", 1))
3174 if J > 1:
3175 yname, yname_list = results._get_endog_name(model.endog_names,
3176 None, all=True)
3177 else:
3178 yname = model.endog_names
3179 yname_list = [yname]
3181 smry.add_table_2cols(self, gleft=top_left, gright=[],
3182 yname=yname, xname=exog_names, title=title)
3184 # NOTE: add_table_params is not general enough yet for margeff
3185 # could use a refactor with getattr instead of hard-coded params
3186 # tvalues etc.
3187 table = []
3188 conf_int = self.conf_int(alpha)
3189 margeff = self.margeff
3190 margeff_se = self.margeff_se
3191 tvalues = self.tvalues
3192 pvalues = self.pvalues
3193 if J > 1:
3194 for eq in range(J):
3195 restup = (results, margeff[:, eq], margeff_se[:, eq],
3196 tvalues[:, eq], pvalues[:, eq], conf_int[:, :, eq])
3197 tble = summary_params(restup, yname=yname_list[eq],
3198 xname=exog_names, alpha=alpha,
3199 use_t=False,
3200 skip_header=True)
3201 tble.title = yname_list[eq]
3202 # overwrite coef with method name
3203 header = ['', _transform_names[method], 'std err', 'z',
3204 'P>|z|',
3205 '[%3.1f%% Conf. Int.]' % (100 - alpha * 100)]
3206 tble.insert_header_row(0, header)
3207 # from IPython.core.debugger import Pdb; Pdb().set_trace()
3208 table.append(tble)
3210 table = table_extend(table, keep_headers=True)
3211 else:
3212 restup = (results, margeff, margeff_se, tvalues, pvalues, conf_int)
3213 table = summary_params(restup, yname=yname, xname=exog_names,
3214 alpha=alpha, use_t=False, skip_header=True)
3215 header = ['', _transform_names[method], 'std err', 'z',
3216 'P>|z|', '[%3.1f%% Conf. Int.]' % (100 - alpha * 100)]
3217 table.insert_header_row(0, header)
3219 smry.tables.append(table)
3220 return smry
3222 def get_margeff(self, at='overall', method='dydx', atexog=None,
3223 dummy=False, count=False):
3225 self._reset() # always reset the cache when this is called
3226 # TODO: if at is not all or overall, we can also put atexog values
3227 # in summary table head
3228 method = method.lower()
3229 at = at.lower()
3230 _check_margeff_args(at, method)
3231 self.margeff_options = dict(method=method, at=at)
3232 results = self.results
3233 model = results.model
3234 params = results.params
3235 exog = model.exog.copy() # copy because values are changed
3236 effects_idx = exog.var(0) != 0
3237 const_idx = model.data.const_idx
3239 if dummy:
3240 _check_discrete_args(at, method)
3241 dummy_idx, dummy = _get_dummy_index(exog, const_idx)
3242 else:
3243 dummy_idx = None
3245 if count:
3246 _check_discrete_args(at, method)
3247 count_idx, count = _get_count_index(exog, const_idx)
3248 else:
3249 count_idx = None
3251 # get the exogenous variables
3252 exog = _get_margeff_exog(exog, at, atexog, effects_idx)
3254 # get base marginal effects, handled by sub-classes
3255 effects = model._derivative_exog(params, exog, method,
3256 dummy_idx, count_idx)
3257 effects = _effects_at(effects, at)
3259 if at == 'all':
3260 self.margeff = effects[:, effects_idx]
3261 else:
3262 # Set standard error of the marginal effects by Delta method.
3263 margeff_cov, margeff_se = margeff_cov_with_se(
3264 model, params, exog, results.cov_params(), at,
3265 model._derivative_exog, dummy_idx, count_idx,
3266 method, 1)
3268 # do not care about at constant
3269 self.margeff_cov = margeff_cov[effects_idx][:, effects_idx]
3270 self.margeff_se = margeff_se[effects_idx]
3271 self.margeff = effects[effects_idx]