Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/genmod/families/family.py : 34%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1'''
2The one parameter exponential family distributions used by GLM.
3'''
4# TODO: quasi, quasibinomial, quasipoisson
5# see
6# http://www.biostat.jhsph.edu/~qli/biostatistics_r_doc/library/stats/html/family.html
7# for comparison to R, and McCullagh and Nelder
10import warnings
11import inspect
12import numpy as np
13from scipy import special
14from . import links as L
15from . import varfuncs as V
17FLOAT_EPS = np.finfo(float).eps
20class Family(object):
21 """
22 The parent class for one-parameter exponential families.
24 Parameters
25 ----------
26 link : a link function instance
27 Link is the linear transformation function.
28 See the individual families for available links.
29 variance : a variance function
30 Measures the variance as a function of the mean probabilities.
31 See the individual families for the default variance function.
33 See Also
34 --------
35 :ref:`links` : Further details on links.
36 """
37 # TODO: change these class attributes, use valid somewhere...
38 valid = [-np.inf, np.inf]
39 links = []
41 def _setlink(self, link):
42 """
43 Helper method to set the link for a family.
45 Raises a ``ValueError`` exception if the link is not available. Note
46 that the error message might not be that informative because it tells
47 you that the link should be in the base class for the link function.
49 See statsmodels.genmod.generalized_linear_model.GLM for a list of
50 appropriate links for each family but note that not all of these are
51 currently available.
52 """
53 # TODO: change the links class attribute in the families to hold
54 # meaningful information instead of a list of links instances such as
55 # [<statsmodels.family.links.Log object at 0x9a4240c>,
56 # <statsmodels.family.links.Power object at 0x9a423ec>,
57 # <statsmodels.family.links.Power object at 0x9a4236c>]
58 # for Poisson...
59 self._link = link
60 if not isinstance(link, L.Link):
61 raise TypeError("The input should be a valid Link object.")
62 if hasattr(self, "links"):
63 validlink = max([isinstance(link, _) for _ in self.links])
64 if not validlink:
65 errmsg = "Invalid link for family, should be in %s. (got %s)"
66 raise ValueError(errmsg % (repr(self.links), link))
68 def _getlink(self):
69 """
70 Helper method to get the link for a family.
71 """
72 return self._link
74 # link property for each family is a pointer to link instance
75 link = property(_getlink, _setlink, doc="Link function for family")
77 def __init__(self, link, variance):
78 if inspect.isclass(link):
79 warnmssg = "Calling Family(..) with a link class as argument "
80 warnmssg += "is deprecated.\n"
81 warnmssg += "Use an instance of a link class instead."
82 lvl = 2 if type(self) is Family else 3
83 warnings.warn(warnmssg,
84 category=DeprecationWarning, stacklevel=lvl)
85 self.link = link()
86 else:
87 self.link = link
88 self.variance = variance
90 def starting_mu(self, y):
91 r"""
92 Starting value for mu in the IRLS algorithm.
94 Parameters
95 ----------
96 y : ndarray
97 The untransformed response variable.
99 Returns
100 -------
101 mu_0 : ndarray
102 The first guess on the transformed response variable.
104 Notes
105 -----
106 .. math::
108 \mu_0 = (Y + \overline{Y})/2
110 Only the Binomial family takes a different initial value.
111 """
112 return (y + y.mean())/2.
114 def weights(self, mu):
115 r"""
116 Weights for IRLS steps
118 Parameters
119 ----------
120 mu : array_like
121 The transformed mean response variable in the exponential family
123 Returns
124 -------
125 w : ndarray
126 The weights for the IRLS steps
128 Notes
129 -----
130 .. math::
132 w = 1 / (g'(\mu)^2 * Var(\mu))
133 """
134 return 1. / (self.link.deriv(mu)**2 * self.variance(mu))
136 def deviance(self, endog, mu, var_weights=1., freq_weights=1., scale=1.):
137 r"""
138 The deviance function evaluated at (endog, mu, var_weights,
139 freq_weights, scale) for the distribution.
141 Deviance is usually defined as twice the loglikelihood ratio.
143 Parameters
144 ----------
145 endog : array_like
146 The endogenous response variable
147 mu : array_like
148 The inverse of the link function at the linear predicted values.
149 var_weights : array_like
150 1d array of variance (analytic) weights. The default is 1.
151 freq_weights : array_like
152 1d array of frequency weights. The default is 1.
153 scale : float, optional
154 An optional scale argument. The default is 1.
156 Returns
157 -------
158 Deviance : ndarray
159 The value of deviance function defined below.
161 Notes
162 -----
163 Deviance is defined
165 .. math::
167 D = 2\sum_i (freq\_weights_i * var\_weights *
168 (llf(endog_i, endog_i) - llf(endog_i, \mu_i)))
170 where y is the endogenous variable. The deviance functions are
171 analytically defined for each family.
173 Internally, we calculate deviance as:
175 .. math::
176 D = \sum_i freq\_weights_i * var\_weights * resid\_dev_i / scale
177 """
178 resid_dev = self._resid_dev(endog, mu)
179 return np.sum(resid_dev * freq_weights * var_weights / scale)
181 def resid_dev(self, endog, mu, var_weights=1., scale=1.):
182 r"""
183 The deviance residuals
185 Parameters
186 ----------
187 endog : array_like
188 The endogenous response variable
189 mu : array_like
190 The inverse of the link function at the linear predicted values.
191 var_weights : array_like
192 1d array of variance (analytic) weights. The default is 1.
193 scale : float, optional
194 An optional scale argument. The default is 1.
196 Returns
197 -------
198 resid_dev : float
199 Deviance residuals as defined below.
201 Notes
202 -----
203 The deviance residuals are defined by the contribution D_i of
204 observation i to the deviance as
206 .. math::
207 resid\_dev_i = sign(y_i-\mu_i) \sqrt{D_i}
209 D_i is calculated from the _resid_dev method in each family.
210 Distribution-specific documentation of the calculation is available
211 there.
212 """
213 resid_dev = self._resid_dev(endog, mu)
214 resid_dev *= var_weights / scale
215 return np.sign(endog - mu) * np.sqrt(np.clip(resid_dev, 0., np.inf))
217 def fitted(self, lin_pred):
218 r"""
219 Fitted values based on linear predictors lin_pred.
221 Parameters
222 ----------
223 lin_pred : ndarray
224 Values of the linear predictor of the model.
225 :math:`X \cdot \beta` in a classical linear model.
227 Returns
228 -------
229 mu : ndarray
230 The mean response variables given by the inverse of the link
231 function.
232 """
233 fits = self.link.inverse(lin_pred)
234 return fits
236 def predict(self, mu):
237 """
238 Linear predictors based on given mu values.
240 Parameters
241 ----------
242 mu : ndarray
243 The mean response variables
245 Returns
246 -------
247 lin_pred : ndarray
248 Linear predictors based on the mean response variables. The value
249 of the link function at the given mu.
250 """
251 return self.link(mu)
253 def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
254 r"""
255 The log-likelihood function for each observation in terms of the fitted
256 mean response for the distribution.
258 Parameters
259 ----------
260 endog : ndarray
261 Usually the endogenous response variable.
262 mu : ndarray
263 Usually but not always the fitted mean response variable.
264 var_weights : array_like
265 1d array of variance (analytic) weights. The default is 1.
266 scale : float
267 The scale parameter. The default is 1.
269 Returns
270 -------
271 ll_i : float
272 The value of the loglikelihood evaluated at
273 (endog, mu, var_weights, scale) as defined below.
275 Notes
276 -----
277 This is defined for each family. endog and mu are not restricted to
278 ``endog`` and ``mu`` respectively. For instance, you could call
279 both ``loglike(endog, endog)`` and ``loglike(endog, mu)`` to get the
280 log-likelihood ratio.
281 """
282 raise NotImplementedError
284 def loglike(self, endog, mu, var_weights=1., freq_weights=1., scale=1.):
285 r"""
286 The log-likelihood function in terms of the fitted mean response.
288 Parameters
289 ----------
290 endog : ndarray
291 Usually the endogenous response variable.
292 mu : ndarray
293 Usually but not always the fitted mean response variable.
294 var_weights : array_like
295 1d array of variance (analytic) weights. The default is 1.
296 freq_weights : array_like
297 1d array of frequency weights. The default is 1.
298 scale : float
299 The scale parameter. The default is 1.
301 Returns
302 -------
303 ll : float
304 The value of the loglikelihood evaluated at
305 (endog, mu, var_weights, freq_weights, scale) as defined below.
307 Notes
308 -----
309 Where :math:`ll_i` is the by-observation log-likelihood:
311 .. math::
312 ll = \sum(ll_i * freq\_weights_i)
314 ``ll_i`` is defined for each family. endog and mu are not restricted
315 to ``endog`` and ``mu`` respectively. For instance, you could call
316 both ``loglike(endog, endog)`` and ``loglike(endog, mu)`` to get the
317 log-likelihood ratio.
318 """
319 ll_obs = self.loglike_obs(endog, mu, var_weights, scale)
320 return np.sum(ll_obs * freq_weights)
322 def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
323 r"""
324 The Anscombe residuals
326 Parameters
327 ----------
328 endog : ndarray
329 The endogenous response variable
330 mu : ndarray
331 The inverse of the link function at the linear predicted values.
332 var_weights : array_like
333 1d array of variance (analytic) weights. The default is 1.
334 scale : float, optional
335 An optional argument to divide the residuals by sqrt(scale).
336 The default is 1.
338 See Also
339 --------
340 statsmodels.genmod.families.family.Family : `resid_anscombe` for the
341 individual families for more information
343 Notes
344 -----
345 Anscombe residuals are defined by
347 .. math::
348 resid\_anscombe_i = \frac{A(y)-A(\mu)}{A'(\mu)\sqrt{Var[\mu]}} *
349 \sqrt(var\_weights)
351 where :math:`A'(y)=v(y)^{-\frac{1}{3}}` and :math:`v(\mu)` is the
352 variance function :math:`Var[y]=\frac{\phi}{w}v(mu)`.
353 The transformation :math:`A(y)` makes the residuals more normal
354 distributed.
355 """
356 raise NotImplementedError
358 def _clean(self, x):
359 """
360 Helper function to trim the data so that it is in (0,inf)
362 Notes
363 -----
364 The need for this function was discovered through usage and its
365 possible that other families might need a check for validity of the
366 domain.
367 """
368 return np.clip(x, FLOAT_EPS, np.inf)
371class Poisson(Family):
372 """
373 Poisson exponential family.
375 Parameters
376 ----------
377 link : a link instance, optional
378 The default link for the Poisson family is the log link. Available
379 links are log, identity, and sqrt. See statsmodels.families.links for
380 more information.
382 Attributes
383 ----------
384 Poisson.link : a link instance
385 The link function of the Poisson instance.
386 Poisson.variance : varfuncs instance
387 ``variance`` is an instance of
388 statsmodels.genmod.families.varfuncs.mu
390 See Also
391 --------
392 statsmodels.genmod.families.family.Family : Parent class for all links.
393 :ref:`links` : Further details on links.
394 """
395 links = [L.log, L.identity, L.sqrt]
396 variance = V.mu
397 valid = [0, np.inf]
398 safe_links = [L.Log, ]
400 def __init__(self, link=None):
401 if link is None:
402 link = L.log()
403 super(Poisson, self).__init__(link=link, variance=Poisson.variance)
405 def _resid_dev(self, endog, mu):
406 r"""
407 Poisson deviance residuals
409 Parameters
410 ----------
411 endog : ndarray
412 The endogenous response variable.
413 mu : ndarray
414 The inverse of the link function at the linear predicted values.
416 Returns
417 -------
418 resid_dev : float
419 Deviance residuals as defined below.
421 Notes
422 -----
423 .. math::
425 resid\_dev_i = 2 * (endog_i * \ln(endog_i / \mu_i) -
426 (endog_i - \mu_i))
427 """
428 endog_mu = self._clean(endog / mu)
429 resid_dev = endog * np.log(endog_mu) - (endog - mu)
430 return 2 * resid_dev
432 def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
433 r"""
434 The log-likelihood function for each observation in terms of the fitted
435 mean response for the Poisson distribution.
437 Parameters
438 ----------
439 endog : ndarray
440 Usually the endogenous response variable.
441 mu : ndarray
442 Usually but not always the fitted mean response variable.
443 var_weights : array_like
444 1d array of variance (analytic) weights. The default is 1.
445 scale : float
446 The scale parameter. The default is 1.
448 Returns
449 -------
450 ll_i : float
451 The value of the loglikelihood evaluated at
452 (endog, mu, var_weights, scale) as defined below.
454 Notes
455 -----
456 .. math::
457 ll_i = var\_weights_i / scale * (endog_i * \ln(\mu_i) - \mu_i -
458 \ln \Gamma(endog_i + 1))
459 """
460 return var_weights / scale * (endog * np.log(mu) - mu -
461 special.gammaln(endog + 1))
463 def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
464 r"""
465 The Anscombe residuals
467 Parameters
468 ----------
469 endog : ndarray
470 The endogenous response variable
471 mu : ndarray
472 The inverse of the link function at the linear predicted values.
473 var_weights : array_like
474 1d array of variance (analytic) weights. The default is 1.
475 scale : float, optional
476 An optional argument to divide the residuals by sqrt(scale).
477 The default is 1.
479 Returns
480 -------
481 resid_anscombe : ndarray
482 The Anscombe residuals for the Poisson family defined below
484 Notes
485 -----
486 .. math::
488 resid\_anscombe_i = (3/2) * (endog_i^{2/3} - \mu_i^{2/3}) /
489 \mu_i^{1/6} * \sqrt(var\_weights)
490 """
491 resid = ((3 / 2.) * (endog**(2 / 3.) - mu**(2 / 3.)) /
492 (mu ** (1 / 6.) * scale ** 0.5))
493 resid *= np.sqrt(var_weights)
494 return resid
497class Gaussian(Family):
498 """
499 Gaussian exponential family distribution.
501 Parameters
502 ----------
503 link : a link instance, optional
504 The default link for the Gaussian family is the identity link.
505 Available links are log, identity, and inverse.
506 See statsmodels.genmod.families.links for more information.
508 Attributes
509 ----------
510 Gaussian.link : a link instance
511 The link function of the Gaussian instance
512 Gaussian.variance : varfunc instance
513 ``variance`` is an instance of
514 statsmodels.genmod.families.varfuncs.constant
516 See Also
517 --------
518 statsmodels.genmod.families.family.Family : Parent class for all links.
519 :ref:`links` : Further details on links.
520 """
522 links = [L.log, L.identity, L.inverse_power]
523 variance = V.constant
524 safe_links = links
526 def __init__(self, link=None):
527 if link is None:
528 link = L.identity()
529 super(Gaussian, self).__init__(link=link, variance=Gaussian.variance)
531 def _resid_dev(self, endog, mu):
532 r"""
533 Gaussian deviance residuals
535 Parameters
536 ----------
537 endog : ndarray
538 The endogenous response variable.
539 mu : ndarray
540 The inverse of the link function at the linear predicted values.
542 Returns
543 -------
544 resid_dev : float
545 Deviance residuals as defined below.
547 Notes
548 --------
549 .. math::
551 resid\_dev_i = (endog_i - \mu_i) ** 2
552 """
553 return (endog - mu) ** 2
555 def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
556 r"""
557 The log-likelihood function for each observation in terms of the fitted
558 mean response for the Gaussian distribution.
560 Parameters
561 ----------
562 endog : ndarray
563 Usually the endogenous response variable.
564 mu : ndarray
565 Usually but not always the fitted mean response variable.
566 var_weights : array_like
567 1d array of variance (analytic) weights. The default is 1.
568 scale : float
569 The scale parameter. The default is 1.
571 Returns
572 -------
573 ll_i : float
574 The value of the loglikelihood evaluated at
575 (endog, mu, var_weights, scale) as defined below.
577 Notes
578 -----
579 If the link is the identity link function then the
580 loglikelihood function is the same as the classical OLS model.
582 .. math::
584 llf = -nobs / 2 * (\log(SSR) + (1 + \log(2 \pi / nobs)))
586 where
588 .. math::
590 SSR = \sum_i (Y_i - g^{-1}(\mu_i))^2
592 If the links is not the identity link then the loglikelihood
593 function is defined as
595 .. math::
597 ll_i = -1 / 2 \sum_i * var\_weights * ((Y_i - mu_i)^2 / scale +
598 \log(2 * \pi * scale))
599 """
600 ll_obs = -var_weights * (endog - mu) ** 2 / scale
601 ll_obs += -np.log(scale / var_weights) - np.log(2 * np.pi)
602 ll_obs /= 2
603 return ll_obs
605 def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
606 r"""
607 The Anscombe residuals
609 Parameters
610 ----------
611 endog : ndarray
612 The endogenous response variable
613 mu : ndarray
614 The inverse of the link function at the linear predicted values.
615 var_weights : array_like
616 1d array of variance (analytic) weights. The default is 1.
617 scale : float, optional
618 An optional argument to divide the residuals by sqrt(scale).
619 The default is 1.
621 Returns
622 -------
623 resid_anscombe : ndarray
624 The Anscombe residuals for the Gaussian family defined below
626 Notes
627 -----
628 For the Gaussian distribution, Anscombe residuals are the same as
629 deviance residuals.
631 .. math::
633 resid\_anscombe_i = (Y_i - \mu_i) / \sqrt{scale} *
634 \sqrt(var\_weights)
635 """
636 resid = (endog - mu) / scale ** 0.5
637 resid *= np.sqrt(var_weights)
638 return resid
641class Gamma(Family):
642 """
643 Gamma exponential family distribution.
645 Parameters
646 ----------
647 link : a link instance, optional
648 The default link for the Gamma family is the inverse link.
649 Available links are log, identity, and inverse.
650 See statsmodels.genmod.families.links for more information.
652 Attributes
653 ----------
654 Gamma.link : a link instance
655 The link function of the Gamma instance
656 Gamma.variance : varfunc instance
657 ``variance`` is an instance of
658 statsmodels.genmod.family.varfuncs.mu_squared
660 See Also
661 --------
662 statsmodels.genmod.families.family.Family : Parent class for all links.
663 :ref:`links` : Further details on links.
664 """
665 links = [L.log, L.identity, L.inverse_power]
666 variance = V.mu_squared
667 safe_links = [L.Log, ]
669 def __init__(self, link=None):
670 if link is None:
671 link = L.inverse_power()
672 super(Gamma, self).__init__(link=link, variance=Gamma.variance)
674 def _resid_dev(self, endog, mu):
675 r"""
676 Gamma deviance residuals
678 Parameters
679 ----------
680 endog : ndarray
681 The endogenous response variable.
682 mu : ndarray
683 The inverse of the link function at the linear predicted values.
685 Returns
686 -------
687 resid_dev : float
688 Deviance residuals as defined below.
690 Notes
691 -----
692 .. math::
694 resid\_dev_i = 2 * ((endog_i - \mu_i) / \mu_i -
695 \log(endog_i / \mu_i))
696 """
697 endog_mu = self._clean(endog / mu)
698 resid_dev = -np.log(endog_mu) + (endog - mu) / mu
699 return 2 * resid_dev
701 def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
702 r"""
703 The log-likelihood function for each observation in terms of the fitted
704 mean response for the Gamma distribution.
706 Parameters
707 ----------
708 endog : ndarray
709 Usually the endogenous response variable.
710 mu : ndarray
711 Usually but not always the fitted mean response variable.
712 var_weights : array_like
713 1d array of variance (analytic) weights. The default is 1.
714 scale : float
715 The scale parameter. The default is 1.
717 Returns
718 -------
719 ll_i : float
720 The value of the loglikelihood evaluated at
721 (endog, mu, var_weights, scale) as defined below.
723 Notes
724 -----
725 .. math::
727 ll_i = var\_weights_i / scale * (\ln(var\_weights_i * endog_i /
728 (scale * \mu_i)) - (var\_weights_i * endog_i) /
729 (scale * \mu_i)) - \ln \Gamma(var\_weights_i / scale) - \ln(\mu_i)
730 """
731 endog_mu = self._clean(endog / mu)
732 weight_scale = var_weights / scale
733 ll_obs = weight_scale * np.log(weight_scale * endog_mu)
734 ll_obs -= weight_scale * endog_mu
735 ll_obs -= special.gammaln(weight_scale) + np.log(endog)
736 return ll_obs
738 # in Stata scale is set to equal 1 for reporting llf
739 # in R it's the dispersion, though there is a loss of precision vs.
740 # our results due to an assumed difference in implementation
742 def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
743 r"""
744 The Anscombe residuals
746 Parameters
747 ----------
748 endog : ndarray
749 The endogenous response variable
750 mu : ndarray
751 The inverse of the link function at the linear predicted values.
752 var_weights : array_like
753 1d array of variance (analytic) weights. The default is 1.
754 scale : float, optional
755 An optional argument to divide the residuals by sqrt(scale).
756 The default is 1.
758 Returns
759 -------
760 resid_anscombe : ndarray
761 The Anscombe residuals for the Gamma family defined below
763 Notes
764 -----
765 .. math::
767 resid\_anscombe_i = 3 * (endog_i^{1/3} - \mu_i^{1/3}) / \mu_i^{1/3}
768 / \sqrt{scale} * \sqrt(var\_weights)
769 """
770 resid = 3 * (endog**(1/3.) - mu**(1/3.)) / mu**(1/3.) / scale ** 0.5
771 resid *= np.sqrt(var_weights)
772 return resid
775class Binomial(Family):
776 """
777 Binomial exponential family distribution.
779 Parameters
780 ----------
781 link : a link instance, optional
782 The default link for the Binomial family is the logit link.
783 Available links are logit, probit, cauchy, log, and cloglog.
784 See statsmodels.genmod.families.links for more information.
786 Attributes
787 ----------
788 Binomial.link : a link instance
789 The link function of the Binomial instance
790 Binomial.variance : varfunc instance
791 ``variance`` is an instance of
792 statsmodels.genmod.families.varfuncs.binary
794 See Also
795 --------
796 statsmodels.genmod.families.family.Family : Parent class for all links.
797 :ref:`links` : Further details on links.
799 Notes
800 -----
801 endog for Binomial can be specified in one of three ways:
802 A 1d array of 0 or 1 values, indicating failure or success
803 respectively.
804 A 2d array, with two columns. The first column represents the
805 success count and the second column represents the failure
806 count.
807 A 1d array of proportions, indicating the proportion of
808 successes, with parameter `var_weights` containing the
809 number of trials for each row.
810 """
812 links = [L.logit, L.probit, L.cauchy, L.log, L.cloglog, L.identity]
813 variance = V.binary # this is not used below in an effort to include n
815 # Other safe links, e.g. cloglog and probit are subclasses
816 safe_links = [L.Logit, L.CDFLink]
818 def __init__(self, link=None): # , n=1.):
819 if link is None:
820 link = L.logit()
821 # TODO: it *should* work for a constant n>1 actually, if freq_weights
822 # is equal to n
823 self.n = 1
824 # overwritten by initialize if needed but always used to initialize
825 # variance since endog is assumed/forced to be (0,1)
826 super(Binomial, self).__init__(link=link,
827 variance=V.Binomial(n=self.n))
829 def starting_mu(self, y):
830 r"""
831 The starting values for the IRLS algorithm for the Binomial family.
832 A good choice for the binomial family is :math:`\mu_0 = (Y_i + 0.5)/2`
833 """
834 return (y + .5)/2
836 def initialize(self, endog, freq_weights):
837 '''
838 Initialize the response variable.
840 Parameters
841 ----------
842 endog : ndarray
843 Endogenous response variable
844 freq_weights : ndarray
845 1d array of frequency weights
847 Returns
848 -------
849 If `endog` is binary, returns `endog`
851 If `endog` is a 2d array, then the input is assumed to be in the format
852 (successes, failures) and
853 successes/(success + failures) is returned. And n is set to
854 successes + failures.
855 '''
856 # if not np.all(np.asarray(freq_weights) == 1):
857 # self.variance = V.Binomial(n=freq_weights)
858 if endog.ndim > 1 and endog.shape[1] > 2:
859 raise ValueError('endog has more than 2 columns. The Binomial '
860 'link supports either a single response variable '
861 'or a paired response variable.')
862 elif endog.ndim > 1 and endog.shape[1] > 1:
863 y = endog[:, 0]
864 # overwrite self.freq_weights for deviance below
865 self.n = endog.sum(1)
866 return y*1./self.n, self.n
867 else:
868 return endog, np.ones(endog.shape[0])
870 def _resid_dev(self, endog, mu):
871 r"""
872 Binomial deviance residuals
874 Parameters
875 ----------
876 endog : ndarray
877 The endogenous response variable.
878 mu : ndarray
879 The inverse of the link function at the linear predicted values.
881 Returns
882 -------
883 resid_dev : float
884 Deviance residuals as defined below.
886 Notes
887 -----
888 .. math::
890 resid\_dev_i = 2 * n * (endog_i * \ln(endog_i /\mu_i) +
891 (1 - endog_i) * \ln((1 - endog_i) / (1 - \mu_i)))
892 """
893 endog_mu = self._clean(endog / mu)
894 n_endog_mu = self._clean((1. - endog) / (1. - mu))
895 resid_dev = endog * np.log(endog_mu) + (1 - endog) * np.log(n_endog_mu)
896 return 2 * self.n * resid_dev
898 def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
899 r"""
900 The log-likelihood function for each observation in terms of the fitted
901 mean response for the Binomial distribution.
903 Parameters
904 ----------
905 endog : ndarray
906 Usually the endogenous response variable.
907 mu : ndarray
908 Usually but not always the fitted mean response variable.
909 var_weights : array_like
910 1d array of variance (analytic) weights. The default is 1.
911 scale : float
912 The scale parameter. The default is 1.
914 Returns
915 -------
916 ll_i : float
917 The value of the loglikelihood evaluated at
918 (endog, mu, var_weights, scale) as defined below.
920 Notes
921 -----
922 If the endogenous variable is binary:
924 .. math::
926 ll_i = \sum_i (y_i * \log(\mu_i/(1-\mu_i)) + \log(1-\mu_i)) *
927 var\_weights_i
929 If the endogenous variable is binomial:
931 .. math::
933 ll_i = \sum_i var\_weights_i * (\ln \Gamma(n+1) -
934 \ln \Gamma(y_i + 1) - \ln \Gamma(n_i - y_i +1) + y_i *
935 \log(\mu_i / (n_i - \mu_i)) + n * \log(1 - \mu_i/n_i))
937 where :math:`y_i = Y_i * n_i` with :math:`Y_i` and :math:`n_i` as
938 defined in Binomial initialize. This simply makes :math:`y_i` the
939 original number of successes.
940 """
941 n = self.n # Number of trials
942 y = endog * n # Number of successes
944 # note that mu is still in (0,1), i.e. not converted back
945 return (special.gammaln(n + 1) - special.gammaln(y + 1) -
946 special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu)) +
947 n * np.log(1 - mu)) * var_weights
949 def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
950 r'''
951 The Anscombe residuals
953 Parameters
954 ----------
955 endog : ndarray
956 The endogenous response variable
957 mu : ndarray
958 The inverse of the link function at the linear predicted values.
959 var_weights : array_like
960 1d array of variance (analytic) weights. The default is 1.
961 scale : float, optional
962 An optional argument to divide the residuals by sqrt(scale).
963 The default is 1.
965 Returns
966 -------
967 resid_anscombe : ndarray
968 The Anscombe residuals as defined below.
970 Notes
971 -----
972 .. math::
974 n^{2/3}*(cox\_snell(endog)-cox\_snell(mu)) /
975 (mu*(1-mu/n)*scale^3)^{1/6} * \sqrt(var\_weights)
977 where cox_snell is defined as
978 cox_snell(x) = betainc(2/3., 2/3., x)*betainc(2/3.,2/3.)
979 where betainc is the incomplete beta function as defined in scipy,
980 which uses a regularized version (with the unregularized version, one
981 would just have :math:`cox_snell(x) = Betainc(2/3., 2/3., x)`).
983 The name 'cox_snell' is idiosyncratic and is simply used for
984 convenience following the approach suggested in Cox and Snell (1968).
985 Further note that
986 :math:`cox\_snell(x) = \frac{3}{2}*x^{2/3} *
987 hyp2f1(2/3.,1/3.,5/3.,x)`
988 where hyp2f1 is the hypergeometric 2f1 function. The Anscombe
989 residuals are sometimes defined in the literature using the
990 hyp2f1 formulation. Both betainc and hyp2f1 can be found in scipy.
992 References
993 ----------
994 Anscombe, FJ. (1953) "Contribution to the discussion of H. Hotelling's
995 paper." Journal of the Royal Statistical Society B. 15, 229-30.
997 Cox, DR and Snell, EJ. (1968) "A General Definition of Residuals."
998 Journal of the Royal Statistical Society B. 30, 248-75.
999 '''
1000 endog = endog * self.n # convert back to successes
1001 mu = mu * self.n # convert back to successes
1003 def cox_snell(x):
1004 return special.betainc(2/3., 2/3., x) * special.beta(2/3., 2/3.)
1006 resid = (self.n ** (2/3.) * (cox_snell(endog * 1. / self.n) -
1007 cox_snell(mu * 1. / self.n)) /
1008 (mu * (1 - mu * 1. / self.n) * scale ** 3) ** (1 / 6.))
1009 resid *= np.sqrt(var_weights)
1010 return resid
1013class InverseGaussian(Family):
1014 """
1015 InverseGaussian exponential family.
1017 Parameters
1018 ----------
1019 link : a link instance, optional
1020 The default link for the inverse Gaussian family is the
1021 inverse squared link.
1022 Available links are inverse_squared, inverse, log, and identity.
1023 See statsmodels.genmod.families.links for more information.
1025 Attributes
1026 ----------
1027 InverseGaussian.link : a link instance
1028 The link function of the inverse Gaussian instance
1029 InverseGaussian.variance : varfunc instance
1030 ``variance`` is an instance of
1031 statsmodels.genmod.families.varfuncs.mu_cubed
1033 See Also
1034 --------
1035 statsmodels.genmod.families.family.Family : Parent class for all links.
1036 :ref:`links` : Further details on links.
1038 Notes
1039 -----
1040 The inverse Gaussian distribution is sometimes referred to in the
1041 literature as the Wald distribution.
1042 """
1044 links = [L.inverse_squared, L.inverse_power, L.identity, L.log]
1045 variance = V.mu_cubed
1046 safe_links = [L.inverse_squared, L.Log, ]
1048 def __init__(self, link=None):
1049 if link is None:
1050 link = L.inverse_squared()
1051 super(InverseGaussian, self).__init__(
1052 link=link, variance=InverseGaussian.variance)
1054 def _resid_dev(self, endog, mu):
1055 r"""
1056 Inverse Gaussian deviance residuals
1058 Parameters
1059 ----------
1060 endog : ndarray
1061 The endogenous response variable.
1062 mu : ndarray
1063 The inverse of the link function at the linear predicted values.
1065 Returns
1066 -------
1067 resid_dev : float
1068 Deviance residuals as defined below.
1070 Notes
1071 -----
1072 .. math::
1074 resid\_dev_i = 1 / (endog_i * \mu_i^2) * (endog_i - \mu_i)^2
1075 """
1076 return 1. / (endog * mu ** 2) * (endog - mu) ** 2
1078 def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
1079 r"""
1080 The log-likelihood function for each observation in terms of the fitted
1081 mean response for the Inverse Gaussian distribution.
1083 Parameters
1084 ----------
1085 endog : ndarray
1086 Usually the endogenous response variable.
1087 mu : ndarray
1088 Usually but not always the fitted mean response variable.
1089 var_weights : array_like
1090 1d array of variance (analytic) weights. The default is 1.
1091 scale : float
1092 The scale parameter. The default is 1.
1094 Returns
1095 -------
1096 ll_i : float
1097 The value of the loglikelihood evaluated at
1098 (endog, mu, var_weights, scale) as defined below.
1100 Notes
1101 -----
1102 .. math::
1104 ll_i = -1/2 * (var\_weights_i * (endog_i - \mu_i)^2 /
1105 (scale * endog_i * \mu_i^2) + \ln(scale * \endog_i^3 /
1106 var\_weights_i) - \ln(2 * \pi))
1107 """
1108 ll_obs = -var_weights * (endog - mu) ** 2 / (scale * endog * mu ** 2)
1109 ll_obs += -np.log(scale * endog ** 3 / var_weights) - np.log(2 * np.pi)
1110 ll_obs /= 2
1111 return ll_obs
1113 def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
1114 r"""
1115 The Anscombe residuals
1117 Parameters
1118 ----------
1119 endog : ndarray
1120 The endogenous response variable
1121 mu : ndarray
1122 The inverse of the link function at the linear predicted values.
1123 var_weights : array_like
1124 1d array of variance (analytic) weights. The default is 1.
1125 scale : float, optional
1126 An optional argument to divide the residuals by sqrt(scale).
1127 The default is 1.
1129 Returns
1130 -------
1131 resid_anscombe : ndarray
1132 The Anscombe residuals for the inverse Gaussian distribution as
1133 defined below
1135 Notes
1136 -----
1137 .. math::
1139 resid\_anscombe_i = \log(Y_i / \mu_i) / \sqrt{\mu_i * scale} *
1140 \sqrt(var\_weights)
1141 """
1142 resid = np.log(endog / mu) / np.sqrt(mu * scale)
1143 resid *= np.sqrt(var_weights)
1144 return resid
1147class NegativeBinomial(Family):
1148 r"""
1149 Negative Binomial exponential family.
1151 Parameters
1152 ----------
1153 link : a link instance, optional
1154 The default link for the negative binomial family is the log link.
1155 Available links are log, cloglog, identity, nbinom and power.
1156 See statsmodels.genmod.families.links for more information.
1157 alpha : float, optional
1158 The ancillary parameter for the negative binomial distribution.
1159 For now ``alpha`` is assumed to be nonstochastic. The default value
1160 is 1. Permissible values are usually assumed to be between .01 and 2.
1162 Attributes
1163 ----------
1164 NegativeBinomial.link : a link instance
1165 The link function of the negative binomial instance
1166 NegativeBinomial.variance : varfunc instance
1167 ``variance`` is an instance of
1168 statsmodels.genmod.families.varfuncs.nbinom
1170 See Also
1171 --------
1172 statsmodels.genmod.families.family.Family : Parent class for all links.
1173 :ref:`links` : Further details on links.
1175 Notes
1176 -----
1177 Power link functions are not yet supported.
1179 Parameterization for :math:`y=0, 1, 2, \ldots` is
1181 .. math::
1183 f(y) = \frac{\Gamma(y+\frac{1}{\alpha})}{y!\Gamma(\frac{1}{\alpha})}
1184 \left(\frac{1}{1+\alpha\mu}\right)^{\frac{1}{\alpha}}
1185 \left(\frac{\alpha\mu}{1+\alpha\mu}\right)^y
1187 with :math:`E[Y]=\mu\,` and :math:`Var[Y]=\mu+\alpha\mu^2`.
1188 """
1189 links = [L.log, L.cloglog, L.identity, L.nbinom, L.Power]
1190 # TODO: add the ability to use the power links with an if test
1191 # similar to below
1192 variance = V.nbinom
1193 safe_links = [L.Log, ]
1195 def __init__(self, link=None, alpha=1.):
1196 self.alpha = 1. * alpha # make it at least float
1197 if link is None:
1198 link = L.log()
1199 super(NegativeBinomial, self).__init__(
1200 link=link, variance=V.NegativeBinomial(alpha=self.alpha))
1202 def _resid_dev(self, endog, mu):
1203 r"""
1204 Negative Binomial deviance residuals
1206 Parameters
1207 ----------
1208 endog : ndarray
1209 The endogenous response variable.
1210 mu : ndarray
1211 The inverse of the link function at the linear predicted values.
1213 Returns
1214 -------
1215 resid_dev : float
1216 Deviance residuals as defined below.
1218 Notes
1219 -----
1220 .. math::
1222 resid_dev_i = 2 * (endog_i * \ln(endog_i /
1223 \mu_i) - (endog_i + 1 / \alpha) * \ln((endog_i + 1 / \alpha) /
1224 (\mu_i + 1 / \alpha)))
1225 """
1226 endog_mu = self._clean(endog / mu)
1227 endog_alpha = endog + 1 / self.alpha
1228 mu_alpha = mu + 1 / self.alpha
1229 resid_dev = endog * np.log(endog_mu)
1230 resid_dev -= endog_alpha * np.log(endog_alpha / mu_alpha)
1231 return 2 * resid_dev
1233 def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
1234 r"""
1235 The log-likelihood function for each observation in terms of the fitted
1236 mean response for the Negative Binomial distribution.
1238 Parameters
1239 ----------
1240 endog : ndarray
1241 Usually the endogenous response variable.
1242 mu : ndarray
1243 Usually but not always the fitted mean response variable.
1244 var_weights : array_like
1245 1d array of variance (analytic) weights. The default is 1.
1246 scale : float
1247 The scale parameter. The default is 1.
1249 Returns
1250 -------
1251 ll_i : float
1252 The value of the loglikelihood evaluated at
1253 (endog, mu, var_weights, scale) as defined below.
1255 Notes
1256 -----
1257 Defined as:
1259 .. math::
1261 llf = \sum_i var\_weights_i / scale * (Y_i * \log{(\alpha * \mu_i /
1262 (1 + \alpha * \mu_i))} - \log{(1 + \alpha * \mu_i)}/
1263 \alpha + Constant)
1265 where :math:`Constant` is defined as:
1267 .. math::
1269 Constant = \ln \Gamma{(Y_i + 1/ \alpha )} - \ln \Gamma(Y_i + 1) -
1270 \ln \Gamma{(1/ \alpha )}
1272 constant = (special.gammaln(endog + 1 / self.alpha) -
1273 special.gammaln(endog+1)-special.gammaln(1/self.alpha))
1274 return (endog * np.log(self.alpha * mu / (1 + self.alpha * mu)) -
1275 np.log(1 + self.alpha * mu) / self.alpha +
1276 constant) * var_weights / scale
1277 """
1278 ll_obs = endog * np.log(self.alpha * mu)
1279 ll_obs -= (endog + 1 / self.alpha) * np.log(1 + self.alpha * mu)
1280 ll_obs += special.gammaln(endog + 1 / self.alpha)
1281 ll_obs -= special.gammaln(1 / self.alpha)
1282 ll_obs -= special.gammaln(endog + 1)
1283 return var_weights / scale * ll_obs
1285 def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
1286 r"""
1287 The Anscombe residuals
1289 Parameters
1290 ----------
1291 endog : ndarray
1292 The endogenous response variable
1293 mu : ndarray
1294 The inverse of the link function at the linear predicted values.
1295 var_weights : array_like
1296 1d array of variance (analytic) weights. The default is 1.
1297 scale : float, optional
1298 An optional argument to divide the residuals by sqrt(scale).
1299 The default is 1.
1301 Returns
1302 -------
1303 resid_anscombe : ndarray
1304 The Anscombe residuals as defined below.
1306 Notes
1307 -----
1308 Anscombe residuals for Negative Binomial are the same as for Binomial
1309 upon setting :math:`n=-\frac{1}{\alpha}`. Due to the negative value of
1310 :math:`-\alpha*Y` the representation with the hypergeometric function
1311 :math:`H2F1(x) = hyp2f1(2/3.,1/3.,5/3.,x)` is advantageous
1313 .. math::
1315 resid\_anscombe_i = \frac{3}{2} *
1316 (Y_i^(2/3)*H2F1(-\alpha*Y_i) - \mu_i^(2/3)*H2F1(-\alpha*\mu_i))
1317 / (\mu_i * (1+\alpha*\mu_i) * scale^3)^(1/6) * \sqrt(var\_weights)
1319 Note that for the (unregularized) Beta function, one has
1320 :math:`Beta(z,a,b) = z^a/a * H2F1(a,1-b,a+1,z)`
1321 """
1322 def hyp2f1(x):
1323 return special.hyp2f1(2 / 3., 1 / 3., 5 / 3., x)
1325 resid = (3 / 2. * (endog ** (2 / 3.) * hyp2f1(-self.alpha * endog) -
1326 mu ** (2 / 3.) * hyp2f1(-self.alpha * mu)) /
1327 (mu * (1 + self.alpha * mu) *
1328 scale ** 3) ** (1 / 6.))
1329 resid *= np.sqrt(var_weights)
1330 return resid
1333class Tweedie(Family):
1334 """
1335 Tweedie family.
1337 Parameters
1338 ----------
1339 link : a link instance, optional
1340 The default link for the Tweedie family is the log link.
1341 Available links are log and Power.
1342 See statsmodels.genmod.families.links for more information.
1343 var_power : float, optional
1344 The variance power. The default is 1.
1345 eql : bool
1346 If True, the Extended Quasi-Likelihood is used, else the
1347 likelihood is used (however the latter is not implemented).
1348 If eql is True, var_power must be between 1 and 2.
1350 Attributes
1351 ----------
1352 Tweedie.link : a link instance
1353 The link function of the Tweedie instance
1354 Tweedie.variance : varfunc instance
1355 ``variance`` is an instance of
1356 statsmodels.genmod.families.varfuncs.Power
1357 Tweedie.var_power : float
1358 The power of the variance function.
1360 See Also
1361 --------
1362 statsmodels.genmod.families.family.Family : Parent class for all links.
1363 :ref:`links` : Further details on links.
1365 Notes
1366 -----
1367 Loglikelihood function not implemented because of the complexity of
1368 calculating an infinite series of summations. The variance power can be
1369 estimated using the ``estimate_tweedie_power`` function that is part of the
1370 statsmodels.genmod.generalized_linear_model.GLM class.
1371 """
1372 links = [L.log, L.Power]
1373 variance = V.Power(power=1.5)
1374 safe_links = [L.log, L.Power]
1376 def __init__(self, link=None, var_power=1., eql=False):
1377 self.var_power = var_power
1378 self.eql = eql
1379 if eql and (var_power < 1 or var_power > 2):
1380 raise ValueError("Tweedie: if EQL=True then var_power must fall "
1381 "between 1 and 2")
1382 if link is None:
1383 link = L.log()
1384 super(Tweedie, self).__init__(
1385 link=link, variance=V.Power(power=var_power * 1.))
1387 def _resid_dev(self, endog, mu):
1388 r"""
1389 Tweedie deviance residuals
1391 Parameters
1392 ----------
1393 endog : ndarray
1394 The endogenous response variable.
1395 mu : ndarray
1396 The inverse of the link function at the linear predicted values.
1398 Returns
1399 -------
1400 resid_dev : float
1401 Deviance residuals as defined below.
1403 Notes
1404 -----
1405 When :math:`p = 1`,
1407 .. math::
1409 dev_i = \mu_i
1411 when :math:`endog_i = 0` and
1413 .. math::
1415 dev_i = endog_i * \log(endog_i / \mu_i) + (\mu_i - endog_i)
1417 otherwise.
1419 When :math:`p = 2`,
1421 .. math::
1423 dev_i = (endog_i - \mu_i) / \mu_i - \log(endog_i / \mu_i)
1425 For all other p,
1427 .. math::
1429 dev_i = endog_i^{2 - p} / ((1 - p) * (2 - p)) -
1430 endog_i * \mu_i^{1 - p} / (1 - p) + \mu_i^{2 - p} /
1431 (2 - p)
1433 The deviance residual is then
1435 .. math::
1437 resid\_dev_i = 2 * dev_i
1438 """
1439 p = self.var_power
1440 if p == 1:
1441 dev = np.where(endog == 0,
1442 mu,
1443 endog * np.log(endog / mu) + (mu - endog))
1444 elif p == 2:
1445 endog1 = self._clean(endog)
1446 dev = ((endog - mu) / mu) - np.log(endog1 / mu)
1447 else:
1448 dev = (endog ** (2 - p) / ((1 - p) * (2 - p)) -
1449 endog * mu ** (1-p) / (1 - p) + mu ** (2 - p) / (2 - p))
1450 return 2 * dev
1452 def loglike_obs(self, endog, mu, var_weights=1., scale=1.):
1453 r"""
1454 The log-likelihood function for each observation in terms of the fitted
1455 mean response for the Tweedie distribution.
1457 Parameters
1458 ----------
1459 endog : ndarray
1460 Usually the endogenous response variable.
1461 mu : ndarray
1462 Usually but not always the fitted mean response variable.
1463 var_weights : array_like
1464 1d array of variance (analytic) weights. The default is 1.
1465 scale : float
1466 The scale parameter. The default is 1.
1468 Returns
1469 -------
1470 ll_i : float
1471 The value of the loglikelihood evaluated at
1472 (endog, mu, var_weights, scale) as defined below.
1474 Notes
1475 -----
1476 If eql is True, the Extended Quasi-Likelihood is used. At present,
1477 this method returns NaN if eql is False. When the actual likelihood
1478 is implemented, it will be accessible by setting eql to False.
1480 References
1481 ----------
1482 JA Nelder, D Pregibon (1987). An extended quasi-likelihood function.
1483 Biometrika 74:2, pp 221-232. https://www.jstor.org/stable/2336136
1484 """
1485 if not self.eql:
1486 # We have not yet implemented the actual likelihood
1487 return np.nan
1489 # Equations 9-10 or Nelder and Pregibon
1490 p = self.var_power
1491 llf = np.log(2 * np.pi * scale) + p * np.log(mu) - np.log(var_weights)
1492 llf /= -2
1494 if p == 1:
1495 u = endog * np.log(endog / mu) - (endog - mu)
1496 u *= var_weights / scale
1497 elif p == 2:
1498 yr = endog / mu
1499 u = yr - np.log(yr) - 1
1500 u *= var_weights / scale
1501 else:
1502 u = (endog ** (2 - p)
1503 - (2 - p) * endog * mu ** (1 - p)
1504 + (1 - p) * mu ** (2 - p))
1505 u *= var_weights / (scale * (1 - p) * (2 - p))
1506 llf -= u
1508 return llf
1510 def resid_anscombe(self, endog, mu, var_weights=1., scale=1.):
1511 r"""
1512 The Anscombe residuals
1514 Parameters
1515 ----------
1516 endog : ndarray
1517 The endogenous response variable
1518 mu : ndarray
1519 The inverse of the link function at the linear predicted values.
1520 var_weights : array_like
1521 1d array of variance (analytic) weights. The default is 1.
1522 scale : float, optional
1523 An optional argument to divide the residuals by sqrt(scale).
1524 The default is 1.
1526 Returns
1527 -------
1528 resid_anscombe : ndarray
1529 The Anscombe residuals as defined below.
1531 Notes
1532 -----
1533 When :math:`p = 3`, then
1535 .. math::
1537 resid\_anscombe_i = \log(endog_i / \mu_i) / \sqrt{\mu_i * scale} *
1538 \sqrt(var\_weights)
1540 Otherwise,
1542 .. math::
1544 c = (3 - p) / 3
1546 .. math::
1548 resid\_anscombe_i = (1 / c) * (endog_i^c - \mu_i^c) / \mu_i^{p / 6}
1549 / \sqrt{scale} * \sqrt(var\_weights)
1550 """
1551 if self.var_power == 3:
1552 resid = np.log(endog / mu) / np.sqrt(mu * scale)
1553 else:
1554 c = (3. - self.var_power) / 3.
1555 resid = ((1. / c) * (endog ** c - mu ** c) /
1556 mu ** (self.var_power / 6.)) / scale ** 0.5
1557 resid *= np.sqrt(var_weights)
1558 return resid