Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/stats/diagnostic.py : 9%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# -*- coding: utf-8 -*-
2"""
3Various Statistical Tests
5Author: josef-pktd
6License: BSD-3
8Notes
9-----
10Almost fully verified against R or Gretl, not all options are the same.
11In many cases of Lagrange multiplier tests both the LM test and the F test is
12returned. In some but not all cases, R has the option to choose the test
13statistic. Some alternative test statistic results have not been verified.
15TODO
16* refactor to store intermediate results
17* how easy is it to attach a test that is a class to a result instance,
18 for example CompareCox as a method compare_cox(self, other) ?
20missing:
22* pvalues for breaks_hansen
23* additional options, compare with R, check where ddof is appropriate
24* new tests:
25 - breaks_ap, more recent breaks tests
26 - specification tests against nonparametric alternatives
27"""
28from statsmodels.compat.pandas import deprecate_kwarg
29from statsmodels.compat.python import iteritems
31from collections.abc import Iterable
33import numpy as np
34import pandas as pd
35from scipy import stats
37from statsmodels.regression.linear_model import OLS, RegressionResultsWrapper
38from statsmodels.tsa.tsatools import lagmat
39from statsmodels.tools.validation import (array_like, int_like, bool_like,
40 string_like, dict_like, float_like)
41from statsmodels.stats._lilliefors import (kstest_fit, lilliefors,
42 kstest_normal, kstest_exponential)
43from statsmodels.stats._adnorm import normal_ad, anderson_statistic
45__all__ = ["kstest_fit", "lilliefors", "kstest_normal", "kstest_exponential",
46 "normal_ad", "compare_cox", "compare_j", "acorr_breusch_godfrey",
47 "acorr_ljungbox", "acorr_lm", "het_arch", "het_breuschpagan",
48 "het_goldfeldquandt", "het_white", "spec_white", "linear_lm",
49 "linear_rainbow", "linear_harvey_collier", "anderson_statistic"]
52NESTED_ERROR = """\
53The exog in results_x and in results_z are nested. {test} requires \
54that models are non-nested.
55"""
58# get the old signature back so the examples work
59def unitroot_adf(x, maxlag=None, trendorder=0, autolag='AIC', store=False):
60 import warnings
61 warnings.warn("unitroot_adf is deprecated and will be removed after 0.11.",
62 FutureWarning)
63 from statsmodels.tsa.stattools import adfuller
64 trendorder = {0: 'nc', 1: 'c', 2: 'ct', 3: 'ctt'}[trendorder]
65 return adfuller(x, maxlag=maxlag, regression=trendorder, autolag=autolag,
66 store=store, regresults=False)
69def _check_nested_exog(small, large):
70 """
71 Check if a larger exog nests a smaller exog
73 Parameters
74 ----------
75 small : ndarray
76 exog from smaller model
77 large : ndarray
78 exog from larger model
80 Returns
81 -------
82 bool
83 True if small is nested by large
84 """
86 if small.shape[1] > large.shape[1]:
87 return False
88 coef = np.linalg.lstsq(large, small, rcond=None)[0]
89 err = small - large @ coef
90 return np.linalg.matrix_rank(np.c_[large, err]) == large.shape[1]
93def _check_nested_results(results_x, results_z):
94 if not isinstance(results_x, RegressionResultsWrapper):
95 raise TypeError("results_x must come from a linear regression model")
96 if not isinstance(results_z, RegressionResultsWrapper):
97 raise TypeError("results_z must come from a linear regression model")
98 if not np.allclose(results_x.model.endog, results_z.model.endog):
99 raise ValueError("endogenous variables in models are not the same")
101 x = results_x.model.exog
102 z = results_z.model.exog
104 nested = False
105 if x.shape[1] <= z.shape[1]:
106 nested = nested or _check_nested_exog(x, z)
107 else:
108 nested = nested or _check_nested_exog(z, x)
109 return nested
112class ResultsStore(object):
113 def __str__(self):
114 return getattr(self, '_str', self.__class__.__name__)
117def compare_cox(results_x, results_z, store=False):
118 """
119 Compute the Cox test for non-nested models
121 Parameters
122 ----------
123 results_x : Result instance
124 result instance of first model
125 results_z : Result instance
126 result instance of second model
127 store : bool, default False
128 If true, then the intermediate results are returned.
130 Returns
131 -------
132 tstat : float
133 t statistic for the test that including the fitted values of the
134 first model in the second model has no effect.
135 pvalue : float
136 two-sided pvalue for the t statistic
137 res_store : ResultsStore, optional
138 Intermediate results. Returned if store is True.
140 Notes
141 -----
142 Tests of non-nested hypothesis might not provide unambiguous answers.
143 The test should be performed in both directions and it is possible
144 that both or neither test rejects. see [1]_ for more information.
146 Formulas from [1]_, section 8.3.4 translated to code
148 Matches results for Example 8.3 in Greene
150 References
151 ----------
152 .. [1] Greene, W. H. Econometric Analysis. New Jersey. Prentice Hall;
153 5th edition. (2002).
154 """
155 if _check_nested_results(results_x, results_z):
156 raise ValueError(NESTED_ERROR.format(test="Cox comparison"))
157 x = results_x.model.exog
158 z = results_z.model.exog
159 nobs = results_x.model.endog.shape[0]
160 sigma2_x = results_x.ssr / nobs
161 sigma2_z = results_z.ssr / nobs
162 yhat_x = results_x.fittedvalues
163 res_dx = OLS(yhat_x, z).fit()
164 err_zx = res_dx.resid
165 res_xzx = OLS(err_zx, x).fit()
166 err_xzx = res_xzx.resid
168 sigma2_zx = sigma2_x + np.dot(err_zx.T, err_zx) / nobs
169 c01 = nobs / 2. * (np.log(sigma2_z) - np.log(sigma2_zx))
170 v01 = sigma2_x * np.dot(err_xzx.T, err_xzx) / sigma2_zx ** 2
171 q = c01 / np.sqrt(v01)
172 pval = 2 * stats.norm.sf(np.abs(q))
174 if store:
175 res = ResultsStore()
176 res.res_dx = res_dx
177 res.res_xzx = res_xzx
178 res.c01 = c01
179 res.v01 = v01
180 res.q = q
181 res.pvalue = pval
182 res.dist = stats.norm
183 return q, pval, res
185 return q, pval
188class CompareCox(object):
189 """
190 Cox Test for non-nested models
192 .. deprecated:: 0.11
194 CompareCox is deprecated in favor of compare_cox.
195 CompareCox will be removed after 0.12.
196 """
198 def __init__(self):
199 import warnings
200 warnings.warn("CompareCox is deprecated in favor of compare_cox and "
201 "will be removed after 0.12.", FutureWarning)
202 self.res_dx = self.res_xzx = self.c01 = None
203 self.v01 = self.q = self.pvalue = self.dist = None
205 def run(self, results_x, results_z, attach=False):
206 results = compare_cox(results_x, results_z, store=attach)
207 if attach:
208 res = results[-1]
209 self.res_dx = res.res_dx
210 self.res_xzx = res.res_xzx
211 self.c01 = res.c01
212 self.v01 = res.v01
213 self.q = res.q
214 self.pvalue = res.pvalue
215 self.dist = res.dist
217 return results
219 def __call__(self, results_x, results_z):
220 return self.run(results_x, results_z, attach=False)
223def compare_j(results_x, results_z, store=False):
224 """
225 Compute the J-test for non-nested models
227 Parameters
228 ----------
229 results_x : RegressionResults
230 The result instance of first model.
231 results_z : RegressionResults
232 The result instance of second model.
233 store : bool, default False
234 If true, then the intermediate results are returned.
236 Returns
237 -------
238 tstat : float
239 t statistic for the test that including the fitted values of the
240 first model in the second model has no effect.
241 pvalue : float
242 two-sided pvalue for the t statistic
243 res_store : ResultsStore, optional
244 Intermediate results. Returned if store is True.
246 Notes
247 -----
248 From description in Greene, section 8.3.3. Matches results for Example
249 8.3, Greene.
251 Tests of non-nested hypothesis might not provide unambiguous answers.
252 The test should be performed in both directions and it is possible
253 that both or neither test rejects. see Greene for more information.
255 References
256 ----------
257 .. [1] Greene, W. H. Econometric Analysis. New Jersey. Prentice Hall;
258 5th edition. (2002).
259 """
260 # TODO: Allow cov to be specified
261 if _check_nested_results(results_x, results_z):
262 raise ValueError(NESTED_ERROR.format(test="J comparison"))
263 y = results_x.model.endog
264 z = results_z.model.exog
265 yhat_x = results_x.fittedvalues
266 res_zx = OLS(y, np.column_stack((yhat_x, z))).fit()
267 tstat = res_zx.tvalues[0]
268 pval = res_zx.pvalues[0]
269 if store:
270 res = ResultsStore()
271 res.res_zx = res_zx
272 res.dist = stats.t(res_zx.df_resid)
273 res.teststat = tstat
274 res.pvalue = pval
275 return tstat, pval, res
277 return tstat, pval
280class CompareJ(object):
281 """
282 J-Test for comparing non-nested models
284 .. deprecated:: 0.11
286 CompareJ is deprecated in favor of compare_j.
287 CompareJ will be removed after 0.12.
288 """
290 def __init__(self):
291 import warnings
292 warnings.warn("CompareJ is deprecated in favor of compare_j and will "
293 "be removed after 0.12.", FutureWarning)
294 self.res_zx = self.dist = self.teststat = self.pvalue = None
296 def run(self, results_x, results_z, attach=True):
297 res = compare_j(results_x, results_z, store=attach)
298 tstat, pval = res[:2]
299 if attach:
300 self.res_zx = res[-1].res_zx
301 self.dist = res[-1].dist
302 self.teststat = res[-1].teststat
303 self.pvalue = res[-1].pvalue
305 return tstat, pval
307 def __call__(self, results_x, results_z):
308 return self.run(results_x, results_z, attach=False)
311def compare_encompassing(results_x, results_z, cov_type="nonrobust",
312 cov_kwargs=None):
313 r"""
314 Davidson-MacKinnon encompassing test for comparing non-nested models
316 Parameters
317 ----------
318 results_x : Result instance
319 result instance of first model
320 results_z : Result instance
321 result instance of second model
322 cov_type : str, default "nonrobust
323 Covariance type. The default is "nonrobust` which uses the classic
324 OLS covariance estimator. Specify one of "HC0", "HC1", "HC2", "HC3"
325 to use White's covariance estimator. All covariance types supported
326 by ``OLS.fit`` are accepted.
327 cov_kwargs : dict, default None
328 Dictionary of covariance options passed to ``OLS.fit``. See OLS.fit
329 for more details.
331 Returns
332 -------
333 DataFrame
334 A DataFrame with two rows and four columns. The row labeled x
335 contains results for the null that the model contained in
336 results_x is equivalent to the encompassing model. The results in
337 the row labeled z correspond to the test that the model contained
338 in results_z are equivalent to the encompassing model. The columns
339 are the test statistic, its p-value, and the numerator and
340 denominator degrees of freedom. The test statistic has an F
341 distribution. The numerator degree of freedom is the number of
342 variables in the encompassing model that are not in the x or z model.
343 The denominator degree of freedom is the number of observations minus
344 the number of variables in the nesting model.
346 Notes
347 -----
348 The null is that the fit produced using x is the same as the fit
349 produced using both x and z. When testing whether x is encompassed,
350 the model estimated is
352 .. math::
354 Y = X\beta + Z_1\gamma + \epsilon
356 where :math:`Z_1` are the columns of :math:`Z` that are not spanned by
357 :math:`X`. The null is :math:`H_0:\gamma=0`. When testing whether z is
358 encompassed, the roles of :math:`X` and :math:`Z` are reversed.
360 Implementation of Davidson and MacKinnon (1993)'s encompassing test.
361 Performs two Wald tests where models x and z are compared to a model
362 that nests the two. The Wald tests are performed by using an OLS
363 regression.
364 """
365 if _check_nested_results(results_x, results_z):
366 raise ValueError(NESTED_ERROR.format(test="Testing encompassing"))
368 y = results_x.model.endog
369 x = results_x.model.exog
370 z = results_z.model.exog
372 def _test_nested(endog, a, b, cov_est, cov_kwds):
373 err = b - a @ np.linalg.lstsq(a, b, rcond=None)[0]
374 u, s, v = np.linalg.svd(err)
375 eps = np.finfo(np.double).eps
376 tol = s.max(axis=-1, keepdims=True) * max(err.shape) * eps
377 non_zero = np.abs(s) > tol
378 aug = err @ v[:, non_zero]
379 aug_reg = np.hstack([a, aug])
380 k_a = aug.shape[1]
381 k = aug_reg.shape[1]
383 res = OLS(endog, aug_reg).fit(cov_type=cov_est, cov_kwds=cov_kwds)
384 r_matrix = np.zeros((k_a, k))
385 r_matrix[:, -k_a:] = np.eye(k_a)
386 test = res.wald_test(r_matrix, use_f=True)
387 stat, pvalue = float(np.squeeze(test.statistic)), float(test.pvalue)
388 df_num, df_denom = int(test.df_num), int(test.df_denom)
389 return stat, pvalue, df_num, df_denom
391 x_nested = _test_nested(y, x, z, cov_type, cov_kwargs)
392 z_nested = _test_nested(y, z, x, cov_type, cov_kwargs)
393 return pd.DataFrame([x_nested, z_nested],
394 index=["x", "z"],
395 columns=["stat", "pvalue", "df_num", "df_denom"])
398def acorr_ljungbox(x, lags=None, boxpierce=False, model_df=0, period=None,
399 return_df=None):
400 """
401 Ljung-Box test of autocorrelation in residuals.
403 Parameters
404 ----------
405 x : array_like
406 The data series. The data is demeaned before the test statistic is
407 computed.
408 lags : {int, array_like}, default None
409 If lags is an integer then this is taken to be the largest lag
410 that is included, the test result is reported for all smaller lag
411 length. If lags is a list or array, then all lags are included up to
412 the largest lag in the list, however only the tests for the lags in
413 the list are reported. If lags is None, then the default maxlag is
414 currently min((nobs // 2 - 2), 40). After 0.12 this will change to
415 min(10, nobs // 5). The default number of lags changes if period
416 is set.
417 boxpierce : bool, default False
418 If true, then additional to the results of the Ljung-Box test also the
419 Box-Pierce test results are returned.
420 model_df : int, default 0
421 Number of degrees of freedom consumed by the model. In an ARMA model,
422 this value is usually p+q where p is the AR order and q is the MA
423 order. This value is subtracted from the degrees-of-freedom used in
424 the test so that the adjusted dof for the statistics are
425 lags - model_df. If lags - model_df <= 0, then NaN is returned.
426 period : int, default None
427 The period of a Seasonal time series. Used to compute the max lag
428 for seasonal data which uses min(2*period, nobs // 5) if set. If None,
429 then the default rule is used to set the number of lags. When set, must
430 be >= 2.
431 return_df : bool, default None
432 Flag indicating whether to return the result as a single DataFrame
433 with columns lb_stat, lb_pvalue, and optionally bp_stat and bp_pvalue.
434 After 0.12, this will become the only return method. Set to True
435 to return the DataFrame or False to continue returning the 2 - 4
436 output. If None (the default), a warning is raised.
438 Returns
439 -------
440 lbvalue : float or array
441 The Ljung-Box test statistic.
442 pvalue : float or array
443 The p-value based on chi-square distribution. The p-value is computed
444 as 1.0 - chi2.cdf(lbvalue, dof) where dof is lag - model_df. If
445 lag - model_df <= 0, then NaN is returned for the pvalue.
446 bpvalue : (optional), float or array
447 The test statistic for Box-Pierce test.
448 bppvalue : (optional), float or array
449 The p-value based for Box-Pierce test on chi-square distribution.
450 The p-value is computed as 1.0 - chi2.cdf(bpvalue, dof) where dof is
451 lag - model_df. If lag - model_df <= 0, then NaN is returned for the
452 pvalue.
454 See Also
455 --------
456 statsmodels.regression.linear_model.OLS.fit
457 Regression model fitting.
458 statsmodels.regression.linear_model.RegressionResults
459 Results from linear regression models.
461 Notes
462 -----
463 Ljung-Box and Box-Pierce statistic differ in their scaling of the
464 autocorrelation function. Ljung-Box test is has better finite-sample
465 properties.
467 References
468 ----------
469 .. [*] Green, W. "Econometric Analysis," 5th ed., Pearson, 2003.
471 Examples
472 --------
473 >>> import statsmodels.api as sm
474 >>> data = sm.datasets.sunspots.load_pandas().data
475 >>> res = sm.tsa.ARMA(data["SUNACTIVITY"], (1,1)).fit(disp=-1)
476 >>> sm.stats.acorr_ljungbox(res.resid, lags=[10], return_df=True)
477 lb_stat lb_pvalue
478 10 214.106992 1.827374e-40
479 """
480 x = array_like(x, "x")
481 period = int_like(period, "period", optional=True)
482 return_df = bool_like(return_df, "return_df", optional=True)
483 model_df = int_like(model_df, "model_df", optional=False)
484 if period is not None and period <= 1:
485 raise ValueError("period must be >= 2")
486 if model_df < 0:
487 raise ValueError("model_df must be >= 0")
488 nobs = x.shape[0]
489 if period is not None:
490 lags = np.arange(1, min(nobs // 5, 2 * period) + 1, dtype=np.int)
491 elif lags is None:
492 # TODO: Switch to min(10, nobs//5) after 0.12
493 import warnings
494 warnings.warn("The default value of lags is changing. After 0.12, "
495 "this value will become min(10, nobs//5). Directly set"
496 "lags to silence this warning.", FutureWarning)
497 # Future
498 # lags = np.arange(1, min(nobs // 5, 10) + 1, dtype=np.int)
499 lags = np.arange(1, min((nobs // 2 - 2), 40) + 1, dtype=np.int)
500 elif not isinstance(lags, Iterable):
501 lags = int_like(lags, "lags")
502 lags = np.arange(1, lags + 1)
503 lags = array_like(lags, "lags", dtype="int")
504 maxlag = lags.max()
506 # Avoid cyclic import
507 from statsmodels.tsa.stattools import acf
508 # normalize by nobs not (nobs-nlags)
509 # SS: unbiased=False is default now
510 sacf = acf(x, nlags=maxlag, fft=False)
511 sacf2 = sacf[1:maxlag + 1] ** 2 / (nobs - np.arange(1, maxlag + 1))
512 qljungbox = nobs * (nobs + 2) * np.cumsum(sacf2)[lags - 1]
513 adj_lags = lags - model_df
514 pval = np.full_like(qljungbox, np.nan)
515 loc = adj_lags > 0
516 pval[loc] = stats.chi2.sf(qljungbox[loc], adj_lags[loc])
518 if return_df is None:
519 msg = ("The value returned will change to a single DataFrame after "
520 "0.12 is released. Set return_df to True to use to return a "
521 "DataFrame now. Set return_df to False to silence this "
522 "warning.")
523 import warnings
524 warnings.warn(msg, FutureWarning)
526 if not boxpierce:
527 if return_df:
528 return pd.DataFrame({"lb_stat": qljungbox, "lb_pvalue": pval},
529 index=lags)
530 return qljungbox, pval
532 qboxpierce = nobs * np.cumsum(sacf[1:maxlag + 1] ** 2)[lags - 1]
533 pvalbp = np.full_like(qljungbox, np.nan)
534 pvalbp[loc] = stats.chi2.sf(qboxpierce[loc], adj_lags[loc])
535 if return_df:
536 return pd.DataFrame({"lb_stat": qljungbox, "lb_pvalue": pval,
537 "bp_stat": qboxpierce, "bp_pvalue": pvalbp},
538 index=lags)
540 return qljungbox, pval, qboxpierce, pvalbp
543@deprecate_kwarg("maxlag", "nlags")
544def acorr_lm(resid, nlags=None, autolag="AIC", store=False, *, period=None,
545 ddof=0, cov_type="nonrobust", cov_kwargs=None):
546 """
547 Lagrange Multiplier tests for autocorrelation.
549 This is a generic Lagrange Multiplier test for autocorrelation. Returns
550 Engle's ARCH test if resid is the squared residual array. Breusch-Godfrey
551 is a variation on this test with additional exogenous variables.
553 Parameters
554 ----------
555 resid : array_like
556 Time series to test.
557 nlags : int, default None
558 Highest lag to use. The behavior of this parameter will change
559 after 0.12.
560 autolag : {str, None}, default "AIC"
561 If None, then a fixed number of lags given by maxlag is used. This
562 parameter is deprecated and will be removed after 0.12. Searching
563 for model specification cannot control test size.
564 store : bool, default False
565 If true then the intermediate results are also returned.
566 period : int, default none
567 The period of a Seasonal time series. Used to compute the max lag
568 for seasonal data which uses min(2*period, nobs // 5) if set. If None,
569 then the default rule is used to set the number of lags. When set, must
570 be >= 2.
571 ddof : int, default 0
572 The number of degrees of freedom consumed by the model used to
573 produce resid. The default value is 0.
574 cov_type : str, default "nonrobust"
575 Covariance type. The default is "nonrobust` which uses the classic
576 OLS covariance estimator. Specify one of "HC0", "HC1", "HC2", "HC3"
577 to use White's covariance estimator. All covariance types supported
578 by ``OLS.fit`` are accepted.
579 cov_kwargs : dict, default None
580 Dictionary of covariance options passed to ``OLS.fit``. See OLS.fit for
581 more details.
583 Returns
584 -------
585 lm : float
586 Lagrange multiplier test statistic.
587 lmpval : float
588 The p-value for Lagrange multiplier test.
589 fval : float
590 The f statistic of the F test, alternative version of the same
591 test based on F test for the parameter restriction.
592 fpval : float
593 The pvalue of the F test.
594 res_store : ResultsStore, optional
595 Intermediate results. Only returned if store=True.
597 See Also
598 --------
599 het_arch
600 Conditional heteroskedasticity testing.
601 acorr_breusch_godfrey
602 Breusch-Godfrey test for serial correlation.
603 acorr_ljung_box
604 Ljung-Box test for serial correlation.
606 Notes
607 -----
608 The test statistic is computed as (nobs - ddof) * r2 where r2 is the
609 R-squared from a regression on the residual on nlags lags of the
610 residual.
611 """
612 resid = array_like(resid, "resid", ndim=1)
613 cov_type = string_like(cov_type, "cov_type")
614 cov_kwargs = {} if cov_kwargs is None else cov_kwargs
615 cov_kwargs = dict_like(cov_kwargs, "cov_kwargs")
616 nobs = resid.shape[0]
617 if period is not None and nlags is None:
618 maxlag = min(nobs // 5, 2 * period)
619 elif nlags is None:
620 # TODO: Switch to min(10, nobs//5) after 0.12
621 import warnings
622 warnings.warn("The default value of nlags is changing. After 0.12, "
623 "this value will become min(10, nobs//5). Directly set"
624 "maxlags or period to silence this warning.",
625 FutureWarning)
626 # Future
627 # maxlag = min(nobs // 5, 10)
628 # Old: for adf from Greene referencing Schwert 1989
629 maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))
630 else:
631 maxlag = nlags
633 xdall = lagmat(resid[:, None], maxlag, trim="both")
634 nobs = xdall.shape[0]
635 xdall = np.c_[np.ones((nobs, 1)), xdall]
636 xshort = resid[-nobs:]
637 res_store = ResultsStore()
639 if autolag:
640 # TODO: Deprecate this
641 # Use same rules as autocorr
642 # search for lag length with highest information criteria
643 # Note: I use the same number of observations to have comparable IC
644 import warnings
645 warnings.warn("autolag is deprecated and will be removed after 0.12. "
646 "Model selection before testing fails to control test "
647 "size. Set autolag to False to silence this warning.",
648 FutureWarning)
649 results = {}
650 for mlag in range(1, maxlag + 1):
651 results[mlag] = OLS(xshort, xdall[:, :mlag + 1]).fit()
653 if autolag.lower() == "aic":
654 bestic, icbestlag = min((v.aic, k) for k, v in iteritems(results))
655 elif autolag.lower() == "bic":
656 icbest, icbestlag = min((v.bic, k) for k, v in iteritems(results))
657 else:
658 raise ValueError("autolag can only be None, \"AIC\" or \"BIC\"")
660 # rerun ols with best ic
661 xdall = lagmat(resid[:, None], icbestlag, trim="both")
662 nobs = xdall.shape[0]
663 xdall = np.c_[np.ones((nobs, 1)), xdall]
664 xshort = resid[-nobs:]
665 usedlag = icbestlag
666 if store:
667 res_store.results = results
668 else:
669 usedlag = maxlag
671 resols = OLS(xshort, xdall[:, :usedlag + 1]).fit(cov_type=cov_type,
672 cov_kwargs=cov_kwargs)
673 fval = resols.fvalue
674 fpval = resols.f_pvalue
675 if cov_type == "nonrobust":
676 lm = (nobs - ddof) * resols.rsquared
677 lmpval = stats.chi2.sf(lm, usedlag)
678 # Note: deg of freedom for LM test: nvars - constant = lags used
679 else:
680 r_matrix = np.hstack((np.ones((usedlag, 1)), np.eye(usedlag)))
681 test_stat = resols.wald_test(r_matrix, use_f=False)
682 lm = float(test_stat.statistic)
683 lmpval = test_stat.pvalue
685 if store:
686 res_store.resols = resols
687 res_store.usedlag = usedlag
688 return lm, lmpval, fval, fpval, res_store
689 else:
690 return lm, lmpval, fval, fpval
693@deprecate_kwarg("maxlag", "nlags")
694def het_arch(resid, nlags=None, autolag=None, store=False, ddof=0):
695 """
696 Engle's Test for Autoregressive Conditional Heteroscedasticity (ARCH).
698 Parameters
699 ----------
700 resid : ndarray
701 residuals from an estimation, or time series
702 nlags : int, default None
703 Highest lag to use. The behavior of this parameter will change
704 after 0.12.
705 autolag : {str, None}, default None
706 If None, then a fixed number of lags given by maxlag is used. This
707 parameter is deprecated and will be removed after 0.12. Searching
708 for model specification cannot control test size.
709 store : bool, default False
710 If true then the intermediate results are also returned
711 ddof : int, default 0
712 If the residuals are from a regression, or ARMA estimation, then there
713 are recommendations to correct the degrees of freedom by the number
714 of parameters that have been estimated, for example ddof=p+q for an
715 ARMA(p,q).
717 Returns
718 -------
719 lm : float
720 Lagrange multiplier test statistic
721 lmpval : float
722 p-value for Lagrange multiplier test
723 fval : float
724 fstatistic for F test, alternative version of the same test based on
725 F test for the parameter restriction
726 fpval : float
727 pvalue for F test
728 res_store : ResultsStore, optional
729 Intermediate results. Returned if store is True.
731 Notes
732 -----
733 verified against R:FinTS::ArchTest
734 """
735 return acorr_lm(resid ** 2, nlags=nlags, autolag=autolag, store=store,
736 ddof=ddof)
739@deprecate_kwarg("results", "res")
740def acorr_breusch_godfrey(res, nlags=None, store=False):
741 """
742 Breusch-Godfrey Lagrange Multiplier tests for residual autocorrelation.
744 Parameters
745 ----------
746 res : RegressionResults
747 Estimation results for which the residuals are tested for serial
748 correlation.
749 nlags : int, default None
750 Number of lags to include in the auxiliary regression. (nlags is
751 highest lag).
752 store : bool, default False
753 If store is true, then an additional class instance that contains
754 intermediate results is returned.
756 Returns
757 -------
758 lm : float
759 Lagrange multiplier test statistic.
760 lmpval : float
761 The p-value for Lagrange multiplier test.
762 fval : float
763 The value of the f statistic for F test, alternative version of the
764 same test based on F test for the parameter restriction.
765 fpval : float
766 The pvalue for F test.
767 res_store : ResultsStore
768 A class instance that holds intermediate results. Only returned if
769 store=True.
771 Notes
772 -----
773 BG adds lags of residual to exog in the design matrix for the auxiliary
774 regression with residuals as endog. See [1]_, section 12.7.1.
776 References
777 ----------
778 .. [1] Greene, W. H. Econometric Analysis. New Jersey. Prentice Hall;
779 5th edition. (2002).
780 """
782 x = np.asarray(res.resid).squeeze()
783 if x.ndim != 1:
784 raise ValueError("Model resid must be a 1d array. Cannot be used on"
785 " multivariate models.")
786 exog_old = res.model.exog
787 nobs = x.shape[0]
788 if nlags is None:
789 # TODO: Switch to min(10, nobs//5) after 0.12
790 import warnings
791 warnings.warn("The default value of nlags is changing. After 0.12, "
792 "this value will become min(10, nobs//5). Directly set"
793 "nlags or period to silence this warning.",
794 FutureWarning)
796 nlags = np.trunc(12. * np.power(nobs / 100., 1 / 4.))
797 nlags = int(nlags)
799 x = np.concatenate((np.zeros(nlags), x))
801 xdall = lagmat(x[:, None], nlags, trim="both")
802 nobs = xdall.shape[0]
803 xdall = np.c_[np.ones((nobs, 1)), xdall]
804 xshort = x[-nobs:]
805 exog = np.column_stack((exog_old, xdall))
806 k_vars = exog.shape[1]
808 resols = OLS(xshort, exog).fit()
809 ft = resols.f_test(np.eye(nlags, k_vars, k_vars - nlags))
810 fval = ft.fvalue
811 fpval = ft.pvalue
812 fval = float(np.squeeze(fval))
813 fpval = float(np.squeeze(fpval))
814 lm = nobs * resols.rsquared
815 lmpval = stats.chi2.sf(lm, nlags)
816 # Note: degrees of freedom for LM test is nvars minus constant = usedlags
818 if store:
819 res_store = ResultsStore()
820 res_store.resols = resols
821 res_store.usedlag = nlags
822 return lm, lmpval, fval, fpval, res_store
823 else:
824 return lm, lmpval, fval, fpval
827def het_breuschpagan(resid, exog_het):
828 r"""
829 Breusch-Pagan Lagrange Multiplier test for heteroscedasticity
831 The tests the hypothesis that the residual variance does not depend on
832 the variables in x in the form
834 .. :math: \sigma_i = \sigma * f(\alpha_0 + \alpha z_i)
836 Homoscedasticity implies that :math:`\alpha=0`.
838 Parameters
839 ----------
840 resid : array_like
841 For the Breusch-Pagan test, this should be the residual of a
842 regression. If an array is given in exog, then the residuals are
843 calculated by the an OLS regression or resid on exog. In this case
844 resid should contain the dependent variable. Exog can be the same as x.
845 exog_het : array_like
846 This contains variables suspected of being related to
847 heteroscedasticity in resid.
849 Returns
850 -------
851 lm : float
852 lagrange multiplier statistic
853 lm_pvalue :float
854 p-value of lagrange multiplier test
855 fvalue : float
856 f-statistic of the hypothesis that the error variance does not depend
857 on x
858 f_pvalue : float
859 p-value for the f-statistic
861 Notes
862 -----
863 Assumes x contains constant (for counting dof and calculation of R^2).
864 In the general description of LM test, Greene mentions that this test
865 exaggerates the significance of results in small or moderately large
866 samples. In this case the F-statistic is preferable.
868 **Verification**
870 Chisquare test statistic is exactly (<1e-13) the same result as bptest
871 in R-stats with defaults (studentize=True).
873 **Implementation**
875 This is calculated using the generic formula for LM test using $R^2$
876 (Greene, section 17.6) and not with the explicit formula
877 (Greene, section 11.4.3).
878 The degrees of freedom for the p-value assume x is full rank.
880 References
881 ----------
882 .. [1] Greene, W. H. Econometric Analysis. New Jersey. Prentice Hall;
883 5th edition. (2002).
884 .. [2] Breusch, T. S.; Pagan, A. R. (1979). "A Simple Test for
885 Heteroskedasticity and Random Coefficient Variation". Econometrica.
886 47 (5): 1287–1294.
887 """
889 x = np.asarray(exog_het)
890 y = np.asarray(resid) ** 2
891 nobs, nvars = x.shape
892 resols = OLS(y, x).fit()
893 fval = resols.fvalue
894 fpval = resols.f_pvalue
895 lm = nobs * resols.rsquared
896 # Note: degrees of freedom for LM test is nvars minus constant
897 return lm, stats.chi2.sf(lm, nvars - 1), fval, fpval
900def het_white(resid, exog):
901 """
902 White's Lagrange Multiplier Test for Heteroscedasticity.
904 Parameters
905 ----------
906 resid : array_like
907 The residuals. The squared residuals are used as the endogenous
908 variable.
909 exog : array_like
910 The explanatory variables for the variance. Squares and interaction
911 terms are automatically included in the auxiliary regression.
913 Returns
914 -------
915 lm : float
916 The lagrange multiplier statistic.
917 lm_pvalue :float
918 The p-value of lagrange multiplier test.
919 fvalue : float
920 The f-statistic of the hypothesis that the error variance does not
921 depend on x. This is an alternative test variant not the original
922 LM test.
923 f_pvalue : float
924 The p-value for the f-statistic.
926 Notes
927 -----
928 Assumes x contains constant (for counting dof).
930 question: does f-statistic make sense? constant ?
932 References
933 ----------
934 Greene section 11.4.1 5th edition p. 222. Test statistic reproduces
935 Greene 5th, example 11.3.
936 """
937 x = array_like(exog, "exog", ndim=2)
938 y = array_like(resid, "resid", ndim=2, shape=(x.shape[0], 1))
939 if x.shape[1] < 2:
940 raise ValueError("White's heteroskedasticity test requires exog to"
941 "have at least two columns where one is a constant.")
942 nobs, nvars0 = x.shape
943 i0, i1 = np.triu_indices(nvars0)
944 exog = x[:, i0] * x[:, i1]
945 nobs, nvars = exog.shape
946 assert nvars == nvars0 * (nvars0 - 1) / 2. + nvars0
947 resols = OLS(y ** 2, exog).fit()
948 fval = resols.fvalue
949 fpval = resols.f_pvalue
950 lm = nobs * resols.rsquared
951 # Note: degrees of freedom for LM test is nvars minus constant
952 # degrees of freedom take possible reduced rank in exog into account
953 # df_model checks the rank to determine df
954 # extra calculation that can be removed:
955 assert resols.df_model == np.linalg.matrix_rank(exog) - 1
956 lmpval = stats.chi2.sf(lm, resols.df_model)
957 return lm, lmpval, fval, fpval
960def het_goldfeldquandt(y, x, idx=None, split=None, drop=None,
961 alternative="increasing", store=False):
962 """
963 Goldfeld-Quandt homoskedasticity test.
965 This test examines whether the residual variance is the same in 2
966 subsamples.
968 Parameters
969 ----------
970 y : array_like
971 endogenous variable
972 x : array_like
973 exogenous variable, regressors
974 idx : int, default None
975 column index of variable according to which observations are
976 sorted for the split
977 split : {int, float}, default None
978 If an integer, this is the index at which sample is split.
979 If a float in 0<split<1 then split is interpreted as fraction
980 of the observations in the first sample. If None, uses nobs//2.
981 drop : {int, float}, default None
982 If this is not None, then observation are dropped from the middle
983 part of the sorted series. If 0<split<1 then split is interpreted
984 as fraction of the number of observations to be dropped.
985 Note: Currently, observations are dropped between split and
986 split+drop, where split and drop are the indices (given by rounding
987 if specified as fraction). The first sample is [0:split], the
988 second sample is [split+drop:]
989 alternative : {"increasing", "decreasing", "two-sided"}
990 The default is increasing. This specifies the alternative for the
991 p-value calculation.
992 store : bool, default False
993 Flag indicating to return the regression results
995 Returns
996 -------
997 fval : float
998 value of the F-statistic
999 pval : float
1000 p-value of the hypothesis that the variance in one subsample is
1001 larger than in the other subsample
1002 ordering : str
1003 The ordering used in the alternative.
1004 res_store : ResultsStore, optional
1005 Storage for the intermediate and final results that are calculated
1007 Notes
1008 -----
1009 The Null hypothesis is that the variance in the two sub-samples are the
1010 same. The alternative hypothesis, can be increasing, i.e. the variance
1011 in the second sample is larger than in the first, or decreasing or
1012 two-sided.
1014 Results are identical R, but the drop option is defined differently.
1015 (sorting by idx not tested yet)
1016 """
1017 x = np.asarray(x)
1018 y = np.asarray(y) # **2
1019 nobs, nvars = x.shape
1020 if split is None:
1021 split = nobs // 2
1022 elif (0 < split) and (split < 1):
1023 split = int(nobs * split)
1025 if drop is None:
1026 start2 = split
1027 elif (0 < drop) and (drop < 1):
1028 start2 = split + int(nobs * drop)
1029 else:
1030 start2 = split + drop
1032 if idx is not None:
1033 xsortind = np.argsort(x[:, idx])
1034 y = y[xsortind]
1035 x = x[xsortind, :]
1037 resols1 = OLS(y[:split], x[:split]).fit()
1038 resols2 = OLS(y[start2:], x[start2:]).fit()
1039 fval = resols2.mse_resid / resols1.mse_resid
1040 # if fval>1:
1041 if alternative.lower() in ["i", "inc", "increasing"]:
1042 fpval = stats.f.sf(fval, resols1.df_resid, resols2.df_resid)
1043 ordering = "increasing"
1044 elif alternative.lower() in ["d", "dec", "decreasing"]:
1045 fpval = stats.f.sf(1. / fval, resols2.df_resid, resols1.df_resid)
1046 ordering = "decreasing"
1047 elif alternative.lower() in ["2", "2-sided", "two-sided"]:
1048 fpval_sm = stats.f.cdf(fval, resols2.df_resid, resols1.df_resid)
1049 fpval_la = stats.f.sf(fval, resols2.df_resid, resols1.df_resid)
1050 fpval = 2 * min(fpval_sm, fpval_la)
1051 ordering = "two-sided"
1052 else:
1053 raise ValueError("invalid alternative")
1055 if store:
1056 res = ResultsStore()
1057 res.__doc__ = "Test Results for Goldfeld-Quandt test of" \
1058 "heterogeneity"
1059 res.fval = fval
1060 res.fpval = fpval
1061 res.df_fval = (resols2.df_resid, resols1.df_resid)
1062 res.resols1 = resols1
1063 res.resols2 = resols2
1064 res.ordering = ordering
1065 res.split = split
1066 res._str = """\
1067The Goldfeld-Quandt test for null hypothesis that the variance in the second
1068subsample is %s than in the first subsample:
1069F-statistic =%8.4f and p-value =%8.4f""" % (ordering, fval, fpval)
1071 return fval, fpval, ordering, res
1073 return fval, fpval, ordering
1076class HetGoldfeldQuandt(object):
1077 """
1078 Test whether variance is the same in 2 subsamples
1080 .. deprecated:: 0.11
1082 HetGoldfeldQuandt is deprecated in favor of het_goldfeldquandt.
1083 HetGoldfeldQuandt will be removed after 0.12.
1085 See Also
1086 --------
1087 het_goldfeldquant
1088 Goldfeld-Quant heteroskedasticity test.
1089 """
1091 def __init__(self):
1092 import warnings
1093 warnings.warn("HetGoldfeldQuandt is deprecated in favor of"
1094 "het_goldfeldquandt. HetGoldfeldQuandt will be removed"
1095 "after 0.12.",
1096 FutureWarning)
1097 self.fval = self.fpval = self.df_fval = self.resols1 = None
1098 self.resols2 = self.ordering = self.split = self._str = None
1100 def run(self, y, x, idx=None, split=None, drop=None,
1101 alternative="increasing", attach=True):
1102 """
1103 .. deprecated:: 0.11
1105 Use het_goldfeldquant instead.
1107 See Also
1108 --------
1109 het_goldfeldquant
1110 Goldfeld-Quant heteroskedasticity test.
1111 """
1112 res = het_goldfeldquandt(y, x, idx=idx, split=split, drop=drop,
1113 alternative=alternative, store=attach)
1114 if attach:
1115 store = res[-1]
1116 self.__doc__ = store.__doc__
1117 self.fval = store.fval
1118 self.fpval = store.fpval
1119 self.df_fval = store.df_fval
1120 self.resols1 = store.resols1
1121 self.resols2 = store.resols2
1122 self.ordering = store.ordering
1123 self.split = store.split
1125 return res[:3]
1127 def __call__(self, y, x, idx=None, split=None, drop=None,
1128 alternative="increasing"):
1129 return self.run(y, x, idx=idx, split=split, drop=drop,
1130 attach=False, alternative=alternative)
1133@deprecate_kwarg("result", "res")
1134def linear_reset(res, power=3, test_type="fitted", use_f=False,
1135 cov_type="nonrobust", cov_kwargs=None):
1136 r"""
1137 Ramsey's RESET test for neglected nonlinearity
1139 Parameters
1140 ----------
1141 res : RegressionResults
1142 A results instance from a linear regression.
1143 power : {int, List[int]}, default 3
1144 The maximum power to include in the model, if an integer. Includes
1145 powers 2, 3, ..., power. If an list of integers, includes all powers
1146 in the list.
1147 test_type : str, default "fitted"
1148 The type of augmentation to use:
1150 * "fitted" : (default) Augment regressors with powers of fitted values.
1151 * "exog" : Augment exog with powers of exog. Excludes binary
1152 regressors.
1153 * "princomp": Augment exog with powers of first principal component of
1154 exog.
1155 use_f : bool, default False
1156 Flag indicating whether an F-test should be used (True) or a
1157 chi-square test (False).
1158 cov_type : str, default "nonrobust
1159 Covariance type. The default is "nonrobust` which uses the classic
1160 OLS covariance estimator. Specify one of "HC0", "HC1", "HC2", "HC3"
1161 to use White's covariance estimator. All covariance types supported
1162 by ``OLS.fit`` are accepted.
1163 cov_kwargs : dict, default None
1164 Dictionary of covariance options passed to ``OLS.fit``. See OLS.fit
1165 for more details.
1167 Returns
1168 -------
1169 ContrastResults
1170 Test results for Ramsey's Reset test. See notes for implementation
1171 details.
1173 Notes
1174 -----
1175 The RESET test uses an augmented regression of the form
1177 .. math::
1179 Y = X\beta + Z\gamma + \epsilon
1181 where :math:`Z` are a set of regressors that are one of:
1183 * Powers of :math:`X\hat{\beta}` from the original regression.
1184 * Powers of :math:`X`, excluding the constant and binary regressors.
1185 * Powers of the first principal component of :math:`X`. If the
1186 model includes a constant, this column is dropped before computing
1187 the principal component. In either case, the principal component
1188 is extracted from the correlation matrix of remaining columns.
1190 The test is a Wald test of the null :math:`H_0:\gamma=0`. If use_f
1191 is True, then the quadratic-form test statistic is divided by the
1192 number of restrictions and the F distribution is used to compute
1193 the critical value.
1194 """
1195 if not isinstance(res, RegressionResultsWrapper):
1196 raise TypeError("result must come from a linear regression model")
1197 if bool(res.model.k_constant) and res.model.exog.shape[1] == 1:
1198 raise ValueError("exog contains only a constant column. The RESET "
1199 "test requires exog to have at least 1 "
1200 "non-constant column.")
1201 test_type = string_like(test_type, "test_type",
1202 options=("fitted", "exog", "princomp"))
1203 cov_kwargs = dict_like(cov_kwargs, "cov_kwargs", optional=True)
1204 use_f = bool_like(use_f, "use_f")
1205 if isinstance(power, int):
1206 if power < 2:
1207 raise ValueError("power must be >= 2")
1208 power = np.arange(2, power + 1, dtype=np.int)
1209 else:
1210 try:
1211 power = np.array(power, dtype=np.int)
1212 except Exception:
1213 raise ValueError("power must be an integer or list of integers")
1214 if power.ndim != 1 or len(set(power)) != power.shape[0] or \
1215 (power < 2).any():
1216 raise ValueError("power must contains distinct integers all >= 2")
1217 exog = res.model.exog
1218 if test_type == "fitted":
1219 aug = res.fittedvalues[:, None]
1220 elif test_type == "exog":
1221 # Remove constant and binary
1222 aug = res.model.exog
1223 binary = ((exog == exog.max(axis=0)) | (exog == exog.min(axis=0)))
1224 binary = binary.all(axis=0)
1225 if binary.all():
1226 raise ValueError("Model contains only constant or binary data")
1227 aug = aug[:, ~binary]
1228 else:
1229 from statsmodels.multivariate.pca import PCA
1230 aug = exog
1231 if res.k_constant:
1232 retain = np.arange(aug.shape[1]).tolist()
1233 retain.pop(int(res.model.data.const_idx))
1234 aug = aug[:, retain]
1235 pca = PCA(aug, ncomp=1, standardize=bool(res.k_constant),
1236 demean=bool(res.k_constant), method="nipals")
1237 aug = pca.factors[:, :1]
1238 aug_exog = np.hstack([exog] + [aug ** p for p in power])
1239 mod_class = res.model.__class__
1240 mod = mod_class(res.model.data.endog, aug_exog)
1241 cov_kwargs = {} if cov_kwargs is None else cov_kwargs
1242 res = mod.fit(cov_type=cov_type, cov_kwargs=cov_kwargs)
1243 nrestr = aug_exog.shape[1] - exog.shape[1]
1244 nparams = aug_exog.shape[1]
1245 r_mat = np.eye(nrestr, nparams, k=nparams-nrestr)
1246 return res.wald_test(r_mat, use_f=use_f)
1249def linear_harvey_collier(res, order_by=None):
1250 """
1251 Harvey Collier test for linearity
1253 The Null hypothesis is that the regression is correctly modeled as linear.
1255 Parameters
1256 ----------
1257 res : RegressionResults
1258 A results instance from a linear regression.
1259 order_by : array_like, default None
1260 Integer array specifying the order of the residuals. If not provided,
1261 the order of the residuals is not changed. If provided, must have
1262 the same number of observations as the endogenous variable.
1264 Returns
1265 -------
1266 tvalue : float
1267 The test statistic, based on ttest_1sample.
1268 pvalue : float
1269 The pvalue of the test.
1271 Notes
1272 -----
1273 This test is a t-test that the mean of the recursive ols residuals is zero.
1274 Calculating the recursive residuals might take some time for large samples.
1275 """
1276 # I think this has different ddof than
1277 # B.H. Baltagi, Econometrics, 2011, chapter 8
1278 # but it matches Gretl and R:lmtest, pvalue at decimal=13
1279 rr = recursive_olsresiduals(res, skip=3, alpha=0.95, order_by=order_by)
1281 return stats.ttest_1samp(rr[3][3:], 0)
1284def linear_rainbow(res, frac=0.5, order_by=None, use_distance=False,
1285 center=None):
1286 """
1287 Rainbow test for linearity
1289 The null hypothesis is the fit of the model using full sample is the same
1290 as using a central subset. The alternative is that the fits are difference.
1291 The rainbow test has power against many different forms of nonlinearity.
1293 Parameters
1294 ----------
1295 res : RegressionResults
1296 A results instance from a linear regression.
1297 frac : float, default 0.5
1298 The fraction of the data to include in the center model.
1299 order_by : {ndarray, str, List[str]}, default None
1300 If an ndarray, the values in the array are used to sort the
1301 observations. If a string or a list of strings, these are interpreted
1302 as column name(s) which are then used to lexicographically sort the
1303 data.
1304 use_distance : bool, default False
1305 Flag indicating whether data should be ordered by the Mahalanobis
1306 distance to the center.
1307 center : {float, int}, default None
1308 If a float, the value must be in [0, 1] and the center is center *
1309 nobs of the ordered data. If an integer, must be in [0, nobs) and
1310 is interpreted as the observation of the ordered data to use.
1312 Returns
1313 -------
1314 fstat : float
1315 The test statistic based on the F test.
1316 pvalue : float
1317 The pvalue of the test.
1319 Notes
1320 -----
1321 This test assumes residuals are homoskedastic and may reject a correct
1322 linear specification if the residuals are heteroskedastic.
1323 """
1324 if not isinstance(res, RegressionResultsWrapper):
1325 raise TypeError("res must be a results instance from a linear model.")
1326 frac = float_like(frac, "frac")
1328 use_distance = bool_like(use_distance, "use_distance")
1329 nobs = res.nobs
1330 endog = res.model.endog
1331 exog = res.model.exog
1332 if order_by is not None and use_distance:
1333 raise ValueError("order_by and use_distance cannot be simultaneously"
1334 "used.")
1335 if order_by is not None:
1336 if isinstance(order_by, np.ndarray):
1337 order_by = array_like(order_by, "order_by", ndim=1, dtype="int")
1338 else:
1339 if isinstance(order_by, str):
1340 order_by = [order_by]
1341 try:
1342 cols = res.model.data.orig_exog[order_by].copy()
1343 except (IndexError, KeyError):
1344 raise TypeError("order_by must contain valid column names "
1345 "from the exog data used to construct res,"
1346 "and exog must be a pandas DataFrame.")
1347 name = "__index__"
1348 while name in cols:
1349 name += '_'
1350 cols[name] = np.arange(cols.shape[0])
1351 cols = cols.sort_values(order_by)
1352 order_by = np.asarray(cols[name])
1353 endog = endog[order_by]
1354 exog = exog[order_by]
1355 if use_distance:
1356 center = int(nobs) // 2 if center is None else center
1357 if isinstance(center, float):
1358 if not 0.0 <= center <= 1.0:
1359 raise ValueError("center must be in (0, 1) when a float.")
1360 center = int(center * (nobs-1))
1361 else:
1362 center = int_like(center, "center")
1363 if not 0 < center < nobs - 1:
1364 raise ValueError("center must be in [0, nobs) when an int.")
1365 center_obs = exog[center:center+1]
1366 from scipy.spatial.distance import cdist
1367 try:
1368 err = exog - center_obs
1369 vi = np.linalg.inv(err.T @ err / nobs)
1370 except np.linalg.LinAlgError:
1371 err = exog - exog.mean(0)
1372 vi = np.linalg.inv(err.T @ err / nobs)
1373 dist = cdist(exog, center_obs, metric='mahalanobis', VI=vi)
1374 idx = np.argsort(dist.ravel())
1375 endog = endog[idx]
1376 exog = exog[idx]
1378 lowidx = np.ceil(0.5 * (1 - frac) * nobs).astype(int)
1379 uppidx = np.floor(lowidx + frac * nobs).astype(int)
1380 if uppidx - lowidx < exog.shape[1]:
1381 raise ValueError("frac is too small to perform test. frac * nobs"
1382 "must be greater than the number of exogenous"
1383 "variables in the model.")
1384 mi_sl = slice(lowidx, uppidx)
1385 res_mi = OLS(endog[mi_sl], exog[mi_sl]).fit()
1386 nobs_mi = res_mi.model.endog.shape[0]
1387 ss_mi = res_mi.ssr
1388 ss = res.ssr
1389 fstat = (ss - ss_mi) / (nobs - nobs_mi) / ss_mi * res_mi.df_resid
1390 pval = stats.f.sf(fstat, nobs - nobs_mi, res_mi.df_resid)
1391 return fstat, pval
1394def linear_lm(resid, exog, func=None):
1395 """
1396 Lagrange multiplier test for linearity against functional alternative
1398 # TODO: Remove the restriction
1399 limitations: Assumes currently that the first column is integer.
1400 Currently it does not check whether the transformed variables contain NaNs,
1401 for example log of negative number.
1403 Parameters
1404 ----------
1405 resid : ndarray
1406 residuals of a regression
1407 exog : ndarray
1408 exogenous variables for which linearity is tested
1409 func : callable, default None
1410 If func is None, then squares are used. func needs to take an array
1411 of exog and return an array of transformed variables.
1413 Returns
1414 -------
1415 lm : float
1416 Lagrange multiplier test statistic
1417 lm_pval : float
1418 p-value of Lagrange multiplier tes
1419 ftest : ContrastResult instance
1420 the results from the F test variant of this test
1422 Notes
1423 -----
1424 Written to match Gretl's linearity test. The test runs an auxiliary
1425 regression of the residuals on the combined original and transformed
1426 regressors. The Null hypothesis is that the linear specification is
1427 correct.
1428 """
1429 if func is None:
1430 def func(x):
1431 return np.power(x, 2)
1433 exog_aux = np.column_stack((exog, func(exog[:, 1:])))
1435 nobs, k_vars = exog.shape
1436 ls = OLS(resid, exog_aux).fit()
1437 ftest = ls.f_test(np.eye(k_vars - 1, k_vars * 2 - 1, k_vars))
1438 lm = nobs * ls.rsquared
1439 lm_pval = stats.chi2.sf(lm, k_vars - 1)
1440 return lm, lm_pval, ftest
1443def spec_white(resid, exog):
1444 """
1445 White's Two-Moment Specification Test
1447 Parameters
1448 ----------
1449 resid : array_like
1450 OLS residuals.
1451 exog : array_like
1452 OLS design matrix.
1454 Returns
1455 -------
1456 stat : float
1457 The test statistic.
1458 pval : float
1459 A chi-square p-value for test statistic.
1460 dof : int
1461 The degrees of freedom.
1463 See Also
1464 --------
1465 het_white
1466 White's test for heteroskedasticity.
1468 Notes
1469 -----
1470 Implements the two-moment specification test described by White's
1471 Theorem 2 (1980, p. 823) which compares the standard OLS covariance
1472 estimator with White's heteroscedasticity-consistent estimator. The
1473 test statistic is shown to be chi-square distributed.
1475 Null hypothesis is homoscedastic and correctly specified.
1477 Assumes the OLS design matrix contains an intercept term and at least
1478 one variable. The intercept is removed to calculate the test statistic.
1480 Interaction terms (squares and crosses of OLS regressors) are added to
1481 the design matrix to calculate the test statistic.
1483 Degrees-of-freedom (full rank) = nvar + nvar * (nvar + 1) / 2
1485 Linearly dependent columns are removed to avoid singular matrix error.
1487 References
1488 ----------
1489 .. [*] White, H. (1980). A heteroskedasticity-consistent covariance matrix
1490 estimator and a direct test for heteroscedasticity. Econometrica, 48:
1491 817-838.
1492 """
1493 x = array_like(exog, "exog", ndim=2)
1494 e = array_like(resid, "resid", ndim=1)
1495 if x.shape[1] < 2 or not np.any(np.ptp(x, 0) == 0.0):
1496 raise ValueError("White's specification test requires at least two"
1497 "columns where one is a constant.")
1499 # add interaction terms
1500 i0, i1 = np.triu_indices(x.shape[1])
1501 exog = np.delete(x[:, i0] * x[:, i1], 0, 1)
1503 # collinearity check - see _fit_collinear
1504 atol = 1e-14
1505 rtol = 1e-13
1506 tol = atol + rtol * exog.var(0)
1507 r = np.linalg.qr(exog, mode="r")
1508 mask = np.abs(r.diagonal()) < np.sqrt(tol)
1509 exog = exog[:, np.where(~mask)[0]]
1511 # calculate test statistic
1512 sqe = e * e
1513 sqmndevs = sqe - np.mean(sqe)
1514 d = np.dot(exog.T, sqmndevs)
1515 devx = exog - np.mean(exog, axis=0)
1516 devx *= sqmndevs[:, None]
1517 b = devx.T.dot(devx)
1518 stat = d.dot(np.linalg.solve(b, d))
1520 # chi-square test
1521 dof = devx.shape[1]
1522 pval = stats.chi2.sf(stat, dof)
1523 return stat, pval, dof
1526@deprecate_kwarg("olsresults", "res")
1527def recursive_olsresiduals(res, skip=None, lamda=0.0, alpha=0.95,
1528 order_by=None):
1529 """
1530 Calculate recursive ols with residuals and Cusum test statistic
1532 Parameters
1533 ----------
1534 res : RegressionResults
1535 Results from estimation of a regression model.
1536 skip : int, default None
1537 The number of observations to use for initial OLS, if None then skip is
1538 set equal to the number of regressors (columns in exog).
1539 lamda : float, default 0.0
1540 The weight for Ridge correction to initial (X'X)^{-1}.
1541 alpha : {0.90, 0.95, 0.99}, default 0.95
1542 Confidence level of test, currently only two values supported,
1543 used for confidence interval in cusum graph.
1544 order_by : array_like, default None
1545 Integer array specifying the order of the residuals. If not provided,
1546 the order of the residuals is not changed. If provided, must have
1547 the same number of observations as the endogenous variable.
1549 Returns
1550 -------
1551 rresid : ndarray
1552 The recursive ols residuals.
1553 rparams : ndarray
1554 The recursive ols parameter estimates.
1555 rypred : ndarray
1556 The recursive prediction of endogenous variable.
1557 rresid_standardized : ndarray
1558 The recursive residuals standardized so that N(0,sigma2) distributed,
1559 where sigma2 is the error variance.
1560 rresid_scaled : ndarray
1561 The recursive residuals normalize so that N(0,1) distributed.
1562 rcusum : ndarray
1563 The cumulative residuals for cusum test.
1564 rcusumci : ndarray
1565 The confidence interval for cusum test using a size of alpha.
1567 Notes
1568 -----
1569 It produces same recursive residuals as other version. This version updates
1570 the inverse of the X'X matrix and does not require matrix inversion during
1571 updating. looks efficient but no timing
1573 Confidence interval in Greene and Brown, Durbin and Evans is the same as
1574 in Ploberger after a little bit of algebra.
1576 References
1577 ----------
1578 jplv to check formulas, follows Harvey
1579 BigJudge 5.5.2b for formula for inverse(X'X) updating
1580 Greene section 7.5.2
1582 Brown, R. L., J. Durbin, and J. M. Evans. “Techniques for Testing the
1583 Constancy of Regression Relationships over Time.”
1584 Journal of the Royal Statistical Society. Series B (Methodological) 37,
1585 no. 2 (1975): 149-192.
1586 """
1587 y = res.model.endog
1588 x = res.model.exog
1589 order_by = array_like(order_by, "order_by", dtype="int", optional=True,
1590 ndim=1, shape=(y.shape[0],))
1591 # intialize with skip observations
1592 if order_by is not None:
1593 x = x[order_by]
1594 y = y[order_by]
1596 nobs, nvars = x.shape
1597 if skip is None:
1598 skip = nvars
1599 rparams = np.nan * np.zeros((nobs, nvars))
1600 rresid = np.nan * np.zeros(nobs)
1601 rypred = np.nan * np.zeros(nobs)
1602 rvarraw = np.nan * np.zeros(nobs)
1604 x0 = x[:skip]
1605 y0 = y[:skip]
1606 # add Ridge to start (not in jplv
1607 xtxi = np.linalg.inv(np.dot(x0.T, x0) + lamda * np.eye(nvars))
1608 xty = np.dot(x0.T, y0) # xi * y #np.dot(xi, y)
1609 beta = np.dot(xtxi, xty)
1610 rparams[skip - 1] = beta
1611 yipred = np.dot(x[skip - 1], beta)
1612 rypred[skip - 1] = yipred
1613 rresid[skip - 1] = y[skip - 1] - yipred
1614 rvarraw[skip - 1] = 1 + np.dot(x[skip - 1], np.dot(xtxi, x[skip - 1]))
1615 for i in range(skip, nobs):
1616 xi = x[i:i + 1, :]
1617 yi = y[i]
1619 # get prediction error with previous beta
1620 yipred = np.dot(xi, beta)
1621 rypred[i] = yipred
1622 residi = yi - yipred
1623 rresid[i] = residi
1625 # update beta and inverse(X'X)
1626 tmp = np.dot(xtxi, xi.T)
1627 ft = 1 + np.dot(xi, tmp)
1629 xtxi = xtxi - np.dot(tmp, tmp.T) / ft # BigJudge equ 5.5.15
1631 beta = beta + (tmp * residi / ft).ravel() # BigJudge equ 5.5.14
1632 rparams[i] = beta
1633 rvarraw[i] = ft
1635 rresid_scaled = rresid / np.sqrt(rvarraw) # N(0,sigma2) distributed
1636 nrr = nobs - skip
1637 # sigma2 = rresid_scaled[skip-1:].var(ddof=1) #var or sum of squares ?
1638 # Greene has var, jplv and Ploberger have sum of squares (Ass.:mean=0)
1639 # Gretl uses: by reverse engineering matching their numbers
1640 sigma2 = rresid_scaled[skip:].var(ddof=1)
1641 rresid_standardized = rresid_scaled / np.sqrt(sigma2) # N(0,1) distributed
1642 rcusum = rresid_standardized[skip - 1:].cumsum()
1643 # confidence interval points in Greene p136 looks strange. Cleared up
1644 # this assumes sum of independent standard normal, which does not take into
1645 # account that we make many tests at the same time
1646 if alpha == 0.90:
1647 a = 0.850
1648 elif alpha == 0.95:
1649 a = 0.948
1650 elif alpha == 0.99:
1651 a = 1.143
1652 else:
1653 raise ValueError("alpha can only be 0.9, 0.95 or 0.99")
1655 # following taken from Ploberger,
1656 # crit = a * np.sqrt(nrr)
1657 rcusumci = (a * np.sqrt(nrr) + 2 * a * np.arange(0, nobs - skip) / np.sqrt(
1658 nrr)) * np.array([[-1.], [+1.]])
1659 return (rresid, rparams, rypred, rresid_standardized, rresid_scaled,
1660 rcusum, rcusumci)
1663def breaks_hansen(olsresults):
1664 """
1665 Test for model stability, breaks in parameters for ols, Hansen 1992
1667 Parameters
1668 ----------
1669 olsresults : RegressionResults
1670 Results from estimation of a regression model.
1672 Returns
1673 -------
1674 teststat : float
1675 Hansen's test statistic.
1676 crit : ndarray
1677 The critical values at alpha=0.95 for different nvars.
1679 Notes
1680 -----
1681 looks good in example, maybe not very powerful for small changes in
1682 parameters
1684 According to Greene, distribution of test statistics depends on nvar but
1685 not on nobs.
1687 Test statistic is verified against R:strucchange
1689 References
1690 ----------
1691 Greene section 7.5.1, notation follows Greene
1692 """
1693 x = olsresults.model.exog
1694 resid = array_like(olsresults.resid, "resid", shape=(x.shape[0], 1))
1695 nobs, nvars = x.shape
1696 resid2 = resid ** 2
1697 ft = np.c_[x * resid[:, None], (resid2 - resid2.mean())]
1698 score = ft.cumsum(0)
1699 f = nobs * (ft[:, :, None] * ft[:, None, :]).sum(0)
1700 s = (score[:, :, None] * score[:, None, :]).sum(0)
1701 h = np.trace(np.dot(np.linalg.inv(f), s))
1702 crit95 = np.array([(2, 1.9), (6, 3.75), (15, 3.75), (19, 4.52)],
1703 dtype=[("nobs", int), ("crit", float)])
1704 # TODO: get critical values from Bruce Hansen's 1992 paper
1705 return h, crit95
1708def breaks_cusumolsresid(resid, ddof=0):
1709 """
1710 Cusum test for parameter stability based on ols residuals.
1712 Parameters
1713 ----------
1714 resid : ndarray
1715 An array of residuals from an OLS estimation.
1716 ddof : int
1717 The number of parameters in the OLS estimation, used as degrees
1718 of freedom correction for error variance.
1720 Returns
1721 -------
1722 sup_b : float
1723 The test statistic, maximum of absolute value of scaled cumulative OLS
1724 residuals.
1725 pval : float
1726 Probability of observing the data under the null hypothesis of no
1727 structural change, based on asymptotic distribution which is a Brownian
1728 Bridge
1729 crit: list
1730 The tabulated critical values, for alpha = 1%, 5% and 10%.
1732 Notes
1733 -----
1734 Tested against R:structchange.
1736 Not clear: Assumption 2 in Ploberger, Kramer assumes that exog x have
1737 asymptotically zero mean, x.mean(0) = [1, 0, 0, ..., 0]
1738 Is this really necessary? I do not see how it can affect the test statistic
1739 under the null. It does make a difference under the alternative.
1740 Also, the asymptotic distribution of test statistic depends on this.
1742 From examples it looks like there is little power for standard cusum if
1743 exog (other than constant) have mean zero.
1745 References
1746 ----------
1747 Ploberger, Werner, and Walter Kramer. “The Cusum Test with OLS Residuals.”
1748 Econometrica 60, no. 2 (March 1992): 271-285.
1749 """
1750 resid = resid.ravel()
1751 nobs = len(resid)
1752 nobssigma2 = (resid ** 2).sum()
1753 if ddof > 0:
1754 nobssigma2 = nobssigma2 / (nobs - ddof) * nobs
1755 # b is asymptotically a Brownian Bridge
1756 b = resid.cumsum() / np.sqrt(nobssigma2) # use T*sigma directly
1757 # asymptotically distributed as standard Brownian Bridge
1758 sup_b = np.abs(b).max()
1759 crit = [(1, 1.63), (5, 1.36), (10, 1.22)]
1760 # Note stats.kstwobign.isf(0.1) is distribution of sup.abs of Brownian
1761 # Bridge
1762 # >>> stats.kstwobign.isf([0.01,0.05,0.1])
1763 # array([ 1.62762361, 1.35809864, 1.22384787])
1764 pval = stats.kstwobign.sf(sup_b)
1765 return sup_b, pval, crit
1767# def breaks_cusum(recolsresid):
1768# """renormalized cusum test for parameter stability based on recursive
1769# residuals
1770#
1771#
1772# still incorrect: in PK, the normalization for sigma is by T not T-K
1773# also the test statistic is asymptotically a Wiener Process, Brownian
1774# motion
1775# not Brownian Bridge
1776# for testing: result reject should be identical as in standard cusum
1777# version
1778#
1779# References
1780# ----------
1781# Ploberger, Werner, and Walter Kramer. “The Cusum Test with OLS Residuals.”
1782# Econometrica 60, no. 2 (March 1992): 271-285.
1783#
1784# """
1785# resid = recolsresid.ravel()
1786# nobssigma2 = (resid**2).sum()
1787# #B is asymptotically a Brownian Bridge
1788# B = resid.cumsum()/np.sqrt(nobssigma2) # use T*sigma directly
1789# nobs = len(resid)
1790# denom = 1. + 2. * np.arange(nobs)/(nobs-1.) #not sure about limits
1791# sup_b = np.abs(B/denom).max()
1792# #asymptotically distributed as standard Brownian Bridge
1793# crit = [(1,1.63), (5, 1.36), (10, 1.22)]
1794# #Note stats.kstwobign.isf(0.1) is distribution of sup.abs of Brownian
1795# Bridge
1796# #>>> stats.kstwobign.isf([0.01,0.05,0.1])
1797# #array([ 1.62762361, 1.35809864, 1.22384787])
1798# pval = stats.kstwobign.sf(sup_b)
1799# return sup_b, pval, crit