# -*- coding: utf-8 -*-
import copy
import datetime as dt
from collections.abc import Iterable
from types import SimpleNamespace

import numpy as np
import pandas as pd
from numpy.linalg import inv, slogdet
from scipy.stats import norm, gaussian_kde

import statsmodels.base.model as base
import statsmodels.base.wrapper as wrap
from statsmodels.compat.pandas import Appender, Substitution
from statsmodels.iolib.summary import Summary
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.decorators import cache_readonly, cache_writable
from statsmodels.tools.docstring import (Docstring, remove_parameters)
from statsmodels.tools.numdiff import approx_fprime, approx_hess
from statsmodels.tools.validation import array_like, string_like, bool_like, \
    int_like
from statsmodels.tsa.arima_process import arma2ma
from statsmodels.tsa.base import tsa_model
from statsmodels.tsa.kalmanf.kalmanfilter import KalmanFilter
from statsmodels.tsa.tsatools import (lagmat, add_trend, _ar_transparams,
                                      _ar_invtransparams, freq_to_period)
from statsmodels.tsa.vector_ar import util

__all__ = ['AR', 'AutoReg']

AR_DEPRECATION_WARN = """
statsmodels.tsa.AR has been deprecated in favor of statsmodels.tsa.AutoReg and
statsmodels.tsa.SARIMAX.

AutoReg adds the ability to specify exogenous variables, include time trends,
and add seasonal dummies. The AutoReg API differs from AR since the model is
treated as immutable, and so the entire specification including the lag
length must be specified when creating the model. This change is too
substantial to incorporate into the existing AR api. The function
ar_select_order performs lag length selection for AutoReg models.

AutoReg only estimates parameters using conditional MLE (OLS). Use SARIMAX to
estimate ARX and related models using full MLE via the Kalman Filter.

To silence this warning and continue using AR until it is removed, use:

import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.ar_model.AR', FutureWarning)
"""

REPEATED_FIT_ERROR = """
Model has been fit using maxlag={0}, method={1}, ic={2}, trend={3}. These
cannot be changed in subsequent calls to `fit`. Instead, use a new instance of
AR.
"""


def sumofsq(x, axis=0):
    """Helper function to calculate sum of squares along first axis"""
    return np.sum(x ** 2, axis=axis)


def _ar_predict_out_of_sample(y, params, k_ar, k_trend, steps, start=0):
    mu = params[:k_trend] if k_trend else 0  # only the constant matters
    arparams = params[k_trend:][::-1]  # reverse for dot

    # dynamic endogenous variable
    endog = np.zeros(k_ar + steps)  # this is one too big but does not matter
    if start:
        endog[:k_ar] = y[start - k_ar:start]
    else:
        endog[:k_ar] = y[-k_ar:]

    forecast = np.zeros(steps)
    for i in range(steps):
        fcast = mu + np.dot(arparams, endog[i:i + k_ar])
        forecast[i] = fcast
        endog[i + k_ar] = fcast

    return forecast


class AutoReg(tsa_model.TimeSeriesModel):
    """
    Autoregressive AR-X(p) model.

    Estimate an AR-X model using Conditional Maximum Likelihood (OLS).

    Parameters
    ----------
    endog : array_like
        A 1-d endogenous response variable. The dependent variable.
    lags : {int, list[int]}
        The number of lags to include in the model if an integer or the
        list of lag indices to include. For example, [1, 4] will only
        include lags 1 and 4 while lags=4 will include lags 1, 2, 3, and 4.
    trend : {'n', 'c', 't', 'ct'}
        The trend to include in the model:

        * 'n' - No trend.
        * 'c' - Constant only.
        * 't' - Time trend only.
        * 'ct' - Constant and time trend.

    seasonal : bool
        Flag indicating whether to include seasonal dummies in the model. If
        seasonal is True and trend includes 'c', then the first period
        is excluded from the seasonal terms.
    exog : array_like, optional
        Exogenous variables to include in the model. Must have the same number
        of observations as endog and should be aligned so that endog[i] is
        regressed on exog[i].
    hold_back : {None, int}
        Initial observations to exclude from the estimation sample. If None,
        then hold_back is equal to the maximum lag in the model. Set to a
        non-zero value to produce comparable models with different lag
        length. For example, to compare the fit of a model with lags=3 and
        lags=1, set hold_back=3 which ensures that both models are estimated
        using observations 3,...,nobs. hold_back must be >= the maximum lag
        in the model (see the final example below).
    period : {None, int}
        The period of the data. Only used if seasonal is True. This parameter
        can be omitted if using a pandas object for endog that contains a
        recognized frequency.
    missing : str
        Available options are 'none', 'drop', and 'raise'. If 'none', no nan
        checking is done. If 'drop', any observations with nans are dropped.
        If 'raise', an error is raised. Default is 'none'.

    See Also
    --------
    statsmodels.tsa.statespace.sarimax.SARIMAX
        Estimation of SARIMAX models using exact likelihood and the
        Kalman Filter.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.tsa.ar_model import AutoReg
    >>> data = sm.datasets.sunspots.load_pandas().data['SUNACTIVITY']
    >>> out = 'AIC: {0:0.3f}, HQIC: {1:0.3f}, BIC: {2:0.3f}'

    Start by fitting an unrestricted Seasonal AR model

    >>> res = AutoReg(data, lags=[1, 11, 12]).fit()
    >>> print(out.format(res.aic, res.hqic, res.bic))
    AIC: 5.945, HQIC: 5.970, BIC: 6.007

    An alternative uses seasonal dummies

    >>> res = AutoReg(data, lags=1, seasonal=True, period=11).fit()
    >>> print(out.format(res.aic, res.hqic, res.bic))
    AIC: 6.017, HQIC: 6.080, BIC: 6.175

    Finally, both the seasonal AR structure and dummies can be included

    >>> res = AutoReg(data, lags=[1, 11, 12], seasonal=True, period=11).fit()
    >>> print(out.format(res.aic, res.hqic, res.bic))
    AIC: 5.884, HQIC: 5.959, BIC: 6.071
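
    To compare specifications with different lag lengths, ``hold_back`` can
    fix the first estimation observation. As a sketch of this usage (results
    omitted), both fits below use observations 3, ..., nobs:

    >>> res1 = AutoReg(data, lags=1, hold_back=3).fit()
    >>> res3 = AutoReg(data, lags=3, hold_back=3).fit()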
160 """
162 def __init__(self, endog, lags, trend='c', seasonal=False, exog=None,
163 hold_back=None, period=None, missing='none'):
164 super(AutoReg, self).__init__(endog, exog, None, None,
165 missing=missing)
166 self._trend = string_like(trend, 'trend',
167 options=('n', 'c', 't', 'ct'))
168 self._seasonal = bool_like(seasonal, 'seasonal')
169 self._period = int_like(period, 'period', optional=True)
170 if self._period is None and self._seasonal:
171 if self.data.freq:
172 self._period = freq_to_period(self._index_freq)
173 else:
174 err = 'freq cannot be inferred from endog and model includes' \
175 ' seasonal terms. The number of periods must be ' \
176 'explicitly set when the endog\'s index does not ' \
177 'contain a frequency.'
178 raise ValueError(err)
179 self._lags = lags
180 self._exog_names = []
181 self._k_ar = 0
182 self._hold_back = int_like(hold_back, 'hold_back', optional=True)
183 self._check_lags()
184 self._setup_regressors()
185 self.nobs = self._y.shape[0]
186 self.data.xnames = self.exog_names

    @property
    def ar_lags(self):
        """The autoregressive lags included in the model"""
        return self._lags

    @property
    def hold_back(self):
        """The number of initial obs. excluded from the estimation sample."""
        return self._hold_back

    @property
    def seasonal(self):
        """Flag indicating that the model contains a seasonal component."""
        return self._seasonal

    @property
    def df_model(self):
        """The model degrees of freedom."""
        return self._x.shape[1]

    @property
    def exog_names(self):
        """Names of exogenous variables included in model"""
        return self._exog_names

    def initialize(self):
        """Initialize the model (no-op)."""
        pass

    def _check_lags(self):
        lags = self._lags
        if isinstance(lags, Iterable):
            lags = np.array(sorted([int_like(lag, 'lags') for lag in lags]))
            self._lags = lags
            if np.any(lags < 1) or np.unique(lags).shape[0] != lags.shape[0]:
                raise ValueError('All values in lags must be positive and '
                                 'distinct.')
            self._maxlag = np.max(lags)
        else:
            self._maxlag = int_like(lags, 'lags')
            if self._maxlag < 0:
                raise ValueError('lags must be a non-negative integer.')
            self._lags = np.arange(1, self._maxlag + 1)
        if self._hold_back is None:
            self._hold_back = self._maxlag
        if self._hold_back < self._maxlag:
            raise ValueError('hold_back must be >= lags if lags is an int or '
                             'max(lags) if lags is array_like.')

    def _setup_regressors(self):
        maxlag = self._maxlag
        hold_back = self._hold_back
        exog_names = []
        endog_names = self.endog_names
        x, y = lagmat(self.endog, maxlag, original='sep')
        exog_names.extend([endog_names + '.L{0}'.format(lag)
                           for lag in self._lags])
        if len(self._lags) < maxlag:
            x = x[:, self._lags - 1]
        self._k_ar = x.shape[1]
        if self._seasonal:
            nobs, period = self.endog.shape[0], self._period
            season_names = ['seasonal.{0}'.format(i) for i in range(period)]
            dummies = np.zeros((nobs, period))
            for i in range(period):
                dummies[i::period, i] = 1
            if 'c' in self._trend:
                dummies = dummies[:, 1:]
                season_names = season_names[1:]
            x = np.c_[dummies, x]
            exog_names = season_names + exog_names
        x = add_trend(x, trend=self._trend, prepend=True)
        if 't' in self._trend:
            exog_names.insert(0, 'trend')
        if 'c' in self._trend:
            exog_names.insert(0, 'intercept')
        if self.exog is not None:
            x = np.c_[x, self.exog]
            exog_names.extend(self.data.param_names)
        y = y[hold_back:]
        x = x[hold_back:]
        if y.shape[0] < x.shape[1]:
            reg = x.shape[1]
            trend = 0 if self._trend == 'n' else len(self._trend)
            seas = 0 if not self._seasonal else period - ('c' in self._trend)
            lags = self._lags.shape[0]
            nobs = y.shape[0]
            raise ValueError('The model specification cannot be estimated. '
                             'The model contains {0} regressors ({1} trend, '
                             '{2} seasonal, {3} lags) but after adjustment '
                             'for hold_back and creation of the lags, there '
                             'are only {4} data points available to estimate '
                             'parameters.'.format(reg, trend, seas, lags,
                                                  nobs))
        self._y, self._x = y, x
        self._exog_names = exog_names

    def fit(self, cov_type='nonrobust', cov_kwds=None, use_t=False):
        """
        Estimate the model parameters.

        Parameters
        ----------
        cov_type : str
            The covariance estimator to use. The most common choices are
            listed below. Supports all covariance estimators that are
            available in ``OLS.fit``.

            * 'nonrobust' - The classic OLS covariance estimator that assumes
              homoskedasticity.
            * 'HC0', 'HC1', 'HC2', 'HC3' - Variants of White's
              (or Eicker-Huber-White) covariance estimator. `HC0` is the
              standard implementation. The others make corrections to improve
              the finite sample performance of the heteroskedasticity robust
              covariance estimator.
            * 'HAC' - Heteroskedasticity-autocorrelation robust covariance
              estimation. Supports cov_kwds.

              - `maxlags` integer (required) : number of lags to use.
              - `kernel` callable or str (optional) : kernel
                currently available kernels are ['bartlett', 'uniform'],
                default is Bartlett.
              - `use_correction` bool (optional) : If True, use small sample
                correction.
        cov_kwds : dict, optional
            A dictionary of keyword arguments to pass to the covariance
            estimator. `nonrobust` and `HC#` do not support cov_kwds.
        use_t : bool, optional
            A flag indicating that inference should use the Student's t
            distribution that accounts for model degree of freedom. If False,
            uses the normal distribution. If None, defers the choice to
            the cov_type. It also removes degree of freedom corrections from
            the covariance estimator when cov_type is 'nonrobust'.

        Returns
        -------
        AutoRegResults
            Estimation results.

        See Also
        --------
        statsmodels.regression.linear_model.OLS
            Ordinary Least Squares estimation.
        statsmodels.regression.linear_model.RegressionResults
            See ``get_robustcov_results`` for a detailed list of available
            covariance estimators and options.

        Notes
        -----
        Use ``OLS`` to estimate model parameters and to estimate parameter
        covariance.
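
        Examples
        --------
        A minimal sketch of fitting with HAC standard errors, assuming
        ``data`` is a univariate pandas Series (hypothetical):

        >>> from statsmodels.tsa.ar_model import AutoReg
        >>> res = AutoReg(data, lags=2).fit(cov_type='HAC',
        ...                                 cov_kwds={'maxlags': 12})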
339 """
340 # TODO: Determine correction for degree-of-freedom
341 # Special case parameterless model
342 if self._x.shape[1] == 0:
343 return AutoRegResultsWrapper(AutoRegResults(self,
344 np.empty(0),
345 np.empty((0, 0))))
347 ols_mod = OLS(self._y, self._x)
348 ols_res = ols_mod.fit(cov_type=cov_type, cov_kwds=cov_kwds,
349 use_t=use_t)
350 cov_params = ols_res.cov_params()
351 use_t = ols_res.use_t
352 if cov_type == "nonrobust" and not use_t:
353 nobs = self._y.shape[0]
354 k = self._x.shape[1]
355 scale = nobs / (nobs - k)
356 cov_params /= scale
357 res = AutoRegResults(self, ols_res.params, cov_params,
358 ols_res.normalized_cov_params)
360 return AutoRegResultsWrapper(res)
362 def _resid(self, params):
363 params = array_like(params, 'params', ndim=2)
364 resid = self._y - self._x @ params
365 return resid.squeeze()

    def loglike(self, params):
        """
        Log-likelihood of model.

        Parameters
        ----------
        params : ndarray
            The model parameters used to compute the log-likelihood.

        Returns
        -------
        float
            The log-likelihood value.
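
        Notes
        -----
        The log-likelihood is the Gaussian log-likelihood with the error
        variance concentrated out, matching the computation below:
        ``-nobs / 2 * (log(2 * pi) + log(ssr / nobs) + 1)``, where ``ssr``
        is the sum of squared residuals.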
380 """
381 nobs = self.nobs
382 resid = self._resid(params)
383 ssr = resid @ resid
384 llf = -(nobs / 2) * (np.log(2 * np.pi) + np.log(ssr / nobs) + 1)
385 return llf
387 def score(self, params):
388 """
389 Score vector of model.
391 The gradient of logL with respect to each parameter.
393 Parameters
394 ----------
395 params : ndarray
396 The parameters to use when evaluating the Hessian.
398 Returns
399 -------
400 ndarray
401 The score vector evaluated at the parameters.
402 """
403 resid = self._resid(params)
404 return self._x.T @ resid

    def information(self, params):
        """
        Fisher information matrix of model.

        Returns -1 * Hessian of the log-likelihood evaluated at params.

        Parameters
        ----------
        params : ndarray
            The model parameters.

        Returns
        -------
        ndarray
            The information matrix.
        """
        resid = self._resid(params)
        sigma2 = resid @ resid / self.nobs
        return sigma2 * (self._x.T @ self._x)

    def hessian(self, params):
        """
        The Hessian matrix of the model.

        Parameters
        ----------
        params : ndarray
            The parameters to use when evaluating the Hessian.

        Returns
        -------
        ndarray
            The hessian evaluated at the parameters.
        """
        return -self.information(params)

    def _setup_oos_forecast(self, add_forecasts, exog_oos):
        full_nobs = self.data.endog.shape[0]
        x = np.zeros((add_forecasts, self._x.shape[1]))
        loc = 0
        if 'c' in self._trend:
            x[:, 0] = 1
            loc += 1
        if 't' in self._trend:
            x[:, loc] = np.arange(full_nobs + 1, full_nobs + add_forecasts + 1)
            loc += 1
        if self.seasonal:
            seasonal = np.zeros((add_forecasts, self._period))
            period = self._period
            col = full_nobs % period
            for i in range(period):
                seasonal[i::period, (col + i) % period] = 1
            if 'c' in self._trend:
                x[:, loc:loc + period - 1] = seasonal[:, 1:]
                loc += seasonal.shape[1] - 1
            else:
                x[:, loc:loc + period] = seasonal
                loc += seasonal.shape[1]
        # skip the AR columns
        loc += len(self._lags)
        if self.exog is not None:
            x[:, loc:] = exog_oos[:add_forecasts]
        return x

    def _wrap_prediction(self, prediction, start, end):
        if not isinstance(self.data.orig_endog, (pd.Series, pd.DataFrame)):
            return prediction
        index = self.data.orig_endog.index
        if end > self.endog.shape[0]:
            freq = getattr(index, 'freq', None)
            if freq:
                index = pd.date_range(index[0], freq=freq, periods=end)
            else:
                index = pd.RangeIndex(end)
        index = index[start:end]
        return pd.Series(prediction, index=index)

    def _dynamic_predict(self, params, start, end, dynamic, num_oos, exog,
                         exog_oos):
        """
        Dynamic prediction engine.

        Parameters
        ----------
        params : ndarray
            The model parameters.
        start : int
            Index of the first in-sample prediction.
        end : int
            Index of the last in-sample prediction (inclusive).
        dynamic : int
            Index of the first dynamically predicted observation.
        num_oos : int
            Number of out-of-sample forecasts.
        exog : ndarray, optional
            Array containing replacement exog values.
        exog_oos : ndarray, optional
            Array containing forecast exog values.
        """
        reg = []
        hold_back = self._hold_back
        if (start - hold_back) <= self.nobs:
            is_loc = slice(start - hold_back, end + 1 - hold_back)
            x = self._x[is_loc]
            if exog is not None:
                x = x.copy()
                # Replace final columns
                x[:, -exog.shape[1]:] = exog[start:end + 1]
            reg.append(x)
        if num_oos > 0:
            reg.append(self._setup_oos_forecast(num_oos, exog_oos))
        reg = np.vstack(reg)
        det_col_idx = self._x.shape[1] - len(self._lags)
        det_col_idx -= 0 if self.exog is None else self.exog.shape[1]
        # + 1 is due to inclusiveness of predict functions
        adj_dynamic = dynamic - start + 1
        forecasts = np.empty(reg.shape[0])
        forecasts[:adj_dynamic] = reg[:adj_dynamic] @ params
        for h in range(adj_dynamic, reg.shape[0]):
            # Fill in regressor matrix
            for j, lag in enumerate(self._lags):
                fcast_loc = h - lag
                if fcast_loc >= 0:
                    val = forecasts[fcast_loc]
                else:
                    # If before the start of the forecasts, use actual values
                    val = self.endog[start + fcast_loc]
                reg[h, det_col_idx + j] = val
            forecasts[h] = reg[h:h + 1] @ params
        return self._wrap_prediction(forecasts, start, end + 1 + num_oos)

    def _static_oos_predict(self, params, num_oos, exog_oos):
        new_x = self._setup_oos_forecast(num_oos, exog_oos)
        if self._maxlag == 0:
            return new_x @ params
        forecasts = np.empty(num_oos)
        nexog = 0 if self.exog is None else self.exog.shape[1]
        ar_offset = self._x.shape[1] - nexog - self._lags.shape[0]
        for i in range(num_oos):
            for j, lag in enumerate(self._lags):
                loc = i - lag
                val = self._y[loc] if loc < 0 else forecasts[loc]
                new_x[i, ar_offset + j] = val
            forecasts[i] = new_x[i:i + 1] @ params
        return forecasts

    def _static_predict(self, params, start, end, num_oos, exog, exog_oos):
        """
        Path for static predictions

        Parameters
        ----------
        start : int
            Index of first observation
        end : int
            Index of last in-sample observation. Inclusive, so start:end + 1
            in slice notation.
        num_oos : int
            Number of out-of-sample observations, so that the returned size is
            num_oos + (end - start + 1).
        exog : ndarray
            Array containing replacement exog values
        exog_oos : ndarray
            Array containing forecast exog values
        """
        hold_back = self._hold_back
        nobs = self.endog.shape[0]

        x = np.empty((0, self._x.shape[1]))
        if start <= nobs:
            is_loc = slice(start - hold_back, end + 1 - hold_back)
            x = self._x[is_loc]
            if exog is not None:
                x = x.copy()
                # Replace final columns
                x[:, -exog.shape[1]:] = exog[start:end + 1]
        in_sample = x @ params
        if num_oos == 0:  # No out of sample
            return self._wrap_prediction(in_sample, start, end + 1)

        out_of_sample = self._static_oos_predict(params, num_oos, exog_oos)

        prediction = np.hstack((in_sample, out_of_sample))
        return self._wrap_prediction(prediction, start, end + 1 + num_oos)

    def predict(self, params, start=None, end=None, dynamic=False, exog=None,
                exog_oos=None):
        """
        In-sample prediction and out-of-sample forecasting.

        Parameters
        ----------
        params : array_like
            The fitted model parameters.
        start : int, str, or datetime, optional
            Zero-indexed observation number at which to start forecasting,
            i.e., the first forecast is start. Can also be a date string to
            parse or a datetime type. Default is the zeroth observation.
        end : int, str, or datetime, optional
            Zero-indexed observation number at which to end forecasting, i.e.,
            the last forecast is end. Can also be a date string to
            parse or a datetime type. However, if the dates index does not
            have a fixed frequency, end must be an integer index if you
            want out-of-sample prediction. Default is the last observation in
            the sample. Unlike standard python slices, end is inclusive so
            that all the predictions [start, start+1, ..., end-1, end] are
            returned.
        dynamic : {bool, int, str, datetime, Timestamp}, optional
            Integer offset relative to `start` at which to begin dynamic
            prediction. Prior to this observation, true endogenous values
            will be used for prediction; starting with this observation and
            continuing through the end of prediction, forecasted endogenous
            values will be used instead. Datetime-like objects are not
            interpreted as offsets. They are instead used to find the index
            location of `dynamic` which is then used to compute the offset.
        exog : array_like
            A replacement exogenous array. Must have the same shape as the
            exogenous data array used when the model was created.
        exog_oos : array_like
            An array containing out-of-sample values of the exogenous
            variable. Must have the same number of columns as the exog used
            when the model was created, and at least as many rows as the
            number of out-of-sample forecasts.

        Returns
        -------
        array_like
            Array of in-sample predictions and / or out-of-sample
            forecasts. An (npredict x k_endog) array.
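
        Examples
        --------
        A sketch of dynamic prediction, assuming ``data`` is a univariate
        pandas Series (hypothetical); after the first predicted observation,
        forecasts replace the lagged endogenous values:

        >>> from statsmodels.tsa.ar_model import AutoReg
        >>> res = AutoReg(data, lags=2).fit()
        >>> pred = res.model.predict(res.params, dynamic=True)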
626 """
627 params = array_like(params, 'params')
628 exog = array_like(exog, 'exog', ndim=2, optional=True)
629 exog_oos = array_like(exog_oos, 'exog_oos', ndim=2, optional=True)
631 start = 0 if start is None else start
632 end = self._index[-1] if end is None else end
633 start, end, num_oos, _ = self._get_prediction_index(start, end)
634 start = max(start, self._hold_back)
635 if self.exog is None and (exog is not None or exog_oos is not None):
636 raise ValueError('exog and exog_oos cannot be used when the model '
637 'does not contains exogenous regressors.')
638 elif self.exog is not None:
639 if exog is not None and exog.shape != self.exog.shape:
640 msg = ('The shape of exog {0} must match the shape of the '
641 'exog variable used to create the model {1}.')
642 raise ValueError(msg.format(exog.shape, self.exog.shape))
643 if exog_oos is not None and \
644 exog_oos.shape[1] != self.exog.shape[1]:
645 msg = ('The number of columns in exog_oos ({0}) must match '
646 'the number of columns in the exog variable used to '
647 'create the model ({1}).')
648 raise ValueError(msg.format(exog_oos.shape[1],
649 self.exog.shape[1]))
650 if num_oos > 0 and exog_oos is None:
651 raise ValueError('exog_oos must be provided when producing '
652 'out-of-sample forecasts.')
653 elif exog_oos is not None and num_oos > exog_oos.shape[0]:
654 msg = ('start and end indicate that {0} out-of-sample '
655 'predictions must be computed. exog_oos has {1} rows '
656 'but must have at least {0}.')
657 raise ValueError(msg.format(num_oos, exog_oos.shape[0]))
659 if (isinstance(dynamic, bool) and not dynamic) or self._maxlag == 0:
660 # If model has no lags, static and dynamic are identical
661 return self._static_predict(params, start, end, num_oos,
662 exog, exog_oos)
664 if isinstance(dynamic, (str, bytes, pd.Timestamp, dt.datetime)):
665 dynamic, _, _ = self._get_index_loc(dynamic)
666 offset = dynamic - start
667 elif dynamic is True:
668 # if True, all forecasts are dynamic, except start
669 offset = 0
670 else:
671 offset = int(dynamic)
672 dynamic = start + offset
673 if dynamic < 0:
674 raise ValueError('Dynamic prediction cannot begin prior to the'
675 ' first observation in the sample.')
677 return self._dynamic_predict(params, start, end, dynamic, num_oos,
678 exog, exog_oos)


class AR(tsa_model.TimeSeriesModel):
    __doc__ = tsa_model._tsa_doc % {"model": "Autoregressive AR(p) model.\n\n"
                                             "    .. deprecated:: 0.11",
                                    "params": """endog : array_like
        A 1-d endogenous response variable. The dependent variable.""",
                                    "extra_params": base._missing_param_doc,
                                    "extra_sections": ""}

    def __init__(self, endog, dates=None, freq=None, missing='none'):
        import warnings
        warnings.warn(AR_DEPRECATION_WARN, FutureWarning)
        super(AR, self).__init__(endog, None, dates, freq, missing=missing)
        endog = self.endog  # original might not have been an ndarray
        if endog.ndim == 1:
            endog = endog[:, None]
            self.endog = endog  # to get shapes right
        elif endog.ndim > 1 and endog.shape[1] != 1:
            raise ValueError("Only the univariate case is implemented")
        self._fit_params = None

    def initialize(self):
        """Initialization of the model (no-op)."""
        pass

    def _transparams(self, params):
        """
        Transforms params to induce stationarity/invertibility.

        References
        ----------
        Jones(1980)
        """
        p = self.k_ar
        k = self.k_trend
        newparams = params.copy()
        newparams[k:k + p] = _ar_transparams(params[k:k + p].copy())
        return newparams

    def _invtransparams(self, start_params):
        """
        Inverse of the Jones reparameterization
        """
        p = self.k_ar
        k = self.k_trend
        newparams = start_params.copy()
        newparams[k:k + p] = _ar_invtransparams(start_params[k:k + p].copy())
        return newparams

    def _presample_fit(self, params, start, p, end, y, predictedvalues):
        """
        Return the pre-sample predicted values using the Kalman Filter

        Notes
        -----
        See predict method for how to use start and p.
        """
        k = self.k_trend

        # build system matrices
        T_mat = KalmanFilter.T(params, p, k, p)
        R_mat = KalmanFilter.R(params, p, k, 0, p)

        # Initial State mean and variance
        alpha = np.zeros((p, 1))
        Q_0 = np.dot(inv(np.identity(p ** 2) - np.kron(T_mat, T_mat)),
                     np.dot(R_mat, R_mat.T).ravel('F'))

        Q_0 = Q_0.reshape(p, p, order='F')  # TODO: order might need to be p+k
        P = Q_0
        Z_mat = KalmanFilter.Z(p)
        for i in range(end):  # iterate p-1 times to fit presample
            v_mat = y[i] - np.dot(Z_mat, alpha)
            F_mat = np.dot(np.dot(Z_mat, P), Z_mat.T)
            Finv = 1. / F_mat  # inv. always scalar
            K = np.dot(np.dot(np.dot(T_mat, P), Z_mat.T), Finv)
            # update state
            alpha = np.dot(T_mat, alpha) + np.dot(K, v_mat)
            L = T_mat - np.dot(K, Z_mat)
            P = np.dot(np.dot(T_mat, P), L.T) + np.dot(R_mat, R_mat.T)
            if i >= start - 1:  # only record if we ask for it
                predictedvalues[i + 1 - start] = np.dot(Z_mat, alpha)

    def _get_prediction_index(self, start, end, dynamic, index=None):
        method = getattr(self, 'method', 'mle')
        k_ar = getattr(self, 'k_ar', 0)
        if start is None:
            if method == 'mle' and not dynamic:
                start = 0
            else:  # cannot do presample fit for cmle or dynamic
                start = k_ar
            start = self._index[start]
        if end is None:
            end = self._index[-1]

        start, end, out_of_sample, prediction_index = (
            super(AR, self)._get_prediction_index(start, end, index))

        # Other validation
        if (method == 'cmle' or dynamic) and start < k_ar:
            raise ValueError("Start must be >= k_ar for conditional MLE "
                             "or dynamic forecast. Got %d" % start)

        return start, end, out_of_sample, prediction_index

    def predict(self, params, start=None, end=None, dynamic=False):
        """
        Construct in-sample and out-of-sample prediction.

        Parameters
        ----------
        params : ndarray
            The fitted model parameters.
        start : int, str, or datetime
            Zero-indexed observation number at which to start forecasting,
            i.e., the first forecast is start. Can also be a date string to
            parse or a datetime type.
        end : int, str, or datetime
            Zero-indexed observation number at which to end forecasting,
            i.e., the last forecast is end. Can also be a date string to
            parse or a datetime type.
        dynamic : bool
            The `dynamic` keyword affects in-sample prediction. If dynamic
            is False, then the in-sample lagged values are used for
            prediction. If `dynamic` is True, then in-sample forecasts are
            used in place of lagged dependent variables. The first forecasted
            value is `start`.

        Returns
        -------
        array_like
            An array containing the predicted values.

        Notes
        -----
        The linear Gaussian Kalman filter is used to return pre-sample fitted
        values. The exact initial Kalman Filter is used. See Durbin and
        Koopman in the references for more information.
        """
        if not (hasattr(self, 'k_ar') and hasattr(self, 'k_trend')):
            raise RuntimeError('Model must be fit before calling predict')
        # will return an index of a date
        start, end, out_of_sample, _ = (
            self._get_prediction_index(start, end, dynamic))

        k_ar = self.k_ar
        k_trend = self.k_trend
        method = self.method
        endog = self.endog.squeeze()

        if dynamic:
            out_of_sample += end - start + 1
            return _ar_predict_out_of_sample(endog, params, k_ar,
                                             k_trend, out_of_sample, start)

        predictedvalues = np.zeros(end + 1 - start)

        # fit pre-sample
        if method == 'mle':  # use Kalman Filter to get initial values
            if k_trend:
                mu = params[0] / (1 - np.sum(params[k_trend:]))
            else:
                mu = 0

            # modifies predictedvalues in place
            if start < k_ar:
                self._presample_fit(params, start, k_ar, min(k_ar - 1, end),
                                    endog[:k_ar] - mu, predictedvalues)
                predictedvalues[:k_ar - start] += mu

        if end < k_ar:
            return predictedvalues

        # just do the whole thing and truncate
        fittedvalues = np.dot(self.X, params)

        pv_start = max(k_ar - start, 0)
        fv_start = max(start - k_ar, 0)
        fv_end = min(len(fittedvalues), end - k_ar + 1)
        predictedvalues[pv_start:] = fittedvalues[fv_start:fv_end]

        if out_of_sample:
            forecastvalues = _ar_predict_out_of_sample(endog, params,
                                                       k_ar, k_trend,
                                                       out_of_sample)
            predictedvalues = np.r_[predictedvalues, forecastvalues]

        return predictedvalues

    def _presample_varcov(self, params):
        """
        Returns the inverse of the presample variance-covariance.

        Notes
        -----
        See Hamilton p. 125
        """
        k = self.k_trend
        p = self.k_ar

        # get inv(Vp) Hamilton 5.3.7
        params0 = np.r_[-1, params[k:]]

        Vpinv = np.zeros((p, p), dtype=params.dtype)
        for i in range(1, p + 1):
            Vpinv[i - 1, i - 1:] = np.correlate(params0, params0[:i])[:-1]
            Vpinv[i - 1, i - 1:] -= np.correlate(params0[-i:], params0)[:-1]

        Vpinv = Vpinv + Vpinv.T - np.diag(Vpinv.diagonal())
        return Vpinv

    def _loglike_css(self, params):
        """
        Loglikelihood of AR(p) process using conditional sum of squares
        """
        nobs = self.nobs
        Y = self.Y
        X = self.X
        ssr = sumofsq(Y.squeeze() - np.dot(X, params))
        sigma2 = ssr / nobs
        return -nobs / 2 * (np.log(2 * np.pi) + np.log(sigma2) + 1)

    def _loglike_mle(self, params):
        """
        Loglikelihood of AR(p) process using exact maximum likelihood
        """
        nobs = self.nobs
        X = self.X
        endog = self.endog
        k_ar = self.k_ar
        k_trend = self.k_trend

        # reparameterize according to Jones (1980) like in ARMA/Kalman Filter
        if self.transparams:
            params = self._transparams(params)

        # get mean and variance for pre-sample lags
        yp = endog[:k_ar].copy()
        if k_trend:
            c = [params[0]] * k_ar
        else:
            c = [0]
        mup = np.asarray(c / (1 - np.sum(params[k_trend:])))
        diffp = yp - mup[:, None]

        # get inv(Vp) Hamilton 5.3.7
        Vpinv = self._presample_varcov(params)

        diffpVpinv = np.dot(np.dot(diffp.T, Vpinv), diffp).item()
        ssr = sumofsq(endog[k_ar:].squeeze() - np.dot(X, params))

        # concentrating the likelihood means that sigma2 is given by
        sigma2 = 1. / nobs * (diffpVpinv + ssr)
        self.sigma2 = sigma2
        logdet = slogdet(Vpinv)[1]  # TODO: add check for singularity
        loglike = -1 / 2. * (nobs * (np.log(2 * np.pi) + np.log(sigma2))
                             - logdet + diffpVpinv / sigma2 + ssr / sigma2)
        return loglike

    def loglike(self, params):
        r"""
        The loglikelihood of an AR(p) process.

        Parameters
        ----------
        params : ndarray
            The fitted parameters of the AR model.

        Returns
        -------
        float
            The loglikelihood evaluated at `params`.

        Notes
        -----
        Contains constant term. If the model is fit by OLS then this returns
        the conditional maximum likelihood.

        .. math::

            -\frac{\left(n-p\right)}{2}\left(\log\left(2\pi\right)
            +\log\left(\sigma^{2}\right)\right)
            -\frac{1}{2\sigma^{2}}\sum_{t=p+1}^{n}\epsilon_{t}^{2}

        If it is fit by MLE then the (exact) unconditional maximum likelihood
        is returned.

        .. math::

            -\frac{n}{2}\log\left(2\pi\right)
            -\frac{n}{2}\log\left(\sigma^{2}\right)
            +\frac{1}{2}\log\left|V_{p}^{-1}\right|
            -\frac{1}{2\sigma^{2}}\left(y_{p}
            -\mu_{p}\right)^{\prime}V_{p}^{-1}\left(y_{p}-\mu_{p}\right)
            -\frac{1}{2\sigma^{2}}\sum_{t=p+1}^{n}\epsilon_{t}^{2}

        where

        :math:`\mu_{p}` is a (`p` x 1) vector with each element equal to the
        mean of the AR process and :math:`\sigma^{2}V_{p}` is the (`p` x `p`)
        variance-covariance matrix of the first `p` observations.
        """
        # Math is on Hamilton ~pp 124-5
        if self.method == "cmle":
            return self._loglike_css(params)

        else:
            return self._loglike_mle(params)

    def score(self, params):
        """
        Compute the gradient of the log-likelihood at params.

        Parameters
        ----------
        params : array_like
            The parameter values at which to evaluate the score function.

        Returns
        -------
        ndarray
            The gradient computed using numerical methods.
        """
        loglike = self.loglike
        return approx_fprime(params, loglike, epsilon=1e-8)

    def information(self, params):
        """
        Not implemented.

        Parameters
        ----------
        params : ndarray
            The model parameters.
        """
        return

    def hessian(self, params):
        """
        Compute the hessian using a numerical approximation.

        Parameters
        ----------
        params : ndarray
            The model parameters.

        Returns
        -------
        ndarray
            The hessian evaluated at params.
        """
        loglike = self.loglike
        return approx_hess(params, loglike)

    def _stackX(self, k_ar, trend):
        """
        Private method to build the RHS matrix for estimation.

        Columns are trend terms then lags.
        """
        endog = self.endog
        X = lagmat(endog, maxlag=k_ar, trim='both')
        k_trend = util.get_trendorder(trend)
        if k_trend:
            X = add_trend(X, prepend=True, trend=trend, has_constant="raise")
        self.k_trend = k_trend
        return X

    def select_order(self, maxlag, ic, trend='c', method='mle'):
        """
        Select the lag order according to the information criterion.

        Parameters
        ----------
        maxlag : int
            The highest lag length tried. See `AR.fit`.
        ic : {'aic', 'bic', 'hqic', 't-stat'}
            Criterion used for selecting the optimal lag length.
            See `AR.fit`.
        trend : {'c', 'nc'}
            Whether to include a constant or not. 'c' - include constant.
            'nc' - no constant.
        method : {'cmle', 'mle'}, optional
            The method to use in estimation.

            * 'cmle' - Conditional maximum likelihood using OLS
            * 'mle' - Unconditional (exact) maximum likelihood. See `solver`
              and the Notes.

        Returns
        -------
        int
            Best lag according to the information criteria.
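
        Examples
        --------
        A sketch of order selection by AIC, assuming ``y`` is a 1-d
        array-like of observations (hypothetical):

        >>> from statsmodels.tsa.ar_model import AR
        >>> best_lag = AR(y).select_order(maxlag=8, ic='aic')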
1073 """
1074 endog = self.endog
1076 # make Y and X with same nobs to compare ICs
1077 Y = endog[maxlag:]
1078 self.Y = Y # attach to get correct fit stats
1079 X = self._stackX(maxlag, trend) # sets k_trend
1080 self.X = X
1081 k = self.k_trend # k_trend set in _stackX
1082 k = max(1, k) # handle if startlag is 0
1083 results = {}
1085 if ic != 't-stat':
1086 for lag in range(k, maxlag + 1):
1087 # have to reinstantiate the model to keep comparable models
1088 endog_tmp = endog[maxlag - lag:]
1089 fit = AR(endog_tmp).fit(maxlag=lag, method=method,
1090 full_output=0, trend=trend,
1091 maxiter=100, disp=0)
1092 results[lag] = getattr(fit, ic)
1093 bestic, bestlag = min((res, k) for k, res in results.items())
1095 else: # choose by last t-stat.
1096 stop = 1.6448536269514722 # for t-stat, norm.ppf(.95)
1097 for lag in range(maxlag, k - 1, -1):
1098 # have to reinstantiate the model to keep comparable models
1099 endog_tmp = endog[maxlag - lag:]
1100 fit = AR(endog_tmp).fit(maxlag=lag, method=method,
1101 full_output=0, trend=trend,
1102 maxiter=35, disp=-1)
1104 bestlag = 0
1105 if np.abs(fit.tvalues[-1]) >= stop:
1106 bestlag = lag
1107 break
1108 return bestlag

    def fit(self, maxlag=None, method='cmle', ic=None, trend='c',
            transparams=True, start_params=None, solver='lbfgs', maxiter=35,
            full_output=1, disp=1, callback=None, **kwargs):
        """
        Fit the unconditional maximum likelihood of an AR(p) process.

        Parameters
        ----------
        maxlag : int
            If `ic` is None, then maxlag is the lag length used in fit. If
            `ic` is specified then maxlag is the highest lag order used to
            select the correct lag order. If maxlag is None, the default is
            round(12*(nobs/100.)**(1/4.)).
        method : {'cmle', 'mle'}, optional
            The method to use in estimation.

            * 'cmle' - Conditional maximum likelihood using OLS
            * 'mle' - Unconditional (exact) maximum likelihood. See `solver`
              and the Notes.
        ic : {'aic', 'bic', 'hqic', 't-stat'}
            Criterion used for selecting the optimal lag length.

            * 'aic' - Akaike Information Criterion
            * 'bic' - Bayes Information Criterion
            * 't-stat' - Based on last lag
            * 'hqic' - Hannan-Quinn Information Criterion

            If any of the information criteria are selected, the lag length
            which results in the lowest value is selected. If t-stat, the
            model starts with maxlag and drops a lag until the highest lag
            has a t-stat that is significant at the 95 % level.
        trend : {'c', 'nc'}
            Whether to include a constant or not.

            * 'c' - include constant.
            * 'nc' - no constant.
        transparams : bool, optional
            Whether or not to transform the parameters to ensure stationarity.
            Uses the transformation suggested in Jones (1980).
        start_params : array_like, optional
            A first guess on the parameters. Default is cmle estimates.
        solver : str or None, optional
            Solver to be used if method is 'mle'. The default is 'lbfgs'
            (limited memory Broyden-Fletcher-Goldfarb-Shanno). Other choices
            are 'bfgs', 'newton' (Newton-Raphson), 'nm' (Nelder-Mead),
            'cg' (conjugate gradient), 'ncg' (Newton conjugate gradient),
            and 'powell'.
        maxiter : int, optional
            The maximum number of function evaluations. Default is 35.
        full_output : bool, optional
            If True, all output from solver will be available in
            the Results object's mle_retvals attribute. Output is dependent
            on the solver. See Notes for more information.
        disp : bool, optional
            If True, convergence information is output.
        callback : function, optional
            Called after each iteration as callback(xk) where xk is the
            current parameter vector.
        **kwargs
            See LikelihoodModel.fit for keyword arguments that can be passed
            to fit.

        Returns
        -------
        ARResults
            Results instance.

        See Also
        --------
        statsmodels.base.model.LikelihoodModel.fit
            Base fit class with further details about options.

        Notes
        -----
        The parameters after `trend` are only used when method is 'mle'.

        References
        ----------
        .. [*] Jones, R.H. 1980 "Maximum likelihood fitting of ARMA models to
           time series with missing observations." `Technometrics`. 22.3.
           389-95.
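
        Examples
        --------
        A sketch of fitting with the lag length chosen by AIC, assuming
        ``y`` is a 1-d array-like of observations (hypothetical):

        >>> from statsmodels.tsa.ar_model import AR
        >>> res = AR(y).fit(maxlag=8, ic='aic')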
1191 """
1192 start_params = array_like(start_params, 'start_params', ndim=1,
1193 optional=True)
1194 method = method.lower()
1195 if method not in ['cmle', 'mle']:
1196 raise ValueError("Method %s not recognized" % method)
1197 self.method = method
1198 self.trend = trend
1199 self.transparams = transparams
1200 nobs = len(self.endog) # overwritten if method is 'cmle'
1201 endog = self.endog
1202 # The parameters are no longer allowed to change in an instance
1203 fit_params = (maxlag, method, ic, trend)
1204 if self._fit_params is not None and self._fit_params != fit_params:
1205 raise RuntimeError(REPEATED_FIT_ERROR.format(*self._fit_params))
1206 if maxlag is None:
1207 maxlag = int(round(12 * (nobs / 100.) ** (1 / 4.)))
1208 k_ar = maxlag # stays this if ic is None
1210 # select lag length
1211 if ic is not None:
1212 ic = ic.lower()
1213 if ic not in ['aic', 'bic', 'hqic', 't-stat']:
1214 raise ValueError("ic option %s not understood" % ic)
1215 k_ar = self.select_order(k_ar, ic, trend, method)
1217 self.k_ar = k_ar # change to what was chosen by ic
1219 # redo estimation for best lag
1220 # make LHS
1221 Y = endog[k_ar:, :]
1222 # make lagged RHS
1223 X = self._stackX(k_ar, trend) # sets self.k_trend
1224 k_trend = self.k_trend
1225 self.exog_names = util.make_lag_names(self.endog_names, k_ar, k_trend)
1226 self.Y = Y
1227 self.X = X
1229 if method == "cmle": # do OLS
1230 arfit = OLS(Y, X).fit()
1231 params = arfit.params
1232 self.nobs = nobs - k_ar
1233 self.sigma2 = arfit.ssr / arfit.nobs # needed for predict fcasterr
1235 else: # method == "mle"
1236 solver = solver.lower()
1237 self.nobs = nobs
1238 if start_params is None:
1239 start_params = OLS(Y, X).fit().params
1240 else:
1241 if len(start_params) != k_trend + k_ar:
1242 raise ValueError("Length of start params is %d. There"
1243 " are %d parameters." %
1244 (len(start_params), k_trend + k_ar))
1245 start_params = self._invtransparams(start_params)
1246 if solver == 'lbfgs':
1247 kwargs.setdefault('pgtol', 1e-8)
1248 kwargs.setdefault('factr', 1e2)
1249 kwargs.setdefault('m', 12)
1250 kwargs.setdefault('approx_grad', True)
1251 mlefit = super(AR, self).fit(start_params=start_params,
1252 method=solver, maxiter=maxiter,
1253 full_output=full_output, disp=disp,
1254 callback=callback, **kwargs)
1256 params = mlefit.params
1257 if self.transparams:
1258 params = self._transparams(params)
1259 self.transparams = False # turn off now for other results
1261 pinv_exog = np.linalg.pinv(X)
1262 normalized_cov_params = np.dot(pinv_exog, pinv_exog.T)
1263 arfit = ARResults(copy.copy(self), params, normalized_cov_params)
1264 if method == 'mle' and full_output:
1265 arfit.mle_retvals = mlefit.mle_retvals
1266 arfit.mle_settings = mlefit.mle_settings
1267 # Set fit params since completed the fit
1268 if self._fit_params is None:
1269 self._fit_params = fit_params
1270 return ARResultsWrapper(arfit)


class ARResults(tsa_model.TimeSeriesModelResults):
    """
    Class to hold results from fitting an AR model.

    Parameters
    ----------
    model : AR Model instance
        Reference to the model that is fit.
    params : ndarray
        The fitted parameters from the AR Model.
    normalized_cov_params : ndarray
        The array inv(dot(x.T, x)) where x contains the regressors in the
        model.
    scale : float, optional
        An estimate of the scale of the model.

    Attributes
    ----------
    k_ar : float
        Lag length. Sometimes used as `p` in the docs.
    k_trend : float
        The number of trend terms included. 'nc'=0, 'c'=1.
    llf : float
        The loglikelihood of the model evaluated at `params`. See `AR.loglike`
    model : AR model instance
        A reference to the fitted AR model.
    nobs : float
        The number of available observations `nobs` - `k_ar`
    n_totobs : float
        The number of total observations in `endog`. Sometimes `n` in the
        docs.
    params : ndarray
        The fitted parameters of the model.
    scale : float
        Same as sigma2
    sigma2 : float
        The variance of the innovations (residuals).
    trendorder : int
        The polynomial order of the trend. 'nc' = None, 'c' or 't' = 0,
        'ct' = 1, etc.
    """

    _cache = {}  # for scale setter

    def __init__(self, model, params, normalized_cov_params=None, scale=1.):
        super(ARResults, self).__init__(model, params, normalized_cov_params,
                                        scale)
        self._cache = {}
        self.nobs = model.nobs
        n_totobs = len(model.endog)
        self.n_totobs = n_totobs
        self.X = model.X  # copy?
        self.Y = model.Y
        k_ar = model.k_ar
        self.k_ar = k_ar
        k_trend = model.k_trend
        self.k_trend = k_trend
        trendorder = None
        if k_trend > 0:
            trendorder = k_trend - 1
        self.trendorder = trendorder
        # TODO: cmle vs mle?
        self.df_model = k_ar + k_trend
        self.df_resid = self.model.df_resid = n_totobs - self.df_model

    @cache_writable()
    def sigma2(self):
        model = self.model
        if model.method == "cmle":  # do DOF correction
            return 1. / self.nobs * sumofsq(self.resid)
        else:
            return self.model.sigma2

    @cache_writable()  # for compatibility with RegressionResults
    def scale(self):
        return self.sigma2

    @cache_readonly
    def bse(self):  # allow user to specify?
        """
        The standard errors of the estimated parameters.

        If `method` is 'cmle', then the standard errors that are returned are
        the OLS standard errors of the coefficients. If the `method` is 'mle'
        then they are computed using the numerical Hessian.
        """
        if self.model.method == "cmle":  # uses different scale/sigma def.
            resid = self.resid
            ssr = np.dot(resid, resid)
            ols_scale = ssr / (self.nobs - self.k_ar - self.k_trend)
            return np.sqrt(np.diag(self.cov_params(scale=ols_scale)))
        else:
            hess = approx_hess(self.params, self.model.loglike)
            return np.sqrt(np.diag(-np.linalg.inv(hess)))

    @cache_readonly
    def pvalues(self):
        """The p values associated with the standard errors."""
        return norm.sf(np.abs(self.tvalues)) * 2

    @cache_readonly
    def aic(self):
        """
        Akaike Information Criterion using Lütkepohl's definition.

        :math:`log(sigma) + 2*(1 + k_ar + k_trend)/nobs`
        """
        # TODO: this is based on loglike with dropped constant terms ?
        # Lutkepohl
        # return np.log(self.sigma2) + 1./self.model.nobs * self.k_ar
        # Include constant as estimated free parameter and double the loss
        return np.log(self.sigma2) + 2 * (1 + self.df_model) / self.nobs
        # Stata definition
        # nobs = self.nobs
        # return -2 * self.llf/nobs + 2 * (self.k_ar+self.k_trend)/nobs

    @cache_readonly
    def hqic(self):
        """Hannan-Quinn Information Criterion."""
        nobs = self.nobs
        # Lutkepohl
        # return np.log(self.sigma2)+ 2 * np.log(np.log(nobs))/nobs * self.k_ar
        # R uses all estimated parameters rather than just lags
        return (np.log(self.sigma2) + 2 * np.log(np.log(nobs))
                / nobs * (1 + self.df_model))
        # Stata
        # nobs = self.nobs
        # return -2 * self.llf/nobs + 2 * np.log(np.log(nobs))/nobs * \
        #     (self.k_ar + self.k_trend)

    @cache_readonly
    def fpe(self):
        """
        Final prediction error using Lütkepohl's definition.

        ((n_totobs+k_trend)/(n_totobs-k_ar-k_trend))*sigma
        """
        nobs = self.nobs
        df_model = self.df_model
        # Lutkepohl
        return ((nobs + df_model) / (nobs - df_model)) * self.sigma2

    @cache_readonly
    def bic(self):
        """
        Bayes Information Criterion

        :math:`\\log(\\sigma) + (1 + k_ar + k_trend)*\\log(nobs)/nobs`
        """
        nobs = self.nobs
        # Lutkepohl
        # return np.log(self.sigma2) + np.log(nobs)/nobs * self.k_ar
        # Include constant as est. free parameter
        return np.log(self.sigma2) + (1 + self.df_model) * np.log(nobs) / nobs
        # Stata
        # return -2 * self.llf/nobs + np.log(nobs)/nobs * (self.k_ar + \
        #     self.k_trend)

    @cache_readonly
    def resid(self):
        """
        The residuals of the model.

        If the model is fit by 'mle' then the pre-sample residuals are
        calculated using fittedvalues from the Kalman Filter.
        """
        # NOTE: uses fittedvalues because it calculates pre-sample values
        # for mle
        model = self.model
        endog = model.endog.squeeze()
        if model.method == "cmle":  # eliminate pre-sample
            return endog[self.k_ar:] - self.fittedvalues
        else:
            return model.endog.squeeze() - self.fittedvalues

    @cache_readonly
    def roots(self):
        """
        The roots of the AR process.

        The roots are the solution to
        (1 - arparams[0]*z - arparams[1]*z**2 - ... - arparams[p-1]*z**k_ar) = 0.
        Stability requires that the roots in modulus lie outside the unit
        circle.
        """
        k = self.k_trend
        return np.roots(np.r_[1, -self.params[k:]]) ** -1

    @cache_readonly
    def arfreq(self):
        r"""
        Returns the frequency of the AR roots.

        This is the solution, x, to z = abs(z)*exp(2j*np.pi*x) where z are the
        roots.
        """
        z = self.roots
        return np.arctan2(z.imag, z.real) / (2 * np.pi)

    @cache_readonly
    def fittedvalues(self):
        """
        The in-sample predicted values of the fitted AR model.

        The `k_ar` initial values are computed via the Kalman Filter if the
        model is fit by `mle`.
        """
        return self.model.predict(self.params)

    @Appender(remove_parameters(AR.predict.__doc__, 'params'))
    def predict(self, start=None, end=None, dynamic=False):
        params = self.params
        predictedvalues = self.model.predict(params, start, end, dynamic)
        return predictedvalues
        # TODO: consider returning forecast errors and confidence intervals?

    def summary(self, alpha=.05):
        """Summarize the Model

        Parameters
        ----------
        alpha : float, optional
            Significance level for the confidence intervals.

        Returns
        -------
        smry : Summary instance
            This holds the summary table and text, which can be printed or
            converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary
        """
        model = self.model
        title = model.__class__.__name__ + ' Model Results'
        method = model.method
        # get sample
        start = 0 if 'mle' in method else self.k_ar
        if self.data.dates is not None:
            dates = self.data.dates
            sample = [dates[start].strftime('%m-%d-%Y')]
            sample += ['- ' + dates[-1].strftime('%m-%d-%Y')]
        else:
            sample = [str(start), '- ' + str(len(self.data.orig_endog))]

        k_ar = self.k_ar
        order = '({0})'.format(k_ar)
        dep_name = str(self.model.endog_names)
        top_left = [('Dep. Variable:', dep_name),
                    ('Model:', [model.__class__.__name__ + order]),
                    ('Method:', [method]),
                    ('Date:', None),
                    ('Time:', None),
                    ('Sample:', [sample[0]]),
                    ('', [sample[1]])
                    ]

        top_right = [
            ('No. Observations:', [str(len(self.model.endog))]),
            ('Log Likelihood', ["%#5.3f" % self.llf]),
            ('S.D. of innovations', ["%#5.3f" % self.sigma2 ** .5]),
            ('AIC', ["%#5.3f" % self.aic]),
            ('BIC', ["%#5.3f" % self.bic]),
            ('HQIC', ["%#5.3f" % self.hqic])]

        smry = Summary()
        smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                             title=title)
        smry.add_table_params(self, alpha=alpha, use_t=False)

        # Make the roots table
        from statsmodels.iolib.table import SimpleTable

        if k_ar:
            arstubs = ["AR.%d" % i for i in range(1, k_ar + 1)]
            stubs = arstubs
            roots = self.roots
            freq = self.arfreq
        else:  # AR(0) model
            stubs = []
        if len(stubs):  # not AR(0)
            modulus = np.abs(roots)
            data = np.column_stack((roots.real, roots.imag, modulus, freq))
            roots_table = SimpleTable([('%17.4f' % row[0],
                                        '%+17.4fj' % row[1],
                                        '%17.4f' % row[2],
                                        '%17.4f' % row[3]) for row in data],
                                      headers=[' Real',
                                               ' Imaginary',
                                               ' Modulus',
                                               ' Frequency'],
                                      title="Roots",
                                      stubs=stubs)

            smry.tables.append(roots_table)
        return smry


class ARResultsWrapper(wrap.ResultsWrapper):
    _attrs = {}
    _wrap_attrs = wrap.union_dicts(
        tsa_model.TimeSeriesResultsWrapper._wrap_attrs, _attrs)
    _methods = {}
    _wrap_methods = wrap.union_dicts(
        tsa_model.TimeSeriesResultsWrapper._wrap_methods, _methods)


wrap.populate_wrapper(ARResultsWrapper, ARResults)

doc = Docstring(AutoReg.predict.__doc__)
_predict_params = doc.extract_parameters(['start', 'end', 'dynamic',
                                          'exog', 'exog_oos'], 8)


class AutoRegResults(tsa_model.TimeSeriesModelResults):
    """
    Class to hold results from fitting an AutoReg model.

    Parameters
    ----------
    model : AutoReg
        Reference to the model that is fit.
    params : ndarray
        The fitted parameters from the AR Model.
    cov_params : ndarray
        The estimated covariance matrix of the model parameters.
    normalized_cov_params : ndarray
        The array inv(dot(x.T, x)) where x contains the regressors in the
        model.
    scale : float, optional
        An estimate of the scale of the model.
    """

    _cache = {}  # for scale setter

    def __init__(self, model, params, cov_params, normalized_cov_params=None,
                 scale=1.):
        super(AutoRegResults, self).__init__(model, params,
                                             normalized_cov_params, scale)
        self._cache = {}
        self._params = params
        self._nobs = model.nobs
        self._n_totobs = model.endog.shape[0]
        self._df_model = model.df_model
        self._ar_lags = model.ar_lags
        self._max_lag = 0
        if self._ar_lags.shape[0] > 0:
            self._max_lag = self._ar_lags.max()
        self._hold_back = self.model.hold_back
        self.cov_params_default = cov_params

    def initialize(self, model, params, **kwargs):
        """
        Initialize (possibly re-initialize) a Results instance.

        Parameters
        ----------
        model : Model
            The model instance.
        params : ndarray
            The model parameters.
        **kwargs
            Any additional keyword arguments required to initialize the model.
        """
        self._params = params
        self.model = model

    @property
    def ar_lags(self):
        """The autoregressive lags included in the model"""
        return self._ar_lags

    @property
    def params(self):
        """The estimated parameters."""
        return self._params

    @property
    def df_model(self):
        """The degrees of freedom consumed by the model."""
        return self._df_model

    @property
    def df_resid(self):
        """The remaining degrees of freedom in the residuals."""
        return self.nobs - self._df_model

    @property
    def nobs(self):
        """
        The number of observations after adjusting for losses due to lags.
        """
        return self._nobs

    @cache_writable()
    def sigma2(self):
        return 1. / self.nobs * sumofsq(self.resid)

    @cache_writable()  # for compatibility with RegressionResults
    def scale(self):
        return self.sigma2

    @cache_readonly
    def bse(self):  # allow user to specify?
        """
        The standard errors of the estimated parameters.

        Computed as the square root of the diagonal of the parameter
        covariance matrix.
        """
        return np.sqrt(np.diag(self.cov_params()))

    @cache_readonly
    def aic(self):
        """
        Akaike Information Criterion using Lütkepohl's definition.

        :math:`log(sigma) + 2*(1 + df_model) / nobs`
        """
        # This is based on loglike with dropped constant terms ?
        # Lutkepohl
        # return np.log(self.sigma2) + 1./self.model.nobs * self.k_ar
        # Include constant as estimated free parameter and double the loss
        # Stata definition
        # nobs = self.nobs
        # return -2 * self.llf/nobs + 2 * (self.k_ar+self.k_trend)/nobs
        return np.log(self.sigma2) + 2 * (1 + self.df_model) / self.nobs

    @cache_readonly
    def hqic(self):
        """Hannan-Quinn Information Criterion."""
        # Lutkepohl
        # return np.log(self.sigma2)+ 2 * np.log(np.log(nobs))/nobs * self.k_ar
        # R uses all estimated parameters rather than just lags
        # Stata
        # nobs = self.nobs
        # return -2 * self.llf/nobs + 2 * np.log(np.log(nobs))/nobs * \
        #     (self.k_ar + self.k_trend)
        nobs = self.nobs
        loglog_n = np.log(np.log(nobs))
        log_sigma2 = np.log(self.sigma2)
        return (log_sigma2 + 2 * loglog_n / nobs * (1 + self.df_model))

    @cache_readonly
    def fpe(self):
        """
        Final prediction error using Lütkepohl's definition.

        ((n_totobs+k_trend)/(n_totobs-k_ar-k_trend))*sigma
        """
        nobs = self.nobs
        df_model = self.df_model
        # Lutkepohl
        return ((nobs + df_model) / (nobs - df_model)) * self.sigma2

    @cache_readonly
    def bic(self):
        r"""
        Bayes Information Criterion

        :math:`\ln(\sigma) + df_{model} \ln(nobs)/nobs`
        """
        # Lutkepohl
        # np.log(self.sigma2) + np.log(nobs)/nobs * self.k_ar
        # Include constant as est. free parameter
        # Stata
        # -2 * self.llf/nobs + np.log(nobs)/nobs * (self.k_ar + self.k_trend)
        nobs = self.nobs
        return np.log(self.sigma2) + (1 + self.df_model) * np.log(nobs) / nobs

    @cache_readonly
    def resid(self):
        """
        The residuals of the model.
        """
        model = self.model
        endog = model.endog.squeeze()
        return endog[self._hold_back:] - self.fittedvalues
1752 def _lag_repr(self):
1753 """Returns poly repr of an AR, (1 -phi1 L -phi2 L^2-...)"""
1754 k_ar = len(self.ar_lags)
1755 ar_params = np.zeros(self._max_lag + 1)
1756 ar_params[0] = 1
1757 df_model = self._df_model
1758 exog = self.model.exog
1759 k_exog = exog.shape[1] if exog is not None else 0
1760 params = self._params[df_model - k_ar - k_exog:df_model - k_exog]
1761 for i, lag in enumerate(self._ar_lags):
1762 ar_params[lag] = -params[i]
1763 return ar_params
1765 @cache_readonly
1766 def roots(self):
1767 """
1768 The roots of the AR process.
1770 The roots are the solution to
1771 (1 - arparams[0]*z - arparams[1]*z**2 - ... - arparams[k_ar-1]*z**k_ar) = 0.
1772 Stability requires that the roots in modulus lie outside the unit
1773 circle.
1774 """
1775 lag_repr = self._lag_repr()
1776 if lag_repr.shape[0] == 1:
1777 return np.empty(0)
1779 return np.roots(lag_repr) ** -1
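    # A minimal stability sketch (hypothetical fitted results ``res``): the
    # estimated process is stable when every root lies outside the unit
    # circle in modulus.
    #
    #     import numpy as np
    #     is_stable = bool(np.all(np.abs(res.roots) > 1))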
1781 @cache_readonly
1782 def arfreq(self):
1783 r"""
1784 Returns the frequency of the AR roots.
1786 This is the solution, x, to z = abs(z)*exp(2j*np.pi*x) where z are the
1787 roots.
1788 """
1789 z = self.roots
1790 return np.arctan2(z.imag, z.real) / (2 * np.pi)
1792 @cache_readonly
1793 def fittedvalues(self):
1794 """
1795 The in-sample predicted values of the fitted AR model.
1797 The first `hold_back` observations are lost to lags, so the fitted
1798 values begin at observation `hold_back`.
1799 """
1800 return self.model.predict(self.params)
1802 def test_serial_correlation(self, lags=None, model_df=None):
1803 """
1804 Ljung-Box test for residual serial correlation
1806 Parameters
1807 ----------
1808 lags : int
1809 The maximum number of lags to use in the test. Jointly tests that
1810 all autocorrelations up to and including lag j are zero for
1811 j = 1, 2, ..., lags. If None, uses min(12*(nobs/100)^{1/4}, nobs - 1).
1812 model_df : int
1813 The model degrees of freedom to use when adjusting the test
1814 statistic to account for parameter estimation. If None, uses
1815 the number of AR lags included in the model.
1817 Returns
1818 -------
1819 output : DataFrame
1820 DataFrame containing three columns: the test statistic, the
1821 p-value of the test, and the degree of freedom used in the test.
1823 Notes
1824 -----
1825 Null hypothesis is no serial correlation.
1827 If the test degree-of-freedom is 0 or negative after accounting for
1828 model_df, then the test statistic's p-value is missing.
1830 See Also
1831 --------
1832 statsmodels.stats.diagnostic.acorr_ljungbox
1833 Ljung-Box test for serial correlation.
1834 """
1835 # Deferred to prevent circular import
1836 from statsmodels.stats.diagnostic import acorr_ljungbox
1838 lags = int_like(lags, 'lags', optional=True)
1839 model_df = int_like(model_df, 'df_model', optional=True)
1840 model_df = len(self.ar_lags) if model_df is None else model_df
1841 nobs_effective = self.resid.shape[0]
1842 # Default lags for acorr_ljungbox is 40, but may not always have
1843 # that many observations
1844 if lags is None:
1845 lags = int(min(12 * (nobs_effective / 100) ** (1 / 4),
1846 nobs_effective - 1))
1847 test_stats = acorr_ljungbox(self.resid, lags=lags, boxpierce=False,
1848 model_df=model_df, return_df=False)
1849 cols = ['Ljung-Box', 'LB P-value', 'DF']
1850 if lags == 1:
1851 test_stats = [list(test_stats) + [max(0, 1 - model_df)]]
1852 else:
1853 df = np.clip(np.arange(1, lags + 1) - model_df, 0, np.inf).astype(
1854 int)
1855 test_stats = list(test_stats) + [df]
1856 test_stats = [[test_stats[i][j] for i in range(3)] for j in
1857 range(lags)]
1858 index = pd.RangeIndex(1, lags + 1, name='Lag')
1859 return pd.DataFrame(test_stats,
1860 columns=cols,
1861 index=index)
1863 def test_normality(self):
1864 """
1865 Test for normality of standardized residuals.
1867 Returns
1868 -------
1869 Series
1870 Series containing four values, the test statistic and its p-value,
1871 the skewness and the kurtosis.
1873 Notes
1874 -----
1875 Null hypothesis is normality.
1877 See Also
1878 --------
1879 statsmodels.stats.stattools.jarque_bera
1880 The Jarque-Bera test of normality.
1881 """
1882 # Deferred to prevent circular import
1883 from statsmodels.stats.stattools import jarque_bera
1884 index = ['Jarque-Bera', 'P-value', 'Skewness', 'Kurtosis']
1885 return pd.Series(jarque_bera(self.resid), index=index)
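    # Usage sketch (hypothetical fitted results ``res``): the null of
    # normality is rejected at the 5% level when the p-value is below 0.05.
    #
    #     jb = res.test_normality()
    #     reject = jb['P-value'] < 0.05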
1887 def test_heteroskedasticity(self, lags=None):
1888 """
1889 ARCH-LM test of residual heteroskedasticity
1891 Parameters
1892 ----------
1893 lags : int
1894 The maximum number of lags to use in the test. Jointly tests that
1895 all squared autocorrelations up to and including lag j are zero for
1896 j = 1, 2, ..., lags. If None, uses min(12*(nobs/100)^{1/4}, (nobs - 1) // 2).
1898 Returns
1899 -------
1900 DataFrame
1901 DataFrame containing the test statistic, p-value and DF for each lag.
1903 See Also
1904 --------
1905 statsmodels.stats.diagnostic.het_arch
1906 ARCH-LM test.
1907 statsmodels.stats.diagnostic.acorr_lm
1908 LM test for autocorrelation.
1909 """
1910 from statsmodels.stats.diagnostic import het_arch
1912 lags = int_like(lags, 'lags', optional=True)
1913 nobs_effective = self.resid.shape[0]
1914 if lags is None:
1915 max_lag = (nobs_effective - 1) // 2
1916 lags = int(min(12 * (nobs_effective / 100) ** (1 / 4), max_lag))
1917 out = []
1918 for lag in range(1, lags + 1):
1919 res = het_arch(self.resid, maxlag=lag, autolag=None)
1920 out.append([res[0], res[1], lag])
1921 index = pd.RangeIndex(1, lags + 1, name='Lag')
1922 cols = ['ARCH-LM', 'P-value', 'DF']
1923 return pd.DataFrame(out, columns=cols, index=index)
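    # Usage sketch (hypothetical fitted results ``res``):
    #
    #     arch = res.test_heteroskedasticity(lags=4)
    #     any_arch = (arch['P-value'] < 0.05).any()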
1925 def diagnostic_summary(self):
1926 """
1927 Returns a summary containing standard model diagnostic tests
1929 Returns
1930 -------
1931 Summary
1932 A summary instance with panels for serial correlation tests,
1933 normality tests and heteroskedasticity tests.
1935 See Also
1936 --------
1937 test_serial_correlation
1938 Test the model's residuals for serial correlation.
1939 test_normality
1940 Test the model's residuals for deviations from normality.
1941 test_heteroskedasticity
1942 Test the model's residuals for conditional heteroskedasticity.
1943 """
1944 from statsmodels.iolib.table import SimpleTable
1945 spacer = SimpleTable([''])
1946 smry = Summary()
1947 sc = self.test_serial_correlation()
1948 sc = sc.loc[sc.DF > 0]
1949 values = [[i + 1] + row for i, row in enumerate(sc.values.tolist())]
1950 data_fmts = ('%10d', '%10.3f', '%10.3f', '%10d')
1951 if sc.shape[0]:
1952 tab = SimpleTable(values,
1953 headers=['Lag'] + list(sc.columns),
1954 title='Test of No Serial Correlation',
1955 header_align='r', data_fmts=data_fmts)
1956 smry.tables.append(tab)
1957 smry.tables.append(spacer)
1958 jb = self.test_normality()
1959 data_fmts = ('%10.3f', '%10.3f', '%10.3f', '%10.3f')
1960 tab = SimpleTable([jb.values], headers=list(jb.index),
1961 title='Test of Normality',
1962 header_align='r', data_fmts=data_fmts)
1963 smry.tables.append(tab)
1964 smry.tables.append(spacer)
1965 arch_lm = self.test_heteroskedasticity()
1966 values = [[i + 1] + row for i, row in
1967 enumerate(arch_lm.values.tolist())]
1968 data_fmts = ('%10d', '%10.3f', '%10.3f', '%10d')
1969 tab = SimpleTable(values,
1970 headers=['Lag'] + list(arch_lm.columns),
1971 title='Test of Conditional Homoskedasticity',
1972 header_align='r', data_fmts=data_fmts)
1973 smry.tables.append(tab)
1974 return smry
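    # Usage sketch (hypothetical fitted results ``res``): print all three
    # diagnostic panels at once.
    #
    #     print(res.diagnostic_summary())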
1976 @Appender(remove_parameters(AutoReg.predict.__doc__, 'params'))
1977 def predict(self, start=None, end=None, dynamic=False, exog=None,
1978 exog_oos=None):
1979 return self.model.predict(self._params, start=start, end=end,
1980 dynamic=dynamic, exog=exog,
1981 exog_oos=exog_oos)
1983 @Substitution(predict_params=_predict_params)
1984 def plot_predict(self, start=None, end=None, dynamic=False, exog=None,
1985 exog_oos=None, alpha=.05, in_sample=True, fig=None,
1986 figsize=None):
1987 """
1988 Plot in- and out-of-sample predictions
1990 Parameters
1991 ----------
1992%(predict_params)s
1993 alpha : {float, None}
1994 The tail probability not covered by the confidence interval. Must
1995 be in (0, 1). Confidence interval is constructed assuming normally
1996 distributed shocks. If None, figure will not show the confidence
1997 interval.
1998 in_sample : bool
1999 Flag indicating whether to include the in-sample period in the
2000 plot.
2001 fig : Figure
2002 An existing figure handle. If not provided, a new figure is
2003 created.
2004 figsize : tuple[float, float]
2005 Tuple containing the figure size values.
2007 Returns
2008 -------
2009 Figure
2010 Figure handle containing the plot.
2011 """
2012 from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
2013 _import_mpl()
2014 fig = create_mpl_fig(fig, figsize)
2015 predictions = self.predict(start=start, end=end, dynamic=dynamic,
2016 exog=exog, exog_oos=exog_oos)
2017 start = 0 if start is None else start
2018 end = self.model._index[-1] if end is None else end
2019 _, _, oos, _ = self.model._get_prediction_index(start, end)
2021 ax = fig.add_subplot(111)
2022 if in_sample:
2023 ax.plot(predictions)
2024 elif oos:
2025 if isinstance(predictions, pd.Series):
2026 predictions = predictions.iloc[-oos:]
2027 else:
2028 predictions = predictions[-oos:]
2029 else:
2030 raise ValueError('in_sample is False but there are no '
2031 'out-of-sample forecasts to plot.')
2033 if oos and alpha is not None:
2034 pred_oos = np.asarray(predictions)[-oos:]
2035 ar_params = self._lag_repr()
2036 ma = arma2ma(ar_params, [1], lags=oos)
2037 fc_error = np.sqrt(self.sigma2 * np.cumsum(ma ** 2))
2038 quantile = norm.ppf(alpha / 2)
2039 lower = pred_oos + fc_error * quantile
2040 upper = pred_oos + fc_error * -quantile
2041 label = "{0:.0%} confidence interval".format(1 - alpha)
2042 x = ax.get_lines()[-1].get_xdata()
2043 ax.fill_between(x[-oos:], lower, upper,
2044 color='gray', alpha=.5, label=label)
2046 ax.legend(loc='best')
2048 return fig
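    # Usage sketch (hypothetical fitted results ``res`` on ``n``
    # observations): plot the sample plus a 12-step-ahead forecast with a
    # 95% confidence interval.
    #
    #     fig = res.plot_predict(start=0, end=n + 11, alpha=0.05)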
2050 def plot_diagnostics(self, lags=10, fig=None, figsize=None):
2051 """
2052 Diagnostic plots for standardized residuals
2054 Parameters
2055 ----------
2056 lags : int, optional
2057 Number of lags to include in the correlogram. Default is 10.
2058 fig : Figure, optional
2059 If given, subplots are created in this figure instead of in a new
2060 figure. Note that the 2x2 grid will be created in the provided
2061 figure using `fig.add_subplot()`.
2062 figsize : tuple, optional
2063 If a figure is created, this argument allows specifying a size.
2064 The tuple is (width, height).
2066 Notes
2067 -----
2068 Produces a 2x2 plot grid with the following plots (ordered row-wise
2069 from the top left):
2071 1. Standardized residuals over time
2072 2. Histogram plus estimated density of standardized residuals, along
2073 with a Normal(0,1) density plotted for reference.
2074 3. Normal Q-Q plot, with Normal reference line.
2075 4. Correlogram
2077 See Also
2078 --------
2079 statsmodels.graphics.gofplots.qqplot
2080 statsmodels.graphics.tsaplots.plot_acf
2081 """
2082 from statsmodels.graphics.utils import _import_mpl, create_mpl_fig
2083 _import_mpl()
2084 fig = create_mpl_fig(fig, figsize)
2085 # Eliminate residuals associated with burned or diffuse likelihoods
2086 resid = self.resid
2088 # Top-left: residuals vs time
2089 ax = fig.add_subplot(221)
2090 if getattr(self.model.data, 'dates', None) is not None:
2091 x = self.model.data.dates._mpl_repr()
2092 x = x[self.model.hold_back:]
2093 else:
2094 hold_back = self.model.hold_back
2095 x = hold_back + np.arange(self.resid.shape[0])
2096 std_resid = resid / np.sqrt(self.sigma2)
2097 ax.plot(x, std_resid)
2098 ax.hlines(0, x[0], x[-1], alpha=0.5)
2099 ax.set_xlim(x[0], x[-1])
2100 ax.set_title('Standardized residual')
2102 # Top-right: histogram, Gaussian kernel density, Normal density
2103 # Can only do histogram and Gaussian kernel density on the non-null
2104 # elements
2105 std_resid_nonmissing = std_resid[~(np.isnan(resid))]
2106 ax = fig.add_subplot(222)
2108 # gh5792: Remove except after support for matplotlib>2.1 required
2109 try:
2110 ax.hist(std_resid_nonmissing, density=True, label='Hist')
2111 except AttributeError:
2112 ax.hist(std_resid_nonmissing, normed=True, label='Hist')
2114 kde = gaussian_kde(std_resid_nonmissing)
2115 xlim = (-1.96 * 2, 1.96 * 2)
2116 x = np.linspace(xlim[0], xlim[1])
2117 ax.plot(x, kde(x), label='KDE')
2118 ax.plot(x, norm.pdf(x), label='N(0,1)')
2119 ax.set_xlim(xlim)
2120 ax.legend()
2121 ax.set_title('Histogram plus estimated density')
2123 # Bottom-left: QQ plot
2124 ax = fig.add_subplot(223)
2125 from statsmodels.graphics.gofplots import qqplot
2126 qqplot(std_resid, line='s', ax=ax)
2127 ax.set_title('Normal Q-Q')
2129 # Bottom-right: Correlogram
2130 ax = fig.add_subplot(224)
2131 from statsmodels.graphics.tsaplots import plot_acf
2132 plot_acf(resid, ax=ax, lags=lags)
2133 ax.set_title('Correlogram')
2135 ax.set_ylim(-1, 1)
2137 return fig
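    # Usage sketch (hypothetical fitted results ``res``):
    #
    #     fig = res.plot_diagnostics(lags=12, figsize=(10, 8))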
2139 def summary(self, alpha=.05):
2140 """
2141 Summarize the Model
2143 Parameters
2144 ----------
2145 alpha : float, optional
2146 Significance level for the confidence intervals.
2148 Returns
2149 -------
2150 smry : Summary instance
2151 This holds the summary table and text, which can be printed or
2152 converted to various output formats.
2154 See Also
2155 --------
2156 statsmodels.iolib.summary.Summary
2157 """
2158 model = self.model
2160 title = model.__class__.__name__ + ' Model Results'
2161 method = 'Conditional MLE'
2162 # get sample
2163 start = self._hold_back
2164 if self.data.dates is not None:
2165 dates = self.data.dates
2166 sample = [dates[start].strftime('%m-%d-%Y')]
2167 sample += ['- ' + dates[-1].strftime('%m-%d-%Y')]
2168 else:
2169 sample = [str(start), str(len(self.data.orig_endog))]
2170 model = model.__class__.__name__
2171 if self.model.seasonal:
2172 model = 'Seas. ' + model
2173 if len(self.ar_lags) < self._max_lag:
2174 model = 'Restr. ' + model
2175 if self.model.exog is not None:
2176 model += '-X'
2178 order = '({0})'.format(self._max_lag)
2179 dep_name = str(self.model.endog_names)
2180 top_left = [('Dep. Variable:', [dep_name]),
2181 ('Model:', [model + order]),
2182 ('Method:', [method]),
2183 ('Date:', None),
2184 ('Time:', None),
2185 ('Sample:', [sample[0]]),
2186 ('', [sample[1]])
2187 ]
2189 top_right = [
2190 ('No. Observations:', [str(len(self.model.endog))]),
2191 ('Log Likelihood', ["%#5.3f" % self.llf]),
2192 ('S.D. of innovations', ["%#5.3f" % self.sigma2 ** .5]),
2193 ('AIC', ["%#5.3f" % self.aic]),
2194 ('BIC', ["%#5.3f" % self.bic]),
2195 ('HQIC', ["%#5.3f" % self.hqic])]
2197 smry = Summary()
2198 smry.add_table_2cols(self, gleft=top_left, gright=top_right,
2199 title=title)
2200 smry.add_table_params(self, alpha=alpha, use_t=False)
2202 # Make the roots table
2203 from statsmodels.iolib.table import SimpleTable
2205 if self._max_lag:
2206 arstubs = ["AR.%d" % i for i in range(1, self._max_lag + 1)]
2207 stubs = arstubs
2208 roots = self.roots
2209 freq = self.arfreq
2210 modulus = np.abs(roots)
2211 data = np.column_stack((roots.real, roots.imag, modulus, freq))
2212 roots_table = SimpleTable([('%17.4f' % row[0],
2213 '%+17.4fj' % row[1],
2214 '%17.4f' % row[2],
2215 '%17.4f' % row[3]) for row in data],
2216 headers=[' Real',
2217 ' Imaginary',
2218 ' Modulus',
2219 ' Frequency'],
2220 title="Roots",
2221 stubs=stubs)
2223 smry.tables.append(roots_table)
2224 return smry
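    # Usage sketch (hypothetical fitted results ``res``):
    #
    #     print(res.summary(alpha=0.05))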
2227class AutoRegResultsWrapper(wrap.ResultsWrapper):
2228 _attrs = {}
2229 _wrap_attrs = wrap.union_dicts(
2230 tsa_model.TimeSeriesResultsWrapper._wrap_attrs, _attrs)
2231 _methods = {}
2232 _wrap_methods = wrap.union_dicts(
2233 tsa_model.TimeSeriesResultsWrapper._wrap_methods, _methods)
2236wrap.populate_wrapper(AutoRegResultsWrapper, AutoRegResults)
2238doc = Docstring(AutoReg.__doc__)
2239_auto_reg_params = doc.extract_parameters(['trend', 'seasonal', 'exog',
2240 'hold_back', 'period', 'missing'],
2241 4)
2244@Substitution(auto_reg_params=_auto_reg_params)
2245def ar_select_order(endog, maxlag, ic='bic', glob=False, trend='c',
2246 seasonal=False, exog=None, hold_back=None, period=None,
2247 missing='none'):
2248 """
2249 Autoregressive AR-X(p) model order selection.
2251 Parameters
2252 ----------
2253 endog : array_like
2254 A 1-d endogenous response variable. The dependent variable.
2255 maxlag : int
2256 The maximum lag to consider.
2257 ic : {'aic', 'hqic', 'bic'}
2258 The information criterion to use in the selection.
2259 glob : bool
2260 Flag indicating whether to use a global search across all combinations
2261 of lags. In practice, this option is not computationally feasible when
2262 maxlag is larger than 15 (or perhaps 20) since the global search
2263 requires fitting 2**maxlag models.
2264%(auto_reg_params)s
2266 Returns
2267 -------
2268 AROrderSelectionResults
2269 A results holder containing the model and the complete set of
2270 information criteria for all models fit.
2272 Examples
2273 --------
2274 >>> from statsmodels.tsa.ar_model import ar_select_order
2275 >>> data = sm.datasets.sunspots.load_pandas().data['SUNACTIVITY']
2277 Determine the optimal lag structure
2279 >>> mod = ar_select_order(data, maxlag=13)
2280 >>> mod.ar_lags
2281 array([1, 2, 3, 4, 5, 6, 7, 8, 9])
2283 Determine the optimal lag structure with seasonal terms
2285 >>> mod = ar_select_order(data, maxlag=13, seasonal=True, period=12)
2286 >>> mod.ar_lags
2287 array([1, 2, 3, 4, 5, 6, 7, 8, 9])
2289 Globally determine the optimal lag structure
2291 >>> mod = ar_select_order(data, maxlag=13, glob=True)
2292 >>> mod.ar_lags
2293 array([1, 2, 9])
2294 """
2295 full_mod = AutoReg(endog, maxlag, trend=trend, seasonal=seasonal,
2296 exog=exog, hold_back=hold_back, period=period,
2297 missing=missing)
2298 nexog = full_mod.exog.shape[1] if full_mod.exog is not None else 0
2299 y, x = full_mod._y, full_mod._x
2300 base_col = x.shape[1] - nexog - maxlag
2301 sel = np.ones(x.shape[1], dtype=bool)
2302 ics = []
2304 def compute_ics(res):
2305 nobs = res.nobs
2306 df_model = res.df_model
2307 sigma2 = 1. / nobs * sumofsq(res.resid)
2309 res = SimpleNamespace(nobs=nobs, df_model=df_model, sigma2=sigma2)
2311 aic = AutoRegResults.aic.func(res)
2312 bic = AutoRegResults.bic.func(res)
2313 hqic = AutoRegResults.hqic.func(res)
2315 return aic, bic, hqic
2317 def ic_no_data():
2318 """Fake mod and results to handle no regressor case"""
2319 mod = SimpleNamespace(nobs=y.shape[0],
2320 endog=y,
2321 exog=np.empty((y.shape[0], 0)))
2322 llf = OLS.loglike(mod, np.empty(0))
2323 res = SimpleNamespace(resid=y, nobs=y.shape[0], llf=llf,
2324 df_model=0, k_constant=0)
2326 return compute_ics(res)
2328 if not glob:
2329 sel[base_col: base_col + maxlag] = False
2330 for i in range(maxlag + 1):
2331 sel[base_col:base_col + i] = True
2332 if not np.any(sel):
2333 ics.append((0, ic_no_data()))
2334 continue
2335 res = OLS(y, x[:, sel]).fit()
2336 lags = tuple(j for j in range(1, i + 1))
2337 lags = 0 if not lags else lags
2338 ics.append((lags, compute_ics(res)))
2339 else:
2340 bits = np.arange(2 ** maxlag, dtype=np.int32)[:, None]
2341 bits = bits.view(np.uint8)
2342 bits = np.unpackbits(bits).reshape(-1, 32)
2343 for i in range(4):
2344 bits[:, 8 * i:8 * (i + 1)] = bits[:, 8 * i:8 * (i + 1)][:, ::-1]
2345 masks = bits[:, :maxlag]
2346 for mask in masks:
2347 sel[base_col:base_col + maxlag] = mask
2348 if not np.any(sel):
2349 ics.append((0, ic_no_data()))
2350 continue
2351 res = OLS(y, x[:, sel]).fit()
2352 lags = tuple(np.where(mask)[0] + 1)
2353 lags = 0 if not lags else lags
2354 ics.append((lags, compute_ics(res)))
2356 key_loc = {'aic': 0, 'bic': 1, 'hqic': 2}[ic]
2357 ics = sorted(ics, key=lambda x: x[1][key_loc])
2358 selected_model = ics[0][0]
2359 mod = AutoReg(endog, selected_model, trend=trend, seasonal=seasonal,
2360 exog=exog, hold_back=hold_back, period=period,
2361 missing=missing)
2362 return AROrderSelectionResults(mod, ics, trend, seasonal, period)
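# Usage sketch (hypothetical series ``y``): ``ar_select_order`` returns a
# holder whose ``model`` attribute is an unfitted AutoReg using the selected
# lags; call ``fit`` to estimate it.
#
#     sel = ar_select_order(y, maxlag=13, ic='bic')
#     res = sel.model.fit()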
2365class AROrderSelectionResults(object):
2366 """
2367 Results from an AR order selection
2369 Contains the information criteria for all fitted model orders.
2370 """
2372 def __init__(self, model, ics, trend, seasonal, period):
2373 self._model = model
2374 self._ics = ics
2375 self._trend = trend
2376 self._seasonal = seasonal
2377 self._period = period
2378 aic = sorted(ics, key=lambda r: r[1][0])
2379 self._aic = dict([(key, val[0]) for key, val in aic])
2380 bic = sorted(ics, key=lambda r: r[1][1])
2381 self._bic = dict([(key, val[1]) for key, val in bic])
2382 hqic = sorted(ics, key=lambda r: r[1][2])
2383 self._hqic = dict([(key, val[2]) for key, val in hqic])
2385 @property
2386 def model(self):
2387 """The model selected using the chosen information criterion."""
2388 return self._model
2390 @property
2391 def seasonal(self):
2392 """Flag indicating if a seasonal component is included."""
2393 return self._seasonal
2395 @property
2396 def trend(self):
2397 """The trend included in the model selection."""
2398 return self._trend
2400 @property
2401 def period(self):
2402 """The period of the seasonal component."""
2403 return self._period
2405 @property
2406 def aic(self):
2407 """
2408 The Akaike information criterion for the models fit.
2410 Returns
2411 -------
2412 dict[tuple, float]
2413 """
2414 return self._aic
2416 @property
2417 def bic(self):
2418 """
2419 The Bayesian (Schwarz) information criteria for the models fit.
2421 Returns
2422 -------
2423 dict[tuple, float]
2424 """
2425 return self._bic
2427 @property
2428 def hqic(self):
2429 """
2430 The Hannan-Quinn information criteria for the models fit.
2432 Returns
2433 -------
2434 dict[tuple, float]
2435 """
2436 return self._hqic
2438 @property
2439 def ar_lags(self):
2440 """The lags included in the selected model."""
2441 return self._model.ar_lags
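    # Inspection sketch (hypothetical ``sel`` from ar_select_order): each of
    # ``aic``, ``bic`` and ``hqic`` maps a lag specification to its criterion
    # value, inserted from best to worst, so on Python 3.6+ (ordered dicts)
    # the first entry is the winner.
    #
    #     best_lags, best_bic = next(iter(sel.bic.items()))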