# -*- coding: utf-8 -*-
import copy
import datetime as dt
from collections.abc import Iterable
from types import SimpleNamespace

import numpy as np
import pandas as pd
from numpy.linalg import inv, slogdet
from scipy.stats import norm, gaussian_kde

import statsmodels.base.model as base
import statsmodels.base.wrapper as wrap
from statsmodels.compat.pandas import Appender, Substitution
from statsmodels.iolib.summary import Summary
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.decorators import cache_readonly, cache_writable
from statsmodels.tools.docstring import (Docstring, remove_parameters)
from statsmodels.tools.numdiff import approx_fprime, approx_hess
from statsmodels.tools.validation import array_like, string_like, bool_like, \
    int_like
from statsmodels.tsa.arima_process import arma2ma
from statsmodels.tsa.base import tsa_model
from statsmodels.tsa.kalmanf.kalmanfilter import KalmanFilter
from statsmodels.tsa.tsatools import (lagmat, add_trend, _ar_transparams,
                                      _ar_invtransparams, freq_to_period)
from statsmodels.tsa.vector_ar import util

__all__ = ['AR', 'AutoReg']

AR_DEPRECATION_WARN = """
statsmodels.tsa.AR has been deprecated in favor of statsmodels.tsa.AutoReg and
statsmodels.tsa.SARIMAX.

AutoReg adds the ability to specify exogenous variables, include time trends,
and add seasonal dummies. The AutoReg API differs from AR since the model is
treated as immutable, and so the entire specification including the lag
length must be specified when creating the model. This change is too
substantial to incorporate into the existing AR API. The function
ar_select_order performs lag length selection for AutoReg models.

AutoReg only estimates parameters using conditional MLE (OLS). Use SARIMAX to
estimate ARX and related models using full MLE via the Kalman Filter.

To silence this warning and continue using AR until it is removed, use:

import warnings
warnings.filterwarnings('ignore', 'statsmodels.tsa.ar_model.AR', FutureWarning)
"""

REPEATED_FIT_ERROR = """
Model has been fit using maxlag={0}, method={1}, ic={2}, trend={3}. These
cannot be changed in subsequent calls to `fit`. Instead, use a new instance of
AR.
"""


def sumofsq(x, axis=0):
    """Helper function to calculate the sum of squares along an axis."""
    return np.sum(x ** 2, axis=axis)


def _ar_predict_out_of_sample(y, params, k_ar, k_trend, steps, start=0):
    """
    Compute out-of-sample AR forecasts by iterating the AR recursion.

    ``params`` stacks the trend coefficients (at most a constant here)
    followed by the AR coefficients ordered from lag 1 to lag ``k_ar``.
    """
    mu = params[:k_trend] if k_trend else 0  # only the constant is supported
    arparams = params[k_trend:][::-1]  # reverse for dot

    # dynamic endogenous variable
    endog = np.zeros(k_ar + steps)  # this is one too big but does not matter
    if start:
        endog[:k_ar] = y[start - k_ar:start]
    else:
        endog[:k_ar] = y[-k_ar:]

    forecast = np.zeros(steps)
    for i in range(steps):
        fcast = mu + np.dot(arparams, endog[i:i + k_ar])
        forecast[i] = fcast
        endog[i + k_ar] = fcast

    return forecast


class AutoReg(tsa_model.TimeSeriesModel):
    """
    Autoregressive AR-X(p) model.

    Estimate an AR-X model using Conditional Maximum Likelihood (OLS).

    Parameters
    ----------
    endog : array_like
        A 1-d endogenous response variable. The dependent variable.
    lags : {int, list[int]}
        The number of lags to include in the model if an integer or the
        list of lag indices to include.  For example, [1, 4] will only
        include lags 1 and 4 while lags=4 will include lags 1, 2, 3, and 4.
    trend : {'n', 'c', 't', 'ct'}
        The trend to include in the model:

        * 'n' - No trend.
        * 'c' - Constant only.
        * 't' - Time trend only.
        * 'ct' - Constant and time trend.

    seasonal : bool
        Flag indicating whether to include seasonal dummies in the model. If
        seasonal is True and trend includes 'c', then the first period
        is excluded from the seasonal terms.
    exog : array_like, optional
        Exogenous variables to include in the model. Must have the same number
        of observations as endog and should be aligned so that endog[i] is
        regressed on exog[i].
    hold_back : {None, int}
        Initial observations to exclude from the estimation sample.  If None,
        then hold_back is equal to the maximum lag in the model.  Set to a
        non-zero value to produce comparable models with different lag
        length.  For example, to compare the fit of a model with lags=3 and
        lags=1, set hold_back=3 which ensures that both models are estimated
        using observations 3,...,nobs. hold_back must be >= the maximum lag in
        the model.
    period : {None, int}
        The period of the data. Only used if seasonal is True. This parameter
        can be omitted if using a pandas object for endog that contains a
        recognized frequency.
    missing : str
        Available options are 'none', 'drop', and 'raise'. If 'none', no nan
        checking is done. If 'drop', any observations with nans are dropped.
        If 'raise', an error is raised. Default is 'none'.

    See Also
    --------
    statsmodels.tsa.statespace.sarimax.SARIMAX
        Estimation of SARIMAX models using exact likelihood and the
        Kalman Filter.

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> from statsmodels.tsa.ar_model import AutoReg
    >>> data = sm.datasets.sunspots.load_pandas().data['SUNACTIVITY']
    >>> out = 'AIC: {0:0.3f}, HQIC: {1:0.3f}, BIC: {2:0.3f}'

    Start by fitting an unrestricted Seasonal AR model

    >>> res = AutoReg(data, lags=[1, 11, 12]).fit()
    >>> print(out.format(res.aic, res.hqic, res.bic))
    AIC: 5.945, HQIC: 5.970, BIC: 6.007

    An alternative uses seasonal dummies

    >>> res = AutoReg(data, lags=1, seasonal=True, period=11).fit()
    >>> print(out.format(res.aic, res.hqic, res.bic))
    AIC: 6.017, HQIC: 6.080, BIC: 6.175

    Finally, both the seasonal AR structure and dummies can be included

    >>> res = AutoReg(data, lags=[1, 11, 12], seasonal=True, period=11).fit()
    >>> print(out.format(res.aic, res.hqic, res.bic))
    AIC: 5.884, HQIC: 5.959, BIC: 6.071
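
    Exogenous regressors and a longer ``hold_back`` can be supplied in the
    same call. A minimal sketch, assuming ``exog_data`` is an array aligned
    with ``data`` (``exog_data`` is not created above):

    >>> res = AutoReg(data, lags=2, exog=exog_data, hold_back=12).fit()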

160 """ 

161 

162 def __init__(self, endog, lags, trend='c', seasonal=False, exog=None, 

163 hold_back=None, period=None, missing='none'): 

164 super(AutoReg, self).__init__(endog, exog, None, None, 

165 missing=missing) 

166 self._trend = string_like(trend, 'trend', 

167 options=('n', 'c', 't', 'ct')) 

168 self._seasonal = bool_like(seasonal, 'seasonal') 

169 self._period = int_like(period, 'period', optional=True) 

170 if self._period is None and self._seasonal: 

171 if self.data.freq: 

172 self._period = freq_to_period(self._index_freq) 

173 else: 

174 err = 'freq cannot be inferred from endog and model includes' \ 

175 ' seasonal terms. The number of periods must be ' \ 

176 'explicitly set when the endog\'s index does not ' \ 

177 'contain a frequency.' 

178 raise ValueError(err) 

179 self._lags = lags 

180 self._exog_names = [] 

181 self._k_ar = 0 

182 self._hold_back = int_like(hold_back, 'hold_back', optional=True) 

183 self._check_lags() 

184 self._setup_regressors() 

185 self.nobs = self._y.shape[0] 

186 self.data.xnames = self.exog_names 

187 

188 @property 

189 def ar_lags(self): 

190 """The autoregressive lags included in the model""" 

191 return self._lags 

192 

193 @property 

194 def hold_back(self): 

195 """The number of initial obs. excluded from the estimation sample.""" 

196 return self._hold_back 

197 

198 @property 

199 def seasonal(self): 

200 """Flag indicating that the model contains a seasonal component.""" 

201 return self._seasonal 

202 

203 @property 

204 def df_model(self): 

205 """The model degrees of freedom.""" 

206 return self._x.shape[1] 

207 

208 @property 

209 def exog_names(self): 

210 """Names of exogenous variables included in model""" 

211 return self._exog_names 

212 

213 def initialize(self): 

214 """Initialize the model (no-op).""" 

215 pass 

216 

217 def _check_lags(self): 

218 lags = self._lags 

219 if isinstance(lags, Iterable): 

220 lags = np.array(sorted([int_like(lag, 'lags') for lag in lags])) 

221 self._lags = lags 

222 if np.any(lags < 1) or np.unique(lags).shape[0] != lags.shape[0]: 

223 raise ValueError('All values in lags must be positive and ' 

224 'distinct.') 

225 self._maxlag = np.max(lags) 

226 else: 

227 self._maxlag = int_like(lags, 'lags') 

228 if self._maxlag < 0: 

229 raise ValueError('lags must be a positive scalar.') 

230 self._lags = np.arange(1, self._maxlag + 1) 

231 if self._hold_back is None: 

232 self._hold_back = self._maxlag 

233 if self._hold_back < self._maxlag: 

234 raise ValueError('hold_back must be >= lags if lags is an int or' 

235 'max(lags) if lags is array_like.') 

236 

237 def _setup_regressors(self): 

238 maxlag = self._maxlag 

239 hold_back = self._hold_back 

240 exog_names = [] 

241 endog_names = self.endog_names 

242 x, y = lagmat(self.endog, maxlag, original='sep') 

243 exog_names.extend([endog_names + '.L{0}'.format(lag) 

244 for lag in self._lags]) 

245 if len(self._lags) < maxlag: 

246 x = x[:, self._lags - 1] 

247 self._k_ar = x.shape[1] 

248 if self._seasonal: 

249 nobs, period = self.endog.shape[0], self._period 

250 season_names = ['seasonal.{0}'.format(i) for i in range(period)] 

251 dummies = np.zeros((nobs, period)) 

252 for i in range(period): 

253 dummies[i::period, i] = 1 

254 if 'c' in self._trend: 

255 dummies = dummies[:, 1:] 

256 season_names = season_names[1:] 

257 x = np.c_[dummies, x] 

258 exog_names = season_names + exog_names 

259 x = add_trend(x, trend=self._trend, prepend=True) 

260 if 't' in self._trend: 

261 exog_names.insert(0, 'trend') 

262 if 'c' in self._trend: 

263 exog_names.insert(0, 'intercept') 

264 if self.exog is not None: 

265 x = np.c_[x, self.exog] 

266 exog_names.extend(self.data.param_names) 

267 y = y[hold_back:] 

268 x = x[hold_back:] 

269 if y.shape[0] < x.shape[1]: 

270 reg = x.shape[1] 

271 trend = 0 if self._trend == 'n' else len(self._trend) 

272 seas = 0 if not self._seasonal else period - ('c' in self._trend) 

273 lags = self._lags.shape[0] 

274 nobs = y.shape[0] 

275 raise ValueError('The model specification cannot be estimated. ' 

276 'The model contains {0} regressors ({1} trend, ' 

277 '{2} seasonal, {3} lags) but after adjustment ' 

278 'for hold_back and creation of the lags, there ' 

279 'are only {4} data points available to estimate ' 

280 'parameters.'.format(reg, trend, seas, lags, 

281 nobs)) 

282 self._y, self._x = y, x 

283 self._exog_names = exog_names 

284 

285 def fit(self, cov_type='nonrobust', cov_kwds=None, use_t=False): 

286 """ 

287 Estimate the model parameters. 

288 

289 Parameters 

290 ---------- 

291 cov_type : str 

292 The covariance estimator to use. The most common choices are listed 

293 below. Supports all covariance estimators that are available 

294 in ``OLS.fit``. 

295 

296 * 'nonrobust' - The class OLS covariance estimator that assumes 

297 homoskedasticity. 

298 * 'HC0', 'HC1', 'HC2', 'HC3' - Variants of White's 

299 (or Eiker-Huber-White) covariance estimator. `HC0` is the 

300 standard implementation. The other make corrections to improve 

301 the finite sample performance of the heteroskedasticity robust 

302 covariance estimator. 

303 * 'HAC' - Heteroskedasticity-autocorrelation robust covariance 

304 estimation. Supports cov_kwds. 

305 

306 - `maxlag` integer (required) : number of lags to use. 

307 - `kernel` callable or str (optional) : kernel 

308 currently available kernels are ['bartlett', 'uniform'], 

309 default is Bartlett. 

310 - `use_correction` bool (optional) : If true, use small sample 

311 correction. 

312 cov_kwds : dict, optional 

313 A dictionary of keyword arguments to pass to the covariance 

314 estimator. `nonrobust` and `HC#` do not support cov_kwds. 

315 use_t : bool, optional 

316 A flag indicating that inference should use the Student's t 

317 distribution that accounts for model degree of freedom. If False, 

318 uses the normal distribution. If None, defers the choice to 

319 the cov_type. It also removes degree of freedom corrections from 

320 the covariance estimator when cov_type is 'nonrobust'. 

321 

322 Returns 

323 ------- 

324 AutoRegResults 

325 Estimation results. 

326 

327 See Also 

328 -------- 

329 statsmodels.regression.linear_model.OLS 

330 Ordinary Least Squares estimation. 

331 statsmodels.regression.linear_model.RegressionResults 

332 See ``get_robustcov_results`` for a detailed list of available 

333 covariance estimators and options. 

334 

335 Notes 

336 ----- 

337 Use ``OLS`` to estimate model parameters and to estimate parameter 

338 covariance. 
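
        Examples
        --------
        A sketch of a typical call using HAC standard errors with 5 lags;
        ``data`` is assumed to be a univariate series:

        >>> from statsmodels.tsa.ar_model import AutoReg
        >>> res = AutoReg(data, lags=2).fit(cov_type='HAC',
        ...                                 cov_kwds={'maxlag': 5})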

339 """ 

340 # TODO: Determine correction for degree-of-freedom 

341 # Special case parameterless model 

342 if self._x.shape[1] == 0: 

343 return AutoRegResultsWrapper(AutoRegResults(self, 

344 np.empty(0), 

345 np.empty((0, 0)))) 

346 

347 ols_mod = OLS(self._y, self._x) 

348 ols_res = ols_mod.fit(cov_type=cov_type, cov_kwds=cov_kwds, 

349 use_t=use_t) 

350 cov_params = ols_res.cov_params() 

351 use_t = ols_res.use_t 

352 if cov_type == "nonrobust" and not use_t: 

353 nobs = self._y.shape[0] 

354 k = self._x.shape[1] 

355 scale = nobs / (nobs - k) 

356 cov_params /= scale 

357 res = AutoRegResults(self, ols_res.params, cov_params, 

358 ols_res.normalized_cov_params) 

359 

360 return AutoRegResultsWrapper(res) 

361 

362 def _resid(self, params): 

363 params = array_like(params, 'params', ndim=2) 

364 resid = self._y - self._x @ params 

365 return resid.squeeze() 

366 

367 def loglike(self, params): 

368 """ 

369 Log-likelihood of model. 

370 

371 Parameters 

372 ---------- 

373 params : ndarray 

374 The model parameters used to compute the log-likelihood. 

375 

376 Returns 

377 ------- 

378 float 

379 The log-likelihood value. 
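
        Notes
        -----
        The log-likelihood is the Gaussian log-likelihood concentrated with
        respect to the error variance and is computed as
        ``-(nobs / 2) * (log(2 * pi) + log(ssr / nobs) + 1)``, where ``ssr``
        is the sum of squared residuals evaluated at ``params``.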

380 """ 

381 nobs = self.nobs 

382 resid = self._resid(params) 

383 ssr = resid @ resid 

384 llf = -(nobs / 2) * (np.log(2 * np.pi) + np.log(ssr / nobs) + 1) 

385 return llf 

386 

387 def score(self, params): 

388 """ 

389 Score vector of model. 

390 

391 The gradient of logL with respect to each parameter. 

392 

393 Parameters 

394 ---------- 

395 params : ndarray 

396 The parameters to use when evaluating the Hessian. 

397 

398 Returns 

399 ------- 

400 ndarray 

401 The score vector evaluated at the parameters. 

402 """ 

403 resid = self._resid(params) 

404 return self._x.T @ resid 

405 

406 def information(self, params): 

407 """ 

408 Fisher information matrix of model. 

409 

410 Returns -1 * Hessian of the log-likelihood evaluated at params. 

411 

412 Parameters 

413 ---------- 

414 params : ndarray 

415 The model parameters. 

416 

417 Returns 

418 ------- 

419 ndarray 

420 The information matrix. 

421 """ 

422 resid = self._resid(params) 

423 sigma2 = resid @ resid / self.nobs 

424 return sigma2 * (self._x.T @ self._x) 

425 

426 def hessian(self, params): 

427 """ 

428 The Hessian matrix of the model. 

429 

430 Parameters 

431 ---------- 

432 params : ndarray 

433 The parameters to use when evaluating the Hessian. 

434 

435 Returns 

436 ------- 

437 ndarray 

438 The hessian evaluated at the parameters. 

439 """ 

440 return -self.information(params) 

441 

442 def _setup_oos_forecast(self, add_forecasts, exog_oos): 

443 full_nobs = self.data.endog.shape[0] 

444 x = np.zeros((add_forecasts, self._x.shape[1])) 

445 loc = 0 

446 if 'c' in self._trend: 

447 x[:, 0] = 1 

448 loc += 1 

449 if 't' in self._trend: 

450 x[:, loc] = np.arange(full_nobs + 1, full_nobs + add_forecasts + 1) 

451 loc += 1 

452 if self.seasonal: 

453 seasonal = np.zeros((add_forecasts, self._period)) 

454 period = self._period 

455 col = full_nobs % period 

456 for i in range(period): 

457 seasonal[i::period, (col + i) % period] = 1 

458 if 'c' in self._trend: 

459 x[:, loc:loc + period - 1] = seasonal[:, 1:] 

460 loc += seasonal.shape[1] - 1 

461 else: 

462 x[:, loc:loc + period] = seasonal 

463 loc += seasonal.shape[1] 

464 # skip the AR columns 

465 loc += len(self._lags) 

466 if self.exog is not None: 

467 x[:, loc:] = exog_oos[:add_forecasts] 

468 return x 

469 

470 def _wrap_prediction(self, prediction, start, end): 

471 if not isinstance(self.data.orig_endog, (pd.Series, pd.DataFrame)): 

472 return prediction 

473 index = self.data.orig_endog.index 

474 if end > self.endog.shape[0]: 

475 freq = getattr(index, 'freq', None) 

476 if freq: 

477 index = pd.date_range(index[0], freq=freq, periods=end) 

478 else: 

479 index = pd.RangeIndex(end) 

480 index = index[start:end] 

481 return pd.Series(prediction, index=index) 

482 

483 def _dynamic_predict(self, params, start, end, dynamic, num_oos, exog, 

484 exog_oos): 

485 """ 

486 

487 :param params: 

488 :param start: 

489 :param end: 

490 :param dynamic: 

491 :param num_oos: 

492 :param exog: 

493 :param exog_oos: 

494 :return: 

495 """ 

496 reg = [] 

497 hold_back = self._hold_back 

498 if (start - hold_back) <= self.nobs: 

499 is_loc = slice(start - hold_back, end + 1 - hold_back) 

500 x = self._x[is_loc] 

501 if exog is not None: 

502 x = x.copy() 

503 # Replace final columns 

504 x[:, -exog.shape[1]:] = exog[start:end + 1] 

505 reg.append(x) 

506 if num_oos > 0: 

507 reg.append(self._setup_oos_forecast(num_oos, exog_oos)) 

508 reg = np.vstack(reg) 

509 det_col_idx = self._x.shape[1] - len(self._lags) 

510 det_col_idx -= 0 if self.exog is None else self.exog.shape[1] 

511 # + 1 is due t0 inclusiveness of predict functions 

512 adj_dynamic = dynamic - start + 1 

513 forecasts = np.empty(reg.shape[0]) 

514 forecasts[:adj_dynamic] = reg[:adj_dynamic] @ params 

515 for h in range(adj_dynamic, reg.shape[0]): 

516 # Fill in regressor matrix 

517 for j, lag in enumerate(self._lags): 

518 fcast_loc = h - lag 

519 if fcast_loc >= 0: 

520 val = forecasts[fcast_loc] 

521 else: 

522 # If before the start of the forecasts, use actual values 

523 val = self.endog[start + fcast_loc] 

524 reg[h, det_col_idx + j] = val 

525 forecasts[h] = reg[h:h + 1] @ params 

526 return self._wrap_prediction(forecasts, start, end + 1 + num_oos) 

527 

528 def _static_oos_predict(self, params, num_oos, exog_oos): 

529 new_x = self._setup_oos_forecast(num_oos, exog_oos) 

530 if self._maxlag == 0: 

531 return new_x @ params 

532 forecasts = np.empty(num_oos) 

533 nexog = 0 if self.exog is None else self.exog.shape[1] 

534 ar_offset = self._x.shape[1] - nexog - self._lags.shape[0] 

535 for i in range(num_oos): 

536 for j, lag in enumerate(self._lags): 

537 loc = i - lag 

538 val = self._y[loc] if loc < 0 else forecasts[loc] 

539 new_x[i, ar_offset + j] = val 

540 forecasts[i] = new_x[i:i + 1] @ params 

541 return forecasts 

542 

543 def _static_predict(self, params, start, end, num_oos, exog, exog_oos): 

544 """ 

545 Path for static predictions 

546 

547 Parameters 

548 ---------- 

549 start : int 

550 Index of first observation 

551 end : int 

552 Index of last in-sample observation. Inclusive, so start:end+1 

553 in slice notation. 

554 num_oos : int 

555 Number of out-of-sample observations, so that the returned size is 

556 num_oos + (end - start + 1). 

557 exog : ndarray 

558 Array containing replacement exog values 

559 exog_oos : ndarray 

560 Containing forecast exog values 

561 """ 

562 hold_back = self._hold_back 

563 nobs = self.endog.shape[0] 

564 

565 x = np.empty((0, self._x.shape[1])) 

566 if start <= nobs: 

567 is_loc = slice(start - hold_back, end + 1 - hold_back) 

568 x = self._x[is_loc] 

569 if exog is not None: 

570 x = x.copy() 

571 # Replace final columns 

572 x[:, -exog.shape[1]:] = exog[start:end + 1] 

573 in_sample = x @ params 

574 if num_oos == 0: # No out of sample 

575 return self._wrap_prediction(in_sample, start, end + 1) 

576 

577 out_of_sample = self._static_oos_predict(params, num_oos, exog_oos) 

578 

579 prediction = np.hstack((in_sample, out_of_sample)) 

580 return self._wrap_prediction(prediction, start, end + 1 + num_oos) 

581 

582 def predict(self, params, start=None, end=None, dynamic=False, exog=None, 

583 exog_oos=None): 

584 """ 

585 In-sample prediction and out-of-sample forecasting. 

586 

587 Parameters 

588 ---------- 

589 params : array_like 

590 The fitted model parameters. 

591 start : int, str, or datetime, optional 

592 Zero-indexed observation number at which to start forecasting, 

593 i.e., the first forecast is start. Can also be a date string to 

594 parse or a datetime type. Default is the the zeroth observation. 

595 end : int, str, or datetime, optional 

596 Zero-indexed observation number at which to end forecasting, i.e., 

597 the last forecast is end. Can also be a date string to 

598 parse or a datetime type. However, if the dates index does not 

599 have a fixed frequency, end must be an integer index if you 

600 want out-of-sample prediction. Default is the last observation in 

601 the sample. Unlike standard python slices, end is inclusive so 

602 that all the predictions [start, start+1, ..., end-1, end] are 

603 returned. 

604 dynamic : {bool, int, str, datetime, Timestamp}, optional 

605 Integer offset relative to `start` at which to begin dynamic 

606 prediction. Prior to this observation, true endogenous values 

607 will be used for prediction; starting with this observation and 

608 continuing through the end of prediction, forecasted endogenous 

609 values will be used instead. Datetime-like objects are not 

610 interpreted as offsets. They are instead used to find the index 

611 location of `dynamic` which is then used to to compute the offset. 

612 exog : array_like 

613 A replacement exogenous array. Must have the same shape as the 

614 exogenous data array used when the model was created. 

615 exog_oos : array_like 

616 An array containing out-of-sample values of the exogenous variable. 

617 Must has the same number of columns as the exog used when the 

618 model was created, and at least as many rows as the number of 

619 out-of-sample forecasts. 

620 

621 Returns 

622 ------- 

623 array_like 

624 Array of out of in-sample predictions and / or out-of-sample 

625 forecasts. An (npredict x k_endog) array. 
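
        Examples
        --------
        A sketch of twelve out-of-sample forecasts from a model without
        exogenous regressors; ``data`` is assumed to be a univariate series:

        >>> res = AutoReg(data, lags=2).fit()
        >>> nobs = data.shape[0]
        >>> res.model.predict(res.params, start=nobs, end=nobs + 11)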

626 """ 

627 params = array_like(params, 'params') 

628 exog = array_like(exog, 'exog', ndim=2, optional=True) 

629 exog_oos = array_like(exog_oos, 'exog_oos', ndim=2, optional=True) 

630 

631 start = 0 if start is None else start 

632 end = self._index[-1] if end is None else end 

633 start, end, num_oos, _ = self._get_prediction_index(start, end) 

634 start = max(start, self._hold_back) 

635 if self.exog is None and (exog is not None or exog_oos is not None): 

636 raise ValueError('exog and exog_oos cannot be used when the model ' 

637 'does not contains exogenous regressors.') 

638 elif self.exog is not None: 

639 if exog is not None and exog.shape != self.exog.shape: 

640 msg = ('The shape of exog {0} must match the shape of the ' 

641 'exog variable used to create the model {1}.') 

642 raise ValueError(msg.format(exog.shape, self.exog.shape)) 

643 if exog_oos is not None and \ 

644 exog_oos.shape[1] != self.exog.shape[1]: 

645 msg = ('The number of columns in exog_oos ({0}) must match ' 

646 'the number of columns in the exog variable used to ' 

647 'create the model ({1}).') 

648 raise ValueError(msg.format(exog_oos.shape[1], 

649 self.exog.shape[1])) 

650 if num_oos > 0 and exog_oos is None: 

651 raise ValueError('exog_oos must be provided when producing ' 

652 'out-of-sample forecasts.') 

653 elif exog_oos is not None and num_oos > exog_oos.shape[0]: 

654 msg = ('start and end indicate that {0} out-of-sample ' 

655 'predictions must be computed. exog_oos has {1} rows ' 

656 'but must have at least {0}.') 

657 raise ValueError(msg.format(num_oos, exog_oos.shape[0])) 

658 

659 if (isinstance(dynamic, bool) and not dynamic) or self._maxlag == 0: 

660 # If model has no lags, static and dynamic are identical 

661 return self._static_predict(params, start, end, num_oos, 

662 exog, exog_oos) 

663 

664 if isinstance(dynamic, (str, bytes, pd.Timestamp, dt.datetime)): 

665 dynamic, _, _ = self._get_index_loc(dynamic) 

666 offset = dynamic - start 

667 elif dynamic is True: 

668 # if True, all forecasts are dynamic, except start 

669 offset = 0 

670 else: 

671 offset = int(dynamic) 

672 dynamic = start + offset 

673 if dynamic < 0: 

674 raise ValueError('Dynamic prediction cannot begin prior to the' 

675 ' first observation in the sample.') 

676 

677 return self._dynamic_predict(params, start, end, dynamic, num_oos, 

678 exog, exog_oos) 

679 

680 

681class AR(tsa_model.TimeSeriesModel): 

682 __doc__ = tsa_model._tsa_doc % {"model": "Autoregressive AR(p) model.\n\n" 

683 " .. deprecated:: 0.11", 

684 "params": """endog : array_like 

685 A 1-d endogenous response variable. The independent variable.""", 

686 "extra_params": base._missing_param_doc, 

687 "extra_sections": ""} 

688 

689 def __init__(self, endog, dates=None, freq=None, missing='none'): 

690 import warnings 

691 warnings.warn(AR_DEPRECATION_WARN, FutureWarning) 

692 super(AR, self).__init__(endog, None, dates, freq, missing=missing) 

693 endog = self.endog # original might not have been an ndarray 

694 if endog.ndim == 1: 

695 endog = endog[:, None] 

696 self.endog = endog # to get shapes right 

697 elif endog.ndim > 1 and endog.shape[1] != 1: 

698 raise ValueError("Only the univariate case is implemented") 

699 self._fit_params = None 

700 

701 def initialize(self): 

702 """Initialization of the model (no-op).""" 

703 pass 

704 

705 def _transparams(self, params): 

706 """ 

707 Transforms params to induce stationarity/invertability. 

708 

709 Reference 

710 --------- 

711 Jones(1980) 

712 """ 

713 p = self.k_ar 

714 k = self.k_trend 

715 newparams = params.copy() 

716 newparams[k:k + p] = _ar_transparams(params[k:k + p].copy()) 

717 return newparams 

718 

719 def _invtransparams(self, start_params): 

720 """ 

721 Inverse of the Jones reparameterization 

722 """ 

723 p = self.k_ar 

724 k = self.k_trend 

725 newparams = start_params.copy() 

726 newparams[k:k + p] = _ar_invtransparams(start_params[k:k + p].copy()) 

727 return newparams 

728 

729 def _presample_fit(self, params, start, p, end, y, predictedvalues): 

730 """ 

731 Return the pre-sample predicted values using the Kalman Filter 

732 

733 Notes 

734 ----- 

735 See predict method for how to use start and p. 

736 """ 

737 k = self.k_trend 

738 

739 # build system matrices 

740 T_mat = KalmanFilter.T(params, p, k, p) 

741 R_mat = KalmanFilter.R(params, p, k, 0, p) 

742 

743 # Initial State mean and variance 

744 alpha = np.zeros((p, 1)) 

745 Q_0 = np.dot(inv(np.identity(p ** 2) - np.kron(T_mat, T_mat)), 

746 np.dot(R_mat, R_mat.T).ravel('F')) 

747 

748 Q_0 = Q_0.reshape(p, p, order='F') # TODO: order might need to be p+k 

749 P = Q_0 

750 Z_mat = KalmanFilter.Z(p) 

751 for i in range(end): # iterate p-1 times to fit presample 

752 v_mat = y[i] - np.dot(Z_mat, alpha) 

753 F_mat = np.dot(np.dot(Z_mat, P), Z_mat.T) 

754 Finv = 1. / F_mat # inv. always scalar 

755 K = np.dot(np.dot(np.dot(T_mat, P), Z_mat.T), Finv) 

756 # update state 

757 alpha = np.dot(T_mat, alpha) + np.dot(K, v_mat) 

758 L = T_mat - np.dot(K, Z_mat) 

759 P = np.dot(np.dot(T_mat, P), L.T) + np.dot(R_mat, R_mat.T) 

760 if i >= start - 1: # only record if we ask for it 

761 predictedvalues[i + 1 - start] = np.dot(Z_mat, alpha) 

762 

763 def _get_prediction_index(self, start, end, dynamic, index=None): 

764 method = getattr(self, 'method', 'mle') 

765 k_ar = getattr(self, 'k_ar', 0) 

766 if start is None: 

767 if method == 'mle' and not dynamic: 

768 start = 0 

769 else: # cannot do presample fit for cmle or dynamic 

770 start = k_ar 

771 start = self._index[start] 

772 if end is None: 

773 end = self._index[-1] 

774 

775 start, end, out_of_sample, prediction_index = ( 

776 super(AR, self)._get_prediction_index(start, end, index)) 

777 

778 # Other validation 

779 if (method == 'cmle' or dynamic) and start < k_ar: 

780 raise ValueError("Start must be >= k_ar for conditional MLE " 

781 "or dynamic forecast. Got %d" % start) 

782 

783 return start, end, out_of_sample, prediction_index 

784 

785 def predict(self, params, start=None, end=None, dynamic=False): 

786 """ 

787 Construct in-sample and out-of-sample prediction. 

788 

789 Parameters 

790 ---------- 

791 params : ndarray 

792 The fitted model parameters. 

793 start : int, str, or datetime 

794 Zero-indexed observation number at which to start forecasting, ie., 

795 the first forecast is start. Can also be a date string to 

796 parse or a datetime type. 

797 end : int, str, or datetime 

798 Zero-indexed observation number at which to end forecasting, ie., 

799 the first forecast is start. Can also be a date string to 

800 parse or a datetime type. 

801 dynamic : bool 

802 The `dynamic` keyword affects in-sample prediction. If dynamic 

803 is False, then the in-sample lagged values are used for 

804 prediction. If `dynamic` is True, then in-sample forecasts are 

805 used in place of lagged dependent variables. The first forecasted 

806 value is `start`. 

807 

808 Returns 

809 ------- 

810 array_like 

811 An array containing the predicted values. 

812 

813 Notes 

814 ----- 

815 The linear Gaussian Kalman filter is used to return pre-sample fitted 

816 values. The exact initial Kalman Filter is used. See Durbin and Koopman 

817 in the references for more information. 
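
        Examples
        --------
        A sketch of fully dynamic in-sample prediction; ``data`` is assumed
        to be a univariate series:

        >>> res = AR(data).fit(maxlag=2, method='mle')
        >>> res.model.predict(res.params, dynamic=True)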

818 """ 

819 if not (hasattr(self, 'k_ar') and hasattr(self, 'k_trend')): 

820 raise RuntimeError('Model must be fit before calling predict') 

821 # will return an index of a date 

822 start, end, out_of_sample, _ = ( 

823 self._get_prediction_index(start, end, dynamic)) 

824 

825 k_ar = self.k_ar 

826 k_trend = self.k_trend 

827 method = self.method 

828 endog = self.endog.squeeze() 

829 

830 if dynamic: 

831 out_of_sample += end - start + 1 

832 return _ar_predict_out_of_sample(endog, params, k_ar, 

833 k_trend, out_of_sample, start) 

834 

835 predictedvalues = np.zeros(end + 1 - start) 

836 

837 # fit pre-sample 

838 if method == 'mle': # use Kalman Filter to get initial values 

839 if k_trend: 

840 mu = params[0] / (1 - np.sum(params[k_trend:])) 

841 else: 

842 mu = 0 

843 

844 # modifies predictedvalues in place 

845 if start < k_ar: 

846 self._presample_fit(params, start, k_ar, min(k_ar - 1, end), 

847 endog[:k_ar] - mu, predictedvalues) 

848 predictedvalues[:k_ar - start] += mu 

849 

850 if end < k_ar: 

851 return predictedvalues 

852 

853 # just do the whole thing and truncate 

854 fittedvalues = np.dot(self.X, params) 

855 

856 pv_start = max(k_ar - start, 0) 

857 fv_start = max(start - k_ar, 0) 

858 fv_end = min(len(fittedvalues), end - k_ar + 1) 

859 predictedvalues[pv_start:] = fittedvalues[fv_start:fv_end] 

860 

861 if out_of_sample: 

862 forecastvalues = _ar_predict_out_of_sample(endog, params, 

863 k_ar, k_trend, 

864 out_of_sample) 

865 predictedvalues = np.r_[predictedvalues, forecastvalues] 

866 

867 return predictedvalues 

868 

869 def _presample_varcov(self, params): 

870 """ 

871 Returns the inverse of the presample variance-covariance. 

872 

873 Notes 

874 ----- 

875 See Hamilton p. 125 

876 """ 

877 k = self.k_trend 

878 p = self.k_ar 

879 

880 # get inv(Vp) Hamilton 5.3.7 

881 params0 = np.r_[-1, params[k:]] 

882 

883 Vpinv = np.zeros((p, p), dtype=params.dtype) 

884 for i in range(1, p + 1): 

885 Vpinv[i - 1, i - 1:] = np.correlate(params0, params0[:i])[:-1] 

886 Vpinv[i - 1, i - 1:] -= np.correlate(params0[-i:], params0)[:-1] 

887 

888 Vpinv = Vpinv + Vpinv.T - np.diag(Vpinv.diagonal()) 

889 return Vpinv 

890 

891 def _loglike_css(self, params): 

892 """ 

893 Loglikelihood of AR(p) process using conditional sum of squares 

894 """ 

895 nobs = self.nobs 

896 Y = self.Y 

897 X = self.X 

898 ssr = sumofsq(Y.squeeze() - np.dot(X, params)) 

899 sigma2 = ssr / nobs 

900 return -nobs / 2 * (np.log(2 * np.pi) + np.log(sigma2) + 1) 

901 

902 def _loglike_mle(self, params): 

903 """ 

904 Loglikelihood of AR(p) process using exact maximum likelihood 

905 """ 

906 nobs = self.nobs 

907 X = self.X 

908 endog = self.endog 

909 k_ar = self.k_ar 

910 k_trend = self.k_trend 

911 

912 # reparameterize according to Jones (1980) like in ARMA/Kalman Filter 

913 if self.transparams: 

914 params = self._transparams(params) 

915 

916 # get mean and variance for pre-sample lags 

917 yp = endog[:k_ar].copy() 

918 if k_trend: 

919 c = [params[0]] * k_ar 

920 else: 

921 c = [0] 

922 mup = np.asarray(c / (1 - np.sum(params[k_trend:]))) 

923 diffp = yp - mup[:, None] 

924 

925 # get inv(Vp) Hamilton 5.3.7 

926 Vpinv = self._presample_varcov(params) 

927 

928 diffpVpinv = np.dot(np.dot(diffp.T, Vpinv), diffp).item() 

929 ssr = sumofsq(endog[k_ar:].squeeze() - np.dot(X, params)) 

930 

931 # concentrating the likelihood means that sigma2 is given by 

932 sigma2 = 1. / nobs * (diffpVpinv + ssr) 

933 self.sigma2 = sigma2 

934 logdet = slogdet(Vpinv)[1] # TODO: add check for singularity 

935 loglike = -1 / 2. * (nobs * (np.log(2 * np.pi) + np.log(sigma2)) 

936 - logdet + diffpVpinv / sigma2 + ssr / sigma2) 

937 return loglike 

938 

939 def loglike(self, params): 

940 r""" 

941 The loglikelihood of an AR(p) process. 

942 

943 Parameters 

944 ---------- 

945 params : ndarray 

946 The fitted parameters of the AR model. 

947 

948 Returns 

949 ------- 

950 float 

951 The loglikelihood evaluated at `params`. 

952 

953 Notes 

954 ----- 

955 Contains constant term. If the model is fit by OLS then this returns 

956 the conditional maximum likelihood. 

957 

958 .. math:: 

959 

960 \frac{\left(n-p\right)}{2}\left(\log\left(2\pi\right) 

961 +\log\left(\sigma^{2}\right)\right) 

962 -\frac{1}{\sigma^{2}}\sum_{i}\epsilon_{i}^{2} 

963 

964 If it is fit by MLE then the (exact) unconditional maximum likelihood 

965 is returned. 

966 

967 .. math:: 

968 

969 -\frac{n}{2}log\left(2\pi\right) 

970 -\frac{n}{2}\log\left(\sigma^{2}\right) 

971 +\frac{1}{2}\left|V_{p}^{-1}\right| 

972 -\frac{1}{2\sigma^{2}}\left(y_{p} 

973 -\mu_{p}\right)^{\prime}V_{p}^{-1}\left(y_{p}-\mu_{p}\right) 

974 -\frac{1}{2\sigma^{2}}\sum_{t=p+1}^{n}\epsilon_{i}^{2} 

975 

976 where 

977 

978 :math:`\mu_{p}` is a (`p` x 1) vector with each element equal to the 

979 mean of the AR process and :math:`\sigma^{2}V_{p}` is the (`p` x `p`) 

980 variance-covariance matrix of the first `p` observations. 

981 """ 

982 # Math is on Hamilton ~pp 124-5 

983 if self.method == "cmle": 

984 return self._loglike_css(params) 

985 

986 else: 

987 return self._loglike_mle(params) 

988 

989 def score(self, params): 

990 """ 

991 Compute the gradient of the log-likelihood at params. 

992 

993 Parameters 

994 ---------- 

995 params : array_like 

996 The parameter values at which to evaluate the score function. 

997 

998 Returns 

999 ------- 

1000 ndarray 

1001 The gradient computed using numerical methods. 

1002 """ 

1003 loglike = self.loglike 

1004 return approx_fprime(params, loglike, epsilon=1e-8) 

1005 

1006 def information(self, params): 

1007 """ 

1008 Not implemented. 

1009 

1010 Parameters 

1011 ---------- 

1012 params : ndarray 

1013 The model parameters. 

1014 """ 

1015 return 

1016 

1017 def hessian(self, params): 

1018 """ 

1019 Compute the hessian using a numerical approximation. 

1020 

1021 Parameters 

1022 ---------- 

1023 params : ndarray 

1024 The model parameters. 

1025 

1026 Returns 

1027 ------- 

1028 ndarray 

1029 The hessian evaluated at params. 

1030 """ 

1031 loglike = self.loglike 

1032 return approx_hess(params, loglike) 

1033 

1034 def _stackX(self, k_ar, trend): 

1035 """ 

1036 Private method to build the RHS matrix for estimation. 

1037 

1038 Columns are trend terms then lags. 

1039 """ 

1040 endog = self.endog 

1041 X = lagmat(endog, maxlag=k_ar, trim='both') 

1042 k_trend = util.get_trendorder(trend) 

1043 if k_trend: 

1044 X = add_trend(X, prepend=True, trend=trend, has_constant="raise") 

1045 self.k_trend = k_trend 

1046 return X 

1047 

1048 def select_order(self, maxlag, ic, trend='c', method='mle'): 

1049 """ 

1050 Select the lag order according to the information criterion. 

1051 

1052 Parameters 

1053 ---------- 

1054 maxlag : int 

1055 The highest lag length tried. See `AR.fit`. 

1056 ic : {'aic','bic','hqic','t-stat'} 

1057 Criterion used for selecting the optimal lag length. 

1058 See `AR.fit`. 

1059 trend : {'c','nc'} 

1060 Whether to include a constant or not. 'c' - include constant. 

1061 'nc' - no constant. 

1062 method : {'cmle', 'mle'}, optional 

1063 The method to use in estimation. 

1064 

1065 * 'cmle' - Conditional maximum likelihood using OLS 

1066 * 'mle' - Unconditional (exact) maximum likelihood. See `solver` 

1067 and the Notes. 

1068 

1069 Returns 

1070 ------- 

1071 int 

1072 Best lag according to the information criteria. 
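
        Examples
        --------
        A sketch of a typical call; ``data`` is assumed to be a univariate
        series:

        >>> mod = AR(data)
        >>> mod.select_order(maxlag=12, ic='aic', method='mle')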

1073 """ 

1074 endog = self.endog 

1075 

1076 # make Y and X with same nobs to compare ICs 

1077 Y = endog[maxlag:] 

1078 self.Y = Y # attach to get correct fit stats 

1079 X = self._stackX(maxlag, trend) # sets k_trend 

1080 self.X = X 

1081 k = self.k_trend # k_trend set in _stackX 

1082 k = max(1, k) # handle if startlag is 0 

1083 results = {} 

1084 

1085 if ic != 't-stat': 

1086 for lag in range(k, maxlag + 1): 

1087 # have to reinstantiate the model to keep comparable models 

1088 endog_tmp = endog[maxlag - lag:] 

1089 fit = AR(endog_tmp).fit(maxlag=lag, method=method, 

1090 full_output=0, trend=trend, 

1091 maxiter=100, disp=0) 

1092 results[lag] = getattr(fit, ic) 

1093 bestic, bestlag = min((res, k) for k, res in results.items()) 

1094 

1095 else: # choose by last t-stat. 

1096 stop = 1.6448536269514722 # for t-stat, norm.ppf(.95) 

1097 for lag in range(maxlag, k - 1, -1): 

1098 # have to reinstantiate the model to keep comparable models 

1099 endog_tmp = endog[maxlag - lag:] 

1100 fit = AR(endog_tmp).fit(maxlag=lag, method=method, 

1101 full_output=0, trend=trend, 

1102 maxiter=35, disp=-1) 

1103 

1104 bestlag = 0 

1105 if np.abs(fit.tvalues[-1]) >= stop: 

1106 bestlag = lag 

1107 break 

1108 return bestlag 

1109 

1110 def fit(self, maxlag=None, method='cmle', ic=None, trend='c', 

1111 transparams=True, start_params=None, solver='lbfgs', maxiter=35, 

1112 full_output=1, disp=1, callback=None, **kwargs): 

1113 """ 

1114 Fit the unconditional maximum likelihood of an AR(p) process. 

1115 

1116 Parameters 

1117 ---------- 

1118 maxlag : int 

1119 If `ic` is None, then maxlag is the lag length used in fit. If 

1120 `ic` is specified then maxlag is the highest lag order used to 

1121 select the correct lag order. If maxlag is None, the default is 

1122 round(12*(nobs/100.)**(1/4.)). 

1123 method : {'cmle', 'mle'}, optional 

1124 The method to use in estimation. 

1125 

1126 * 'cmle' - Conditional maximum likelihood using OLS 

1127 * 'mle' - Unconditional (exact) maximum likelihood. See `solver` 

1128 and the Notes. 

1129 ic : {'aic','bic','hic','t-stat'} 

1130 Criterion used for selecting the optimal lag length. 

1131 

1132 * 'aic' - Akaike Information Criterion 

1133 * 'bic' - Bayes Information Criterion 

1134 * 't-stat' - Based on last lag 

1135 * 'hqic' - Hannan-Quinn Information Criterion 

1136 

1137 If any of the information criteria are selected, the lag length 

1138 which results in the lowest value is selected. If t-stat, the 

1139 model starts with maxlag and drops a lag until the highest lag 

1140 has a t-stat that is significant at the 95 % level. 

1141 trend : {'c','nc'} 

1142 Whether to include a constant or not. 

1143 

1144 * 'c' - include constant. 

1145 * 'nc' - no constant. 

1146 transparams : bool, optional 

1147 Whether or not to transform the parameters to ensure stationarity. 

1148 Uses the transformation suggested in Jones (1980). 

1149 start_params : array_like, optional 

1150 A first guess on the parameters. Default is cmle estimates. 

1151 solver : str or None, optional 

1152 Solver to be used if method is 'mle'. The default is 'lbfgs' 

1153 (limited memory Broyden-Fletcher-Goldfarb-Shanno). Other choices 

1154 are 'bfgs', 'newton' (Newton-Raphson), 'nm' (Nelder-Mead), 

1155 'cg' - (conjugate gradient), 'ncg' (non-conjugate gradient), 

1156 and 'powell'. 

1157 maxiter : int, optional 

1158 The maximum number of function evaluations. Default is 35. 

1159 full_output : bool, optional 

1160 If True, all output from solver will be available in 

1161 the Results object's mle_retvals attribute. Output is dependent 

1162 on the solver. See Notes for more information. 

1163 disp : bool, optional 

1164 If True, convergence information is output. 

1165 callback : function, optional 

1166 Called after each iteration as callback(xk) where xk is the current 

1167 parameter vector. 

1168 **kwargs 

1169 See LikelihoodModel.fit for keyword arguments that can be passed 

1170 to fit. 

1171 

1172 Returns 

1173 ------- 

1174 ARResults 

1175 Results instance. 

1176 

1177 See Also 

1178 -------- 

1179 statsmodels.base.model.LikelihoodModel.fit 

1180 Base fit class with further details about options. 

1181 

1182 Notes 

1183 ----- 

1184 The parameters after `trend` are only used when method is 'mle'. 

1185 

1186 References 

1187 ---------- 

1188 .. [*] Jones, R.H. 1980 "Maximum likelihood fitting of ARMA models to 

1189 time series with missing observations." `Technometrics`. 22.3. 

1190 389-95. 
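
        Examples
        --------
        A sketch of a fit with automatic lag selection by AIC; ``data`` is
        assumed to be a univariate series:

        >>> from statsmodels.tsa.ar_model import AR
        >>> res = AR(data).fit(maxlag=9, method='mle', ic='aic')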

1191 """ 

1192 start_params = array_like(start_params, 'start_params', ndim=1, 

1193 optional=True) 

1194 method = method.lower() 

1195 if method not in ['cmle', 'mle']: 

1196 raise ValueError("Method %s not recognized" % method) 

1197 self.method = method 

1198 self.trend = trend 

1199 self.transparams = transparams 

1200 nobs = len(self.endog) # overwritten if method is 'cmle' 

1201 endog = self.endog 

1202 # The parameters are no longer allowed to change in an instance 

1203 fit_params = (maxlag, method, ic, trend) 

1204 if self._fit_params is not None and self._fit_params != fit_params: 

1205 raise RuntimeError(REPEATED_FIT_ERROR.format(*self._fit_params)) 

1206 if maxlag is None: 

1207 maxlag = int(round(12 * (nobs / 100.) ** (1 / 4.))) 

1208 k_ar = maxlag # stays this if ic is None 

1209 

1210 # select lag length 

1211 if ic is not None: 

1212 ic = ic.lower() 

1213 if ic not in ['aic', 'bic', 'hqic', 't-stat']: 

1214 raise ValueError("ic option %s not understood" % ic) 

1215 k_ar = self.select_order(k_ar, ic, trend, method) 

1216 

1217 self.k_ar = k_ar # change to what was chosen by ic 

1218 

1219 # redo estimation for best lag 

1220 # make LHS 

1221 Y = endog[k_ar:, :] 

1222 # make lagged RHS 

1223 X = self._stackX(k_ar, trend) # sets self.k_trend 

1224 k_trend = self.k_trend 

1225 self.exog_names = util.make_lag_names(self.endog_names, k_ar, k_trend) 

1226 self.Y = Y 

1227 self.X = X 

1228 

1229 if method == "cmle": # do OLS 

1230 arfit = OLS(Y, X).fit() 

1231 params = arfit.params 

1232 self.nobs = nobs - k_ar 

1233 self.sigma2 = arfit.ssr / arfit.nobs # needed for predict fcasterr 

1234 

1235 else: # method == "mle" 

1236 solver = solver.lower() 

1237 self.nobs = nobs 

1238 if start_params is None: 

1239 start_params = OLS(Y, X).fit().params 

1240 else: 

1241 if len(start_params) != k_trend + k_ar: 

1242 raise ValueError("Length of start params is %d. There" 

1243 " are %d parameters." % 

1244 (len(start_params), k_trend + k_ar)) 

1245 start_params = self._invtransparams(start_params) 

1246 if solver == 'lbfgs': 

1247 kwargs.setdefault('pgtol', 1e-8) 

1248 kwargs.setdefault('factr', 1e2) 

1249 kwargs.setdefault('m', 12) 

1250 kwargs.setdefault('approx_grad', True) 

1251 mlefit = super(AR, self).fit(start_params=start_params, 

1252 method=solver, maxiter=maxiter, 

1253 full_output=full_output, disp=disp, 

1254 callback=callback, **kwargs) 

1255 

1256 params = mlefit.params 

1257 if self.transparams: 

1258 params = self._transparams(params) 

1259 self.transparams = False # turn off now for other results 

1260 

1261 pinv_exog = np.linalg.pinv(X) 

1262 normalized_cov_params = np.dot(pinv_exog, pinv_exog.T) 

1263 arfit = ARResults(copy.copy(self), params, normalized_cov_params) 

1264 if method == 'mle' and full_output: 

1265 arfit.mle_retvals = mlefit.mle_retvals 

1266 arfit.mle_settings = mlefit.mle_settings 

1267 # Set fit params since completed the fit 

1268 if self._fit_params is None: 

1269 self._fit_params = fit_params 

1270 return ARResultsWrapper(arfit) 

1271 

1272 

1273class ARResults(tsa_model.TimeSeriesModelResults): 

1274 """ 

1275 Class to hold results from fitting an AR model. 

1276 

1277 Parameters 

1278 ---------- 

1279 model : AR Model instance 

1280 Reference to the model that is fit. 

1281 params : ndarray 

1282 The fitted parameters from the AR Model. 

1283 normalized_cov_params : ndarray 

1284 The array inv(dot(x.T,x)) where x contains the regressors in the 

1285 model. 

1286 scale : float, optional 

1287 An estimate of the scale of the model. 

1288 

1289 Attributes 

1290 ---------- 

1291 k_ar : float 

1292 Lag length. Sometimes used as `p` in the docs. 

1293 k_trend : float 

1294 The number of trend terms included. 'nc'=0, 'c'=1. 

1295 llf : float 

1296 The loglikelihood of the model evaluated at `params`. See `AR.loglike` 

1297 model : AR model instance 

1298 A reference to the fitted AR model. 

1299 nobs : float 

1300 The number of available observations `nobs` - `k_ar` 

1301 n_totobs : float 

1302 The number of total observations in `endog`. Sometimes `n` in the docs. 

1303 params : ndarray 

1304 The fitted parameters of the model. 

1305 scale : float 

1306 Same as sigma2 

1307 sigma2 : float 

1308 The variance of the innovations (residuals). 

1309 trendorder : int 

1310 The polynomial order of the trend. 'nc' = None, 'c' or 't' = 0, 

1311 'ct' = 1, etc. 

1312 """ 

1313 

1314 _cache = {} # for scale setter 

1315 

1316 def __init__(self, model, params, normalized_cov_params=None, scale=1.): 

1317 super(ARResults, self).__init__(model, params, normalized_cov_params, 

1318 scale) 

1319 self._cache = {} 

1320 self.nobs = model.nobs 

1321 n_totobs = len(model.endog) 

1322 self.n_totobs = n_totobs 

1323 self.X = model.X # copy? 

1324 self.Y = model.Y 

1325 k_ar = model.k_ar 

1326 self.k_ar = k_ar 

1327 k_trend = model.k_trend 

1328 self.k_trend = k_trend 

1329 trendorder = None 

1330 if k_trend > 0: 

1331 trendorder = k_trend - 1 

1332 self.trendorder = trendorder 

1333 # TODO: cmle vs mle? 

1334 self.df_model = k_ar + k_trend 

1335 self.df_resid = self.model.df_resid = n_totobs - self.df_model 

1336 

1337 @cache_writable() 

1338 def sigma2(self): 

1339 model = self.model 

1340 if model.method == "cmle": # do DOF correction 

1341 return 1. / self.nobs * sumofsq(self.resid) 

1342 else: 

1343 return self.model.sigma2 

1344 

1345 @cache_writable() # for compatability with RegressionResults 

1346 def scale(self): 

1347 return self.sigma2 

1348 

1349 @cache_readonly 

1350 def bse(self): # allow user to specify? 

1351 """ 

1352 The standard errors of the estimated parameters. 

1353 

1354 If `method` is 'cmle', then the standard errors that are returned are 

1355 the OLS standard errors of the coefficients. If the `method` is 'mle' 

1356 then they are computed using the numerical Hessian. 

1357 """ 

1358 if self.model.method == "cmle": # uses different scale/sigma def. 

1359 resid = self.resid 

1360 ssr = np.dot(resid, resid) 

1361 ols_scale = ssr / (self.nobs - self.k_ar - self.k_trend) 

1362 return np.sqrt(np.diag(self.cov_params(scale=ols_scale))) 

1363 else: 

1364 hess = approx_hess(self.params, self.model.loglike) 

1365 return np.sqrt(np.diag(-np.linalg.inv(hess))) 

1366 

1367 @cache_readonly 

1368 def pvalues(self): 

1369 """The p values associated with the standard errors.""" 

1370 return norm.sf(np.abs(self.tvalues)) * 2 

1371 

1372 @cache_readonly 

1373 def aic(self): 

1374 """ 

1375 Akaike Information Criterion using Lutkephol's definition. 

1376 

1377 :math:`log(sigma) + 2*(1 + k_ar + k_trend)/nobs` 

1378 """ 

1379 # TODO: this is based on loglike with dropped constant terms ? 

1380 # Lutkepohl 

1381 # return np.log(self.sigma2) + 1./self.model.nobs * self.k_ar 

1382 # Include constant as estimated free parameter and double the loss 

1383 return np.log(self.sigma2) + 2 * (1 + self.df_model) / self.nobs 

1384 # Stata defintion 

1385 # nobs = self.nobs 

1386 # return -2 * self.llf/nobs + 2 * (self.k_ar+self.k_trend)/nobs 

1387 

1388 @cache_readonly 

1389 def hqic(self): 

1390 """Hannan-Quinn Information Criterion.""" 

1391 nobs = self.nobs 

1392 # Lutkepohl 

1393 # return np.log(self.sigma2)+ 2 * np.log(np.log(nobs))/nobs * self.k_ar 

1394 # R uses all estimated parameters rather than just lags 

1395 return (np.log(self.sigma2) + 2 * np.log(np.log(nobs)) 

1396 / nobs * (1 + self.df_model)) 

1397 # Stata 

1398 # nobs = self.nobs 

1399 # return -2 * self.llf/nobs + 2 * np.log(np.log(nobs))/nobs * \ 

1400 # (self.k_ar + self.k_trend) 

1401 

1402 @cache_readonly 

1403 def fpe(self): 

1404 """ 

1405 Final prediction error using Lütkepohl's definition. 

1406 

1407 ((n_totobs+k_trend)/(n_totobs-k_ar-k_trend))*sigma 

1408 """ 

1409 nobs = self.nobs 

1410 df_model = self.df_model 

1411 # Lutkepohl 

1412 return ((nobs + df_model) / (nobs - df_model)) * self.sigma2 

1413 

1414 @cache_readonly 

1415 def bic(self): 

1416 """ 

1417 Bayes Information Criterion 

1418 

1419 :math:`\\log(\\sigma) + (1 + k_ar + k_trend)*\\log(nobs)/nobs` 

1420 """ 

1421 nobs = self.nobs 

1422 # Lutkepohl 

1423 # return np.log(self.sigma2) + np.log(nobs)/nobs * self.k_ar 

1424 # Include constant as est. free parameter 

1425 return np.log(self.sigma2) + (1 + self.df_model) * np.log(nobs) / nobs 

1426 # Stata 

1427 # return -2 * self.llf/nobs + np.log(nobs)/nobs * (self.k_ar + \ 

1428 # self.k_trend) 

1429 

1430 @cache_readonly 

1431 def resid(self): 

1432 """ 

1433 The residuals of the model. 

1434 

1435 If the model is fit by 'mle' then the pre-sample residuals are 

1436 calculated using fittedvalues from the Kalman Filter. 

1437 """ 

1438 # NOTE: uses fittedvalues because it calculate presample values for mle 

1439 model = self.model 

1440 endog = model.endog.squeeze() 

1441 if model.method == "cmle": # elimate pre-sample 

1442 return endog[self.k_ar:] - self.fittedvalues 

1443 else: 

1444 return model.endog.squeeze() - self.fittedvalues 

1445 

1446 @cache_readonly 

1447 def roots(self): 

1448 """ 

1449 The roots of the AR process. 

1450 

1451 The roots are the solution to 

1452 (1 - arparams[0]*z - arparams[1]*z**2 -...- arparams[p-1]*z**k_ar) = 0. 

1453 Stability requires that the roots in modulus lie outside the unit 

1454 circle. 

1455 """ 

1456 k = self.k_trend 

1457 return np.roots(np.r_[1, -self.params[k:]]) ** -1 

1458 

1459 @cache_readonly 

1460 def arfreq(self): 

1461 r""" 

1462 Returns the frequency of the AR roots. 

1463 

1464 This is the solution, x, to z = abs(z)*exp(2j*np.pi*x) where z are the 

1465 roots. 

1466 """ 

1467 z = self.roots 

1468 return np.arctan2(z.imag, z.real) / (2 * np.pi) 

1469 

1470 @cache_readonly 

1471 def fittedvalues(self): 

1472 """ 

1473 The in-sample predicted values of the fitted AR model. 

1474 

1475 The `k_ar` initial values are computed via the Kalman Filter if the 

1476 model is fit by `mle`. 

1477 """ 

1478 return self.model.predict(self.params) 

1479 

1480 @Appender(remove_parameters(AR.predict.__doc__, 'params')) 

1481 def predict(self, start=None, end=None, dynamic=False): 

1482 params = self.params 

1483 predictedvalues = self.model.predict(params, start, end, dynamic) 

1484 return predictedvalues 

1485 # TODO: consider returning forecast errors and confidence intervals? 

1486 

1487 def summary(self, alpha=.05): 

1488 """Summarize the Model 

1489 

1490 Parameters 

1491 ---------- 

1492 alpha : float, optional 

1493 Significance level for the confidence intervals. 

1494 

1495 Returns 

1496 ------- 

1497 smry : Summary instance 

1498 This holds the summary table and text, which can be printed or 

1499 converted to various output formats. 

1500 

1501 See Also 

1502 -------- 

1503 statsmodels.iolib.summary.Summary 
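
        Examples
        --------
        A sketch of a typical call; ``res`` is assumed to be a fitted
        ``ARResults`` instance:

        >>> print(res.summary())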

1504 """ 

1505 model = self.model 

1506 title = model.__class__.__name__ + ' Model Results' 

1507 method = model.method 

1508 # get sample 

1509 start = 0 if 'mle' in method else self.k_ar 

1510 if self.data.dates is not None: 

1511 dates = self.data.dates 

1512 sample = [dates[start].strftime('%m-%d-%Y')] 

1513 sample += ['- ' + dates[-1].strftime('%m-%d-%Y')] 

1514 else: 

1515 sample = str(start) + ' - ' + str(len(self.data.orig_endog)) 

1516 

1517 k_ar = self.k_ar 

1518 order = '({0})'.format(k_ar) 

1519 dep_name = str(self.model.endog_names) 

1520 top_left = [('Dep. Variable:', dep_name), 

1521 ('Model:', [model.__class__.__name__ + order]), 

1522 ('Method:', [method]), 

1523 ('Date:', None), 

1524 ('Time:', None), 

1525 ('Sample:', [sample[0]]), 

1526 ('', [sample[1]]) 

1527 ] 

1528 

1529 top_right = [ 

1530 ('No. Observations:', [str(len(self.model.endog))]), 

1531 ('Log Likelihood', ["%#5.3f" % self.llf]), 

1532 ('S.D. of innovations', ["%#5.3f" % self.sigma2 ** .5]), 

1533 ('AIC', ["%#5.3f" % self.aic]), 

1534 ('BIC', ["%#5.3f" % self.bic]), 

1535 ('HQIC', ["%#5.3f" % self.hqic])] 

1536 

1537 smry = Summary() 

1538 smry.add_table_2cols(self, gleft=top_left, gright=top_right, 

1539 title=title) 

1540 smry.add_table_params(self, alpha=alpha, use_t=False) 

1541 

1542 # Make the roots table 

1543 from statsmodels.iolib.table import SimpleTable 

1544 

1545 if k_ar: 

1546 arstubs = ["AR.%d" % i for i in range(1, k_ar + 1)] 

1547 stubs = arstubs 

1548 roots = self.roots 

1549 freq = self.arfreq 

1550 else: # AR(0) model 

1551 stubs = [] 

1552 if len(stubs): # not AR(0) 

1553 modulus = np.abs(roots) 

1554 data = np.column_stack((roots.real, roots.imag, modulus, freq)) 

1555 roots_table = SimpleTable([('%17.4f' % row[0], 

1556 '%+17.4fj' % row[1], 

1557 '%17.4f' % row[2], 

1558 '%17.4f' % row[3]) for row in data], 

1559 headers=[' Real', 

1560 ' Imaginary', 

1561 ' Modulus', 

1562 ' Frequency'], 

1563 title="Roots", 

1564 stubs=stubs) 

1565 

1566 smry.tables.append(roots_table) 

1567 return smry 

1568 

1569 

1570class ARResultsWrapper(wrap.ResultsWrapper): 

1571 _attrs = {} 

1572 _wrap_attrs = wrap.union_dicts( 

1573 tsa_model.TimeSeriesResultsWrapper._wrap_attrs, _attrs) 

1574 _methods = {} 

1575 _wrap_methods = wrap.union_dicts( 

1576 tsa_model.TimeSeriesResultsWrapper._wrap_methods, _methods) 

1577 

1578 

1579wrap.populate_wrapper(ARResultsWrapper, ARResults) 

1580 

1581doc = Docstring(AutoReg.predict.__doc__) 

1582_predict_params = doc.extract_parameters(['start', 'end', 'dynamic', 

1583 'exog', 'exog_oos'], 8) 

1584 

1585 

1586class AutoRegResults(tsa_model.TimeSeriesModelResults): 

1587 """ 

1588 Class to hold results from fitting an AutoReg model. 

1589 

1590 Parameters 

1591 ---------- 

1592 model : AutoReg 

1593 Reference to the model that is fit. 

1594 params : ndarray 

1595 The fitted parameters from the AR Model. 

1596 cov_params : ndarray 

1597 The estimated covariance matrix of the model parameters. 

1598 normalized_cov_params : ndarray 

1599 The array inv(dot(x.T,x)) where x contains the regressors in the 

1600 model. 

1601 scale : float, optional 

1602 An estimate of the scale of the model. 

1603 """ 

1604 

1605 _cache = {} # for scale setter 

1606 

1607 def __init__(self, model, params, cov_params, normalized_cov_params=None, 

1608 scale=1.): 

1609 super(AutoRegResults, self).__init__(model, params, 

1610 normalized_cov_params, scale) 

1611 self._cache = {} 

1612 self._params = params 

1613 self._nobs = model.nobs 

1614 self._n_totobs = model.endog.shape[0] 

1615 self._df_model = model.df_model 

1616 self._ar_lags = model.ar_lags 

1617 self._max_lag = 0 

1618 if self._ar_lags.shape[0] > 0: 

1619 self._max_lag = self._ar_lags.max() 

1620 self._hold_back = self.model.hold_back 

1621 self.cov_params_default = cov_params 

1622 

1623 def initialize(self, model, params, **kwargs): 

1624 """ 

1625 Initialize (possibly re-initialize) a Results instance. 

1626 

1627 Parameters 

1628 ---------- 

1629 model : Model 

1630 The model instance. 

1631 params : ndarray 

1632 The model parameters. 

1633 **kwargs 

1634 Any additional keyword arguments required to initialize the model. 

1635 """ 

1636 self._params = params 

1637 self.model = model 

1638 

1639 @property 

1640 def ar_lags(self): 

1641 """The autoregressive lags included in the model""" 

1642 return self._ar_lags 

1643 

1644 @property 

1645 def params(self): 

1646 """The estimated parameters.""" 

1647 return self._params 

1648 

1649 @property 

1650 def df_model(self): 

1651 """The degrees of freedom consumed by the model.""" 

1652 return self._df_model 

1653 

1654 @property 

1655 def df_resid(self): 

1656 """The remaining degrees of freedom in the residuals.""" 

1657 return self.nobs - self._df_model 

1658 

1659 @property 

1660 def nobs(self): 

1661 """ 

1662 The number of observations after adjusting for losses due to lags. 

1663 """ 

1664 return self._nobs 

1665 

1666 @cache_writable() 

1667 def sigma2(self): 

1668 return 1. / self.nobs * sumofsq(self.resid) 

1669 

1670 @cache_writable() # for compatibility with RegressionResults 

1671 def scale(self): 

1672 return self.sigma2 

1673 

1674 @cache_readonly 

1675 def bse(self): # allow user to specify? 

1676 """ 

1677 The standard errors of the estimated parameters. 

1678 

1679 AutoReg is estimated using conditional MLE (OLS), so the standard 

1680 errors returned are the OLS standard errors of the coefficients. 

1682 """ 

1683 return np.sqrt(np.diag(self.cov_params())) 

1684 

1685 @cache_readonly 

1686 def aic(self): 

1687 """ 

1688 Akaike Information Criterion using Lütkepohl's definition. 

1689 

1690 :math:`\ln(\hat{\sigma}^2) + 2(1 + df_{model}) / nobs` 

1691 """ 

1692 # This is based on loglike with dropped constant terms ? 

1693 # Lutkepohl 

1694 # return np.log(self.sigma2) + 1./self.model.nobs * self.k_ar 

1695 # Include constant as estimated free parameter and double the loss 

1696 # Stata definition 

1697 # nobs = self.nobs 

1698 # return -2 * self.llf/nobs + 2 * (self.k_ar+self.k_trend)/nobs 

1699 return np.log(self.sigma2) + 2 * (1 + self.df_model) / self.nobs 

1700 

1701 @cache_readonly 

1702 def hqic(self): 

1703 """Hannan-Quinn Information Criterion.""" 

1704 # Lutkepohl 

1705 # return np.log(self.sigma2)+ 2 * np.log(np.log(nobs))/nobs * self.k_ar 

1706 # R uses all estimated parameters rather than just lags 

1707 # Stata 

1708 # nobs = self.nobs 

1709 # return -2 * self.llf/nobs + 2 * np.log(np.log(nobs))/nobs * \ 

1710 # (self.k_ar + self.k_trend) 

1711 nobs = self.nobs 

1712 loglog_n = np.log(np.log(nobs)) 

1713 log_sigma2 = np.log(self.sigma2) 

1714 return (log_sigma2 + 2 * loglog_n / nobs * (1 + self.df_model)) 

1715 

1716 @cache_readonly 

1717 def fpe(self): 

1718 """ 

1719 Final prediction error using Lütkepohl's definition. 

1720 

1721 ((nobs + df_model) / (nobs - df_model)) * sigma2 

1722 """ 

1723 nobs = self.nobs 

1724 df_model = self.df_model 

1725 # Lutkepohl 

1726 return ((nobs + df_model) / (nobs - df_model)) * self.sigma2 

1727 

1728 @cache_readonly 

1729 def bic(self): 

1730 r""" 

1731 Bayes Information Criterion 

1732 

1733 :math:`\ln(\hat{\sigma}^2) + (1 + df_{model}) \ln(nobs) / nobs` 

1734 """ 

1735 # Lutkepohl 

1736 # np.log(self.sigma2) + np.log(nobs)/nobs * self.k_ar 

1737 # Include constant as est. free parameter 

1738 # Stata 

1739 # -2 * self.llf/nobs + np.log(nobs)/nobs * (self.k_ar + self.k_trend) 

1740 nobs = self.nobs 

1741 return np.log(self.sigma2) + (1 + self.df_model) * np.log(nobs) / nobs 

1742 
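# Illustrative check (a sketch, not part of the original source): aic, hqic
# and bic above all share the form log(sigma2) + penalty * (1 + df_model) /
# nobs and differ only in the penalty term.  Assuming a fitted
# AutoRegResults instance named ``res``:
#
# >>> import numpy as np
# >>> np.allclose(res.aic,
# ...             np.log(res.sigma2) + 2 * (1 + res.df_model) / res.nobs)
# True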

1743 @cache_readonly 

1744 def resid(self): 

1745 """ 

1746 The residuals of the model. 

1747 """ 

1748 model = self.model 

1749 endog = model.endog.squeeze() 

1750 return endog[self._hold_back:] - self.fittedvalues 

1751 

1752 def _lag_repr(self): 

1753 """Returns poly repr of an AR, (1 -phi1 L -phi2 L^2-...)""" 

1754 k_ar = len(self.ar_lags) 

1755 ar_params = np.zeros(self._max_lag + 1) 

1756 ar_params[0] = 1 

1757 df_model = self._df_model 

1758 exog = self.model.exog 

1759 k_exog = exog.shape[1] if exog is not None else 0 

1760 params = self._params[df_model - k_ar - k_exog:df_model - k_exog] 

1761 for i, lag in enumerate(self._ar_lags): 

1762 ar_params[lag] = -params[i] 

1763 return ar_params 

1764 

1765 @cache_readonly 

1766 def roots(self): 

1767 """ 

1768 The roots of the AR process. 

1769 

1770 The roots are the solution to 

1771 (1 - arparams[0]*z - arparams[1]*z**2 - ... - arparams[k_ar-1]*z**k_ar) = 0. 

1772 Stability requires that the roots in modulus lie outside the unit 

1773 circle. 
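
Examples
--------
A minimal sketch; the simulated series ``y`` and the fitted model are
illustrative only:

>>> import numpy as np
>>> from statsmodels.tsa.ar_model import AutoReg
>>> y = np.random.default_rng(0).standard_normal(250)
>>> res = AutoReg(y, 2).fit()
>>> stable = bool(np.all(np.abs(res.roots) > 1))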

1774 """ 

1775 lag_repr = self._lag_repr() 

1776 if lag_repr.shape[0] == 1: 

1777 return np.empty(0) 

1778 

1779 return np.roots(lag_repr) ** -1 

1780 

1781 @cache_readonly 

1782 def arfreq(self): 

1783 r""" 

1784 Returns the frequency of the AR roots. 

1785 

1786 This is the solution, x, to z = abs(z)*exp(2j*np.pi*x) where z are the 

1787 roots. 

1788 """ 

1789 z = self.roots 

1790 return np.arctan2(z.imag, z.real) / (2 * np.pi) 

1791 

1792 @cache_readonly 

1793 def fittedvalues(self): 

1794 """ 

1795 The in-sample predicted values of the fitted AR model. 

1796 

1797 The fitted values start after the first `hold_back` observations, 

1798 which are lost when conditioning for the OLS estimation. 

1799 """ 

1800 return self.model.predict(self.params) 

1801 

1802 def test_serial_correlation(self, lags=None, model_df=None): 

1803 """ 

1804 Ljung-Box test for residual serial correlation 

1805 

1806 Parameters 

1807 ---------- 

1808 lags : int 

1809 The maximum number of lags to use in the test. Jointly tests that 

1810 all autocorrelations up to and including lag j are zero for 

1811 j = 1, 2, ..., lags. If None, uses lag=12*(nobs/100)^{1/4}. 

1812 model_df : int 

1813 The model degrees of freedom to subtract from the test's degrees of 

1814 freedom to account for parameter estimation. If None, uses 

1815 the number of AR lags included in the model. 

1816 

1817 Returns 

1818 ------- 

1819 output : DataFrame 

1820 DataFrame containing three columns: the test statistic, the 

1821 p-value of the test, and the degree of freedom used in the test. 

1822 

1823 Notes 

1824 ----- 

1825 Null hypothesis is no serial correlation. 

1826 

1827 If the test degree of freedom is 0 or negative after accounting for 

1828 model_df, then the test statistic's p-value is missing. 

1829 

1830 See Also 

1831 -------- 

1832 statsmodels.stats.diagnostic.acorr_ljungbox 

1833 Ljung-Box test for serial correlation. 
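
Examples
--------
A minimal sketch; the simulated series ``y`` is illustrative only:

>>> import numpy as np
>>> from statsmodels.tsa.ar_model import AutoReg
>>> y = np.random.default_rng(0).standard_normal(250)
>>> res = AutoReg(y, 2).fit()
>>> table = res.test_serial_correlation(lags=5)
>>> list(table.columns)
['Ljung-Box', 'LB P-value', 'DF']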

1834 """ 

1835 # Deferred to prevent circular import 

1836 from statsmodels.stats.diagnostic import acorr_ljungbox 

1837 

1838 lags = int_like(lags, 'lags', optional=True) 

1839 model_df = int_like(model_df, 'model_df', optional=True) 

1840 model_df = len(self.ar_lags) if model_df is None else model_df 

1841 nobs_effective = self.resid.shape[0] 

1842 # Default lags for acorr_ljungbox is 40, but may not always have 

1843 # that many observations 

1844 if lags is None: 

1845 lags = int(min(12 * (nobs_effective / 100) ** (1 / 4), 

1846 nobs_effective - 1)) 

1847 test_stats = acorr_ljungbox(self.resid, lags=lags, boxpierce=False, 

1848 model_df=model_df, return_df=False) 

1849 cols = ['Ljung-Box', 'LB P-value', 'DF'] 

1850 if lags == 1: 

1851 test_stats = [list(test_stats) + [max(0, 1 - model_df)]] 

1852 else: 

1853 df = np.clip(np.arange(1, lags + 1) - model_df, 0, 

1854 np.inf).astype(int) 

1855 test_stats = list(test_stats) + [df] 

1856 test_stats = [[test_stats[i][j] for i in range(3)] for j in 

1857 range(lags)] 

1858 index = pd.RangeIndex(1, lags + 1, name='Lag') 

1859 return pd.DataFrame(test_stats, 

1860 columns=cols, 

1861 index=index) 

1862 

1863 def test_normality(self): 

1864 """ 

1865 Test for normality of standardized residuals. 

1866 

1867 Returns 

1868 ------- 

1869 Series 

1870 Series containing four values, the test statistic and its p-value, 

1871 the skewness and the kurtosis. 

1872 

1873 Notes 

1874 ----- 

1875 Null hypothesis is normality. 

1876 

1877 See Also 

1878 -------- 

1879 statsmodels.stats.stattools.jarque_bera 

1880 The Jarque-Bera test of normality. 
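
Examples
--------
A minimal sketch; the simulated series ``y`` is illustrative only:

>>> import numpy as np
>>> from statsmodels.tsa.ar_model import AutoReg
>>> y = np.random.default_rng(0).standard_normal(250)
>>> res = AutoReg(y, 2).fit()
>>> jb = res.test_normality()
>>> list(jb.index)
['Jarque-Bera', 'P-value', 'Skewness', 'Kurtosis']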

1881 """ 

1882 # Deferred to prevent circular import 

1883 from statsmodels.stats.stattools import jarque_bera 

1884 index = ['Jarque-Bera', 'P-value', 'Skewness', 'Kurtosis'] 

1885 return pd.Series(jarque_bera(self.resid), index=index) 

1886 

1887 def test_heteroskedasticity(self, lags=None): 

1888 """ 

1889 ARCH-LM test of residual heteroskedasticity 

1890 

1891 Parameters 

1892 ---------- 

1893 lags : int 

1894 The maximum number of lags to use in the test. Jointly tests that 

1895 all squared autocorrelations up to and including lag j are zero for 

1896 j = 1, 2, ..., lags. If None, uses lag=12*(nobs/100)^{1/4}. 

1897 

1898 Returns 

1899 ------- 

1900 Series 

1901 Series containing the test statistic and its p-values. 

1902 

1903 See Also 

1904 -------- 

1905 statsmodels.stats.diagnostic.het_arch 

1906 ARCH-LM test. 

1907 statsmodels.stats.diagnostic.acorr_lm 

1908 LM test for autocorrelation. 
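
Examples
--------
A minimal sketch; the simulated series ``y`` is illustrative only:

>>> import numpy as np
>>> from statsmodels.tsa.ar_model import AutoReg
>>> y = np.random.default_rng(0).standard_normal(250)
>>> res = AutoReg(y, 2).fit()
>>> arch = res.test_heteroskedasticity(lags=4)
>>> list(arch.columns)
['ARCH-LM', 'P-value', 'DF']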

1909 """ 

1910 from statsmodels.stats.diagnostic import het_arch 

1911 

1912 lags = int_like(lags, 'lags', optional=True) 

1913 nobs_effective = self.resid.shape[0] 

1914 if lags is None: 

1915 max_lag = (nobs_effective - 1) // 2 

1916 lags = int(min(12 * (nobs_effective / 100) ** (1 / 4), max_lag)) 

1917 out = [] 

1918 for lag in range(1, lags + 1): 

1919 res = het_arch(self.resid, maxlag=lag, autolag=None) 

1920 out.append([res[0], res[1], lag]) 

1921 index = pd.RangeIndex(1, lags + 1, name='Lag') 

1922 cols = ['ARCH-LM', 'P-value', 'DF'] 

1923 return pd.DataFrame(out, columns=cols, index=index) 

1924 

1925 def diagnostic_summary(self): 

1926 """ 

1927 Returns a summary containing standard model diagnostic tests 

1928 

1929 Returns 

1930 ------- 

1931 Summary 

1932 A summary instance with panels for serial correlation tests, 

1933 normality tests and heteroskedasticity tests. 

1934 

1935 See Also 

1936 -------- 

1937 test_serial_correlation 

1938 Test the model's residuals for serial correlation. 

1939 test_normality 

1940 Test the model's residuals for deviations from normality. 

1941 test_heteroskedasticity 

1942 Test the model's residuals for conditional heteroskedasticity. 
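
Examples
--------
A minimal sketch; the simulated series ``y`` is illustrative only:

>>> import numpy as np
>>> from statsmodels.tsa.ar_model import AutoReg
>>> y = np.random.default_rng(0).standard_normal(250)
>>> res = AutoReg(y, 2).fit()
>>> print(res.diagnostic_summary())  # doctest: +SKIP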

1943 """ 

1944 from statsmodels.iolib.table import SimpleTable 

1945 spacer = SimpleTable(['']) 

1946 smry = Summary() 

1947 sc = self.test_serial_correlation() 

1948 sc = sc.loc[sc.DF > 0] 

1949 values = [[idx] + row for idx, row in zip(sc.index, sc.values.tolist())] 

1950 data_fmts = ('%10d', '%10.3f', '%10.3f', '%10d') 

1951 if sc.shape[0]: 

1952 tab = SimpleTable(values, 

1953 headers=['Lag'] + list(sc.columns), 

1954 title='Test of No Serial Correlation', 

1955 header_align='r', data_fmts=data_fmts) 

1956 smry.tables.append(tab) 

1957 smry.tables.append(spacer) 

1958 jb = self.test_normality() 

1959 data_fmts = ('%10.3f', '%10.3f', '%10.3f', '%10.3f') 

1960 tab = SimpleTable([jb.values], headers=list(jb.index), 

1961 title='Test of Normality', 

1962 header_align='r', data_fmts=data_fmts) 

1963 smry.tables.append(tab) 

1964 smry.tables.append(spacer) 

1965 arch_lm = self.test_heteroskedasticity() 

1966 values = [[i + 1] + row for i, row in 

1967 enumerate(arch_lm.values.tolist())] 

1968 data_fmts = ('%10d', '%10.3f', '%10.3f', '%10d') 

1969 tab = SimpleTable(values, 

1970 headers=['Lag'] + list(arch_lm.columns), 

1971 title='Test of Conditional Homoskedasticity', 

1972 header_align='r', data_fmts=data_fmts) 

1973 smry.tables.append(tab) 

1974 return smry 

1975 

1976 @Appender(remove_parameters(AutoReg.predict.__doc__, 'params')) 

1977 def predict(self, start=None, end=None, dynamic=False, exog=None, 

1978 exog_oos=None): 

1979 return self.model.predict(self._params, start=start, end=end, 

1980 dynamic=dynamic, exog=exog, 

1981 exog_oos=exog_oos) 

1982 

1983 @Substitution(predict_params=_predict_params) 

1984 def plot_predict(self, start=None, end=None, dynamic=False, exog=None, 

1985 exog_oos=None, alpha=.05, in_sample=True, fig=None, 

1986 figsize=None): 

1987 """ 

1988 Plot in- and out-of-sample predictions 

1989 

1990 Parameters 

1991 ---------- 

1992%(predict_params)s 

1993 alpha : {float, None} 

1994 The tail probability not covered by the confidence interval. Must 

1995 be in (0, 1). Confidence interval is constructed assuming normally 

1996 distributed shocks. If None, figure will not show the confidence 

1997 interval. 

1998 in_sample : bool 

1999 Flag indicating whether to include the in-sample period in the 

2000 plot. 

2001 fig : Figure 

2002 An existing figure handle. If not provided, a new figure is 

2003 created. 

2004 figsize : tuple[float, float] 

2005 Tuple containing the figure size values. 

2006 

2007 Returns 

2008 ------- 

2009 Figure 

2010 Figure handle containing the plot. 
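
Examples
--------
A minimal sketch (requires matplotlib); the simulated series ``y`` is
illustrative only:

>>> import numpy as np
>>> from statsmodels.tsa.ar_model import AutoReg
>>> y = np.random.default_rng(0).standard_normal(250)
>>> res = AutoReg(y, 2).fit()
>>> fig = res.plot_predict(start=200, end=280, alpha=0.05)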

2011 """ 

2012 from statsmodels.graphics.utils import _import_mpl, create_mpl_fig 

2013 _import_mpl() 

2014 fig = create_mpl_fig(fig, figsize) 

2015 predictions = self.predict(start=start, end=end, dynamic=dynamic, 

2016 exog=exog, exog_oos=exog_oos) 

2017 start = 0 if start is None else start 

2018 end = self.model._index[-1] if end is None else end 

2019 _, _, oos, _ = self.model._get_prediction_index(start, end) 

2020 

2021 ax = fig.add_subplot(111) 

2022 if in_sample: 

2023 ax.plot(predictions) 

2024 elif oos: 

2025 if isinstance(predictions, pd.Series): 

2026 predictions = predictions.iloc[-oos:] 

2027 else: 

2028 predictions = predictions[-oos:] 

2029 else: 

2030 raise ValueError('in_sample is False but there are no ' 

2031 'out-of-sample forecasts to plot.') 

2032 

2033 if oos and alpha is not None: 

2034 pred_oos = np.asarray(predictions)[-oos:] 

2035 ar_params = self._lag_repr() 

2036 ma = arma2ma(ar_params, [1], lags=oos) 
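# The h-step forecast error s.d. is sqrt(sigma2 * sum_{i<h} psi_i**2),
# where psi are the MA(infinity) coefficients of the AR process.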

2037 fc_error = np.sqrt(self.sigma2 * np.cumsum(ma ** 2)) 

2038 quantile = norm.ppf(alpha / 2) 

2039 lower = pred_oos + fc_error * quantile 

2040 upper = pred_oos + fc_error * -quantile 

2041 label = "{0:.0%} confidence interval".format(1 - alpha) 

2042 x = ax.get_lines()[-1].get_xdata() 

2043 ax.fill_between(x[-oos:], lower, upper, 

2044 color='gray', alpha=.5, label=label) 

2045 

2046 ax.legend(loc='best') 

2047 

2048 return fig 

2049 

2050 def plot_diagnostics(self, lags=10, fig=None, figsize=None): 

2051 """ 

2052 Diagnostic plots for standardized residuals 

2053 

2054 Parameters 

2055 ---------- 

2056 lags : int, optional 

2057 Number of lags to include in the correlogram. Default is 10. 

2058 fig : Figure, optional 

2059 If given, subplots are created in this figure instead of in a new 

2060 figure. Note that the 2x2 grid will be created in the provided 

2061 figure using `fig.add_subplot()`. 

2062 figsize : tuple, optional 

2063 If a figure is created, this argument allows specifying a size. 

2064 The tuple is (width, height). 

2065 

2066 Notes 

2067 ----- 

2068 Produces a 2x2 plot grid with the following plots (ordered row-wise 

2069 from top left): 

2070 

2071 1. Standardized residuals over time 

2072 2. Histogram plus estimated density of standardized residuals, along 

2073 with a Normal(0,1) density plotted for reference. 

2074 3. Normal Q-Q plot, with Normal reference line. 

2075 4. Correlogram 

2076 

2077 See Also 

2078 -------- 

2079 statsmodels.graphics.gofplots.qqplot 

2080 statsmodels.graphics.tsaplots.plot_acf 
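
Examples
--------
A minimal sketch (requires matplotlib); the simulated series ``y`` is
illustrative only:

>>> import numpy as np
>>> from statsmodels.tsa.ar_model import AutoReg
>>> y = np.random.default_rng(0).standard_normal(250)
>>> res = AutoReg(y, 2).fit()
>>> fig = res.plot_diagnostics(lags=20)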

2081 """ 

2082 from statsmodels.graphics.utils import _import_mpl, create_mpl_fig 

2083 _import_mpl() 

2084 fig = create_mpl_fig(fig, figsize) 

2085 # Eliminate residuals associated with burned or diffuse likelihoods 

2086 resid = self.resid 

2087 

2088 # Top-left: residuals vs time 

2089 ax = fig.add_subplot(221) 

2090 if hasattr(self.model.data, 'dates') and self.data.dates is not None: 

2091 x = self.model.data.dates._mpl_repr() 

2092 x = x[self.model.hold_back:] 

2093 else: 

2094 hold_back = self.model.hold_back 

2095 x = hold_back + np.arange(self.resid.shape[0]) 

2096 std_resid = resid / np.sqrt(self.sigma2) 

2097 ax.plot(x, std_resid) 

2098 ax.hlines(0, x[0], x[-1], alpha=0.5) 

2099 ax.set_xlim(x[0], x[-1]) 

2100 ax.set_title('Standardized residual') 

2101 

2102 # Top-right: histogram, Gaussian kernel density, Normal density 

2103 # Can only do histogram and Gaussian kernel density on the non-null 

2104 # elements 

2105 std_resid_nonmissing = std_resid[~(np.isnan(resid))] 

2106 ax = fig.add_subplot(222) 

2107 

2108 # gh5792: Remove except after support for matplotlib>2.1 required 

2109 try: 

2110 ax.hist(std_resid_nonmissing, density=True, label='Hist') 

2111 except AttributeError: 

2112 ax.hist(std_resid_nonmissing, normed=True, label='Hist') 

2113 

2114 kde = gaussian_kde(std_resid_nonmissing) 

2115 xlim = (-1.96 * 2, 1.96 * 2) 

2116 x = np.linspace(xlim[0], xlim[1]) 

2117 ax.plot(x, kde(x), label='KDE') 

2118 ax.plot(x, norm.pdf(x), label='N(0,1)') 

2119 ax.set_xlim(xlim) 

2120 ax.legend() 

2121 ax.set_title('Histogram plus estimated density') 

2122 

2123 # Bottom-left: QQ plot 

2124 ax = fig.add_subplot(223) 

2125 from statsmodels.graphics.gofplots import qqplot 

2126 qqplot(std_resid, line='s', ax=ax) 

2127 ax.set_title('Normal Q-Q') 

2128 

2129 # Bottom-right: Correlogram 

2130 ax = fig.add_subplot(224) 

2131 from statsmodels.graphics.tsaplots import plot_acf 

2132 plot_acf(resid, ax=ax, lags=lags) 

2133 ax.set_title('Correlogram') 

2134 

2135 ax.set_ylim(-1, 1) 

2136 

2137 return fig 

2138 

2139 def summary(self, alpha=.05): 

2140 """ 

2141 Summarize the Model 

2142 

2143 Parameters 

2144 ---------- 

2145 alpha : float, optional 

2146 Significance level for the confidence intervals. 

2147 

2148 Returns 

2149 ------- 

2150 smry : Summary instance 

2151 This holds the summary table and text, which can be printed or 

2152 converted to various output formats. 

2153 

2154 See Also 

2155 -------- 

2156 statsmodels.iolib.summary.Summary 
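
Examples
--------
A minimal sketch; the simulated series ``y`` is illustrative only:

>>> import numpy as np
>>> from statsmodels.tsa.ar_model import AutoReg
>>> y = np.random.default_rng(0).standard_normal(250)
>>> res = AutoReg(y, 2).fit()
>>> print(res.summary())  # doctest: +SKIP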

2157 """ 

2158 model = self.model 

2159 

2160 title = model.__class__.__name__ + ' Model Results' 

2161 method = 'Conditional MLE' 

2162 # get sample 

2163 start = self._hold_back 

2164 if self.data.dates is not None: 

2165 dates = self.data.dates 

2166 sample = [dates[start].strftime('%m-%d-%Y')] 

2167 sample += ['- ' + dates[-1].strftime('%m-%d-%Y')] 

2168 else: 

2169 sample = [str(start), str(len(self.data.orig_endog))] 

2170 model = model.__class__.__name__ 

2171 if self.model.seasonal: 

2172 model = 'Seas. ' + model 

2173 if len(self.ar_lags) < self._max_lag: 

2174 model = 'Restr. ' + model 

2175 if self.model.exog is not None: 

2176 model += '-X' 

2177 

2178 order = '({0})'.format(self._max_lag) 

2179 dep_name = str(self.model.endog_names) 

2180 top_left = [('Dep. Variable:', [dep_name]), 

2181 ('Model:', [model + order]), 

2182 ('Method:', [method]), 

2183 ('Date:', None), 

2184 ('Time:', None), 

2185 ('Sample:', [sample[0]]), 

2186 ('', [sample[1]]) 

2187 ] 

2188 

2189 top_right = [ 

2190 ('No. Observations:', [str(len(self.model.endog))]), 

2191 ('Log Likelihood', ["%#5.3f" % self.llf]), 

2192 ('S.D. of innovations', ["%#5.3f" % self.sigma2 ** .5]), 

2193 ('AIC', ["%#5.3f" % self.aic]), 

2194 ('BIC', ["%#5.3f" % self.bic]), 

2195 ('HQIC', ["%#5.3f" % self.hqic])] 

2196 

2197 smry = Summary() 

2198 smry.add_table_2cols(self, gleft=top_left, gright=top_right, 

2199 title=title) 

2200 smry.add_table_params(self, alpha=alpha, use_t=False) 

2201 

2202 # Make the roots table 

2203 from statsmodels.iolib.table import SimpleTable 

2204 

2205 if self._max_lag: 

2206 arstubs = ["AR.%d" % i for i in range(1, self._max_lag + 1)] 

2207 stubs = arstubs 

2208 roots = self.roots 

2209 freq = self.arfreq 

2210 modulus = np.abs(roots) 

2211 data = np.column_stack((roots.real, roots.imag, modulus, freq)) 

2212 roots_table = SimpleTable([('%17.4f' % row[0], 

2213 '%+17.4fj' % row[1], 

2214 '%17.4f' % row[2], 

2215 '%17.4f' % row[3]) for row in data], 

2216 headers=[' Real', 

2217 ' Imaginary', 

2218 ' Modulus', 

2219 ' Frequency'], 

2220 title="Roots", 

2221 stubs=stubs) 

2222 

2223 smry.tables.append(roots_table) 

2224 return smry 

2225 

2226 

2227class AutoRegResultsWrapper(wrap.ResultsWrapper): 

2228 _attrs = {} 

2229 _wrap_attrs = wrap.union_dicts( 

2230 tsa_model.TimeSeriesResultsWrapper._wrap_attrs, _attrs) 

2231 _methods = {} 

2232 _wrap_methods = wrap.union_dicts( 

2233 tsa_model.TimeSeriesResultsWrapper._wrap_methods, _methods) 

2234 

2235 

2236wrap.populate_wrapper(AutoRegResultsWrapper, AutoRegResults) 

2237 

2238doc = Docstring(AutoReg.__doc__) 

2239_auto_reg_params = doc.extract_parameters(['trend', 'seasonal', 'exog', 

2240 'hold_back', 'period', 'missing'], 

2241 4) 

2242 

2243 

2244@Substitution(auto_reg_params=_auto_reg_params) 

2245def ar_select_order(endog, maxlag, ic='bic', glob=False, trend='c', 

2246 seasonal=False, exog=None, hold_back=None, period=None, 

2247 missing='none'): 

2248 """ 

2249 Autoregressive AR-X(p) model order selection. 

2250 

2251 Parameters 

2252 ---------- 

2253 endog : array_like 

2254 A 1-d endogenous response variable. The dependent variable. 

2255 maxlag : int 

2256 The maximum lag to consider. 

2257 ic : {'aic', 'hqic', 'bic'} 

2258 The information criterion to use in the selection. 

2259 glob : bool 

2260 Flag indicating whether to use a global search across all combinations 

2261 of lags. In practice, this option is not computationally feasible when 

2262 maxlag is larger than 15 (or perhaps 20) since the global search 

2263 requires fitting 2**maxlag models. 

2264%(auto_reg_params)s 

2265 

2266 Returns 

2267 ------- 

2268 AROrderSelectionResults 

2269 A results holder containing the model and the complete set of 

2270 information criteria for all models fit. 

2271 

2272 Examples 

2273 -------- 

2274 >>> import statsmodels.api as sm 

>>> from statsmodels.tsa.ar_model import ar_select_order 

2275 >>> data = sm.datasets.sunspots.load_pandas().data['SUNACTIVITY'] 

2276 

2277 Determine the optimal lag structure 

2278 

2279 >>> mod = ar_select_order(data, maxlag=13) 

2280 >>> mod.ar_lags 

2281 array([1, 2, 3, 4, 5, 6, 7, 8, 9]) 

2282 

2283 Determine the optimal lag structure with seasonal terms 

2284 

2285 >>> mod = ar_select_order(data, maxlag=13, seasonal=True, period=12) 

2286 >>> mod.ar_lags 

2287 array([1, 2, 3, 4, 5, 6, 7, 8, 9]) 

2288 

2289 Globally determine the optimal lag structure 

2290 

2291 >>> mod = ar_select_order(data, maxlag=13, glob=True) 

2292 >>> mod.ar_lags 

2293 array([1, 2, 9]) 
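
The returned result holds an unfit model; estimate the selected
specification with:

>>> res = mod.model.fit()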

2294 """ 

2295 full_mod = AutoReg(endog, maxlag, trend=trend, seasonal=seasonal, 

2296 exog=exog, hold_back=hold_back, period=period, 

2297 missing=missing) 

2298 nexog = full_mod.exog.shape[1] if full_mod.exog is not None else 0 

2299 y, x = full_mod._y, full_mod._x 

2300 base_col = x.shape[1] - nexog - maxlag 

2301 sel = np.ones(x.shape[1], dtype=bool) 

2302 ics = [] 

2303 

2304 def compute_ics(res): 

2305 nobs = res.nobs 

2306 df_model = res.df_model 

2307 sigma2 = 1. / nobs * sumofsq(res.resid) 

2308 

2309 res = SimpleNamespace(nobs=nobs, df_model=df_model, sigma2=sigma2) 

2310 

2311 aic = AutoRegResults.aic.func(res) 

2312 bic = AutoRegResults.bic.func(res) 

2313 hqic = AutoRegResults.hqic.func(res) 

2314 

2315 return aic, bic, hqic 

2316 

2317 def ic_no_data(): 

2318 """Fake mod and results to handle no regressor case""" 

2319 mod = SimpleNamespace(nobs=y.shape[0], 

2320 endog=y, 

2321 exog=np.empty((y.shape[0], 0))) 

2322 llf = OLS.loglike(mod, np.empty(0)) 

2323 res = SimpleNamespace(resid=y, nobs=y.shape[0], llf=llf, 

2324 df_model=0, k_constant=0) 

2325 

2326 return compute_ics(res) 

2327 

2328 if not glob: 

2329 sel[base_col: base_col + maxlag] = False 

2330 for i in range(maxlag + 1): 

2331 sel[base_col:base_col + i] = True 

2332 if not np.any(sel): 

2333 ics.append((0, ic_no_data())) 

2334 continue 

2335 res = OLS(y, x[:, sel]).fit() 

2336 lags = tuple(j for j in range(1, i + 1)) 

2337 lags = 0 if not lags else lags 

2338 ics.append((lags, compute_ics(res))) 

2339 else: 
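# Enumerate all 2**maxlag lag-inclusion patterns by unpacking the bits
# of an int32 counter (assumes little-endian byte order); after
# reversing each byte, column j of ``masks`` flags whether lag j + 1
# enters the model.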

2340 bits = np.arange(2 ** maxlag, dtype=np.int32)[:, None] 

2341 bits = bits.view(np.uint8) 

2342 bits = np.unpackbits(bits).reshape(-1, 32) 

2343 for i in range(4): 

2344 bits[:, 8 * i:8 * (i + 1)] = bits[:, 8 * i:8 * (i + 1)][:, ::-1] 

2345 masks = bits[:, :maxlag] 

2346 for mask in masks: 

2347 sel[base_col:base_col + maxlag] = mask 

2348 if not np.any(sel): 

2349 ics.append((0, ic_no_data())) 

2350 continue 

2351 res = OLS(y, x[:, sel]).fit() 

2352 lags = tuple(np.where(mask)[0] + 1) 

2353 lags = 0 if not lags else lags 

2354 ics.append((lags, compute_ics(res))) 

2355 

2356 key_loc = {'aic': 0, 'bic': 1, 'hqic': 2}[ic] 

2357 ics = sorted(ics, key=lambda x: x[1][key_loc]) 

2358 selected_model = ics[0][0] 

2359 mod = AutoReg(endog, selected_model, trend=trend, seasonal=seasonal, 

2360 exog=exog, hold_back=hold_back, period=period, 

2361 missing=missing) 

2362 return AROrderSelectionResults(mod, ics, trend, seasonal, period) 

2363 

2364 

2365class AROrderSelectionResults(object): 

2366 """ 

2367 Results from an AR order selection 

2368 

2369 Contains the information criteria for all fitted model orders. 

2370 """ 

2371 

2372 def __init__(self, model, ics, trend, seasonal, period): 

2373 self._model = model 

2374 self._ics = ics 

2375 self._trend = trend 

2376 self._seasonal = seasonal 

2377 self._period = period 

2378 aic = sorted(ics, key=lambda r: r[1][0]) 

2379 self._aic = dict([(key, val[0]) for key, val in aic]) 

2380 bic = sorted(ics, key=lambda r: r[1][1]) 

2381 self._bic = dict([(key, val[1]) for key, val in bic]) 

2382 hqic = sorted(ics, key=lambda r: r[1][2]) 

2383 self._hqic = dict([(key, val[2]) for key, val in hqic]) 

2384 

2385 @property 

2386 def model(self): 

2387 """The model selected using the chosen information criterion.""" 

2388 return self._model 

2389 

2390 @property 

2391 def seasonal(self): 

2392 """Flag indicating if a seasonal component is included.""" 

2393 return self._seasonal 

2394 

2395 @property 

2396 def trend(self): 

2397 """The trend included in the model selection.""" 

2398 return self._trend 

2399 

2400 @property 

2401 def period(self): 

2402 """The period of the seasonal component.""" 

2403 return self._period 

2404 

2405 @property 

2406 def aic(self): 

2407 """ 

2408 The Akaike information criterion for the models fit. 

2409 

2410 Returns 

2411 ------- 

2412 dict[tuple, float] 

2413 """ 

2414 return self._aic 

2415 

2416 @property 

2417 def bic(self): 

2418 """ 

2419 The Bayesian (Schwarz) information criterion for the models fit. 

2420 

2421 Returns 

2422 ------- 

2423 dict[tuple, float] 

2424 """ 

2425 return self._bic 

2426 

2427 @property 

2428 def hqic(self): 

2429 """ 

2430 The Hannan-Quinn information criterion for the models fit. 

2431 

2432 Returns 

2433 ------- 

2434 dict[tuple, float] 

2435 """ 

2436 return self._hqic 

2437 

2438 @property 

2439 def ar_lags(self): 

2440 """The lags included in the selected model.""" 

2441 return self._model.ar_lags