Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Statistical tests to be used in conjunction with the models 

3 

4Notes 

5----- 

6These functions have not been formally tested. 

7""" 

8 

9from scipy import stats 

10import numpy as np 

11from statsmodels.tools.sm_exceptions import ValueWarning 

12 

13 

# TODO: these are pretty straightforward but they should be tested
def durbin_watson(resids, axis=0):
    r"""
    Calculates the Durbin-Watson statistic

    Parameters
    ----------
    resids : array_like
        Residuals to test for serial correlation.
    axis : int, optional
        Axis along which the statistic is computed.  Default is 0.

    Returns
    -------
    dw : float, array_like
        The Durbin-Watson statistic.

    Notes
    -----
    The null hypothesis of the test is that there is no serial correlation.
    The Durbin-Watson test statistics is defined as:

    .. math::

       \sum_{t=2}^T((e_t - e_{t-1})^2)/\sum_{t=1}^Te_t^2

    The test statistic is approximately equal to 2*(1-r) where ``r`` is the
    sample autocorrelation of the residuals. Thus, for r == 0, indicating no
    serial correlation, the test statistic equals 2. This statistic will
    always be between 0 and 4. The closer to 0 the statistic, the more
    evidence for positive serial correlation. The closer to 4, the more
    evidence for negative serial correlation.
    """
    resids = np.asarray(resids)
    # Numerator: sum of squared first differences of the residuals.
    first_diffs = np.diff(resids, 1, axis=axis)
    # Denominator: total sum of squared residuals.
    return np.sum(first_diffs ** 2, axis=axis) / np.sum(resids ** 2, axis=axis)

48 

49 

def omni_normtest(resids, axis=0):
    """
    Omnibus test for normality

    Parameters
    ----------
    resids : array_like
        Data to test for normality.
    axis : int, optional
        Default is 0

    Returns
    -------
    Chi^2 score, two-tail probability
    """
    # TODO: change to exception in summary branch and catch in summary()
    # behavior changed between scipy 0.9 and 0.10
    resids = np.asarray(resids)
    n_obs = resids.shape[axis]
    if n_obs >= 8:
        return stats.normaltest(resids, axis=axis)

    # Too few observations for the test to be meaningful; warn and bail out.
    from warnings import warn
    warn("omni_normtest is not valid with less than 8 observations; %i "
         "samples were given." % int(n_obs), ValueWarning)
    return np.nan, np.nan

75 

76 

def jarque_bera(resids, axis=0):
    r"""
    The Jarque-Bera test of normality.

    Parameters
    ----------
    resids : array_like
        Data to test for normality. Usually regression model residuals that
        are mean 0.
    axis : int, optional
        Axis to use if data has more than 1 dimension. Default is 0.

    Returns
    -------
    JB : {float, ndarray}
        The Jarque-Bera test statistic.
    JBpv : {float, ndarray}
        The pvalue of the test statistic.
    skew : {float, ndarray}
        Estimated skewness of the data.
    kurtosis : {float, ndarray}
        Estimated kurtosis of the data.

    Notes
    -----
    Each output returned has 1 dimension fewer than data

    The Jarque-Bera test statistic tests the null that the data is normally
    distributed against an alternative that the data follow some other
    distribution. The test statistic is based on two moments of the data,
    the skewness, and the kurtosis, and has an asymptotic :math:`\chi^2_2`
    distribution.

    The test statistic is defined

    .. math:: JB = n(S^2/6+(K-3)^2/24)

    where n is the number of data points, S is the sample skewness, and K is
    the sample kurtosis of the data.
    """
    resids = np.asarray(resids)
    n = resids.shape[axis]

    # Sample skewness and (Pearson, non-excess) kurtosis of the residuals.
    skew = stats.skew(resids, axis=axis)
    kurtosis = 3 + stats.kurtosis(resids, axis=axis)

    # Jarque-Bera statistic and its chi-squared(2) p-value.
    jb = (n / 6.) * (skew ** 2 + (1 / 4.) * (kurtosis - 3) ** 2)
    jb_pv = stats.chi2.sf(jb, 2)
    return jb, jb_pv, skew, kurtosis

128 

129 

def robust_skewness(y, axis=0):
    """
    Calculates the four skewness measures in Kim & White

    Parameters
    ----------
    y : array_like
        Data to compute use in the estimator.
    axis : int or None, optional
        Axis along which the skewness measures are computed. If `None`, the
        entire array is used.

    Returns
    -------
    sk1 : ndarray
        The standard skewness estimator.
    sk2 : ndarray
        Skewness estimator based on quartiles.
    sk3 : ndarray
        Skewness estimator based on mean-median difference, standardized by
        absolute deviation.
    sk4 : ndarray
        Skewness estimator based on mean-median difference, standardized by
        standard deviation.

    Notes
    -----
    The robust skewness measures are defined

    .. math::

        SK_{2}=\\frac{\\left(q_{.75}-q_{.5}\\right)
        -\\left(q_{.5}-q_{.25}\\right)}{q_{.75}-q_{.25}}

    .. math::

        SK_{3}=\\frac{\\mu-\\hat{q}_{0.5}}
        {\\hat{E}\\left[\\left|y-\\hat{\\mu}\\right|\\right]}

    .. math::

        SK_{4}=\\frac{\\mu-\\hat{q}_{0.5}}{\\hat{\\sigma}}

    .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of
       skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
       March 2004.
    """
    # Ensure ndarray so that axis=None with list input does not fail on
    # y.ravel() (previously only ndarray inputs worked with axis=None).
    y = np.asarray(y)
    if axis is None:
        y = y.ravel()
        axis = 0

    y = np.sort(y, axis)

    q1, q2, q3 = np.percentile(y, [25.0, 50.0, 75.0], axis=axis)

    mu = y.mean(axis)
    # Restore the reduced axis with length 1 so mu and q2 broadcast
    # against y.  (axis is always an int here: None was normalized above.)
    shape = list(mu.shape)
    shape.insert(axis, 1)
    shape = tuple(shape)

    mu_b = np.reshape(mu, shape)
    q2_b = np.reshape(q2, shape)

    # Biased (population) standard deviation, matching Kim & White.
    sigma = np.sqrt(np.mean(((y - mu_b) ** 2), axis))

    sk1 = stats.skew(y, axis=axis)
    sk2 = (q1 + q3 - 2.0 * q2) / (q3 - q1)
    sk3 = (mu - q2) / np.mean(abs(y - q2_b), axis=axis)
    sk4 = (mu - q2) / sigma

    return sk1, sk2, sk3, sk4

204 

205 

206def _kr3(y, alpha=5.0, beta=50.0): 

207 """ 

208 KR3 estimator from Kim & White 

209 

210 Parameters 

211 ---------- 

212 y : array_like, 1-d 

213 Data to compute use in the estimator. 

214 alpha : float, optional 

215 Lower cut-off for measuring expectation in tail. 

216 beta : float, optional 

217 Lower cut-off for measuring expectation in center. 

218 

219 Returns 

220 ------- 

221 kr3 : float 

222 Robust kurtosis estimator based on standardized lower- and upper-tail 

223 expected values 

224 

225 Notes 

226 ----- 

227 .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of 

228 skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, 

229 March 2004. 

230 """ 

231 perc = (alpha, 100.0 - alpha, beta, 100.0 - beta) 

232 lower_alpha, upper_alpha, lower_beta, upper_beta = np.percentile(y, perc) 

233 l_alpha = np.mean(y[y < lower_alpha]) 

234 u_alpha = np.mean(y[y > upper_alpha]) 

235 

236 l_beta = np.mean(y[y < lower_beta]) 

237 u_beta = np.mean(y[y > upper_beta]) 

238 

239 return (u_alpha - l_alpha) / (u_beta - l_beta) 

240 

241 

def expected_robust_kurtosis(ab=(5.0, 50.0), dg=(2.5, 25.0)):
    """
    Calculates the expected value of the robust kurtosis measures in Kim and
    White assuming the data are normally distributed.

    Parameters
    ----------
    ab : iterable, optional
        Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail
        quantile cut-off for measuring the extreme tail and beta is the central
        quantile cutoff for the standardization of the measure
    dg : iterable, optional
        Contains 100*(delta, gamma) in the kr4 measure where delta is the tail
        quantile for measuring extreme values and gamma is the central quantile
        used in the standardization of the measure

    Returns
    -------
    ekr : ndarray, 4-element
        Contains the expected values of the 4 robust kurtosis measures

    Notes
    -----
    See `robust_kurtosis` for definitions of the robust kurtosis measures
    """
    alpha, beta = ab
    delta, gamma = dg
    ppf = stats.norm.ppf
    pdf = stats.norm.pdf

    # Standard-normal octiles used by the KR2 measure.
    q1, q2, q3, q5, q6, q7 = ppf(np.array((1.0, 2.0, 3.0, 5.0, 6.0, 7.0)) / 8)
    # Quantiles needed for KR3 and KR4 under normality.
    q_alpha, q_beta = ppf(np.array((alpha / 100.0, beta / 100.0)))
    q_delta, q_gamma = ppf(np.array((delta / 100.0, gamma / 100.0)))

    expected_value = np.zeros(4)
    # KR1: standard (Pearson) kurtosis of a normal is 3.
    expected_value[0] = 3
    # KR2: octile-based spread ratio.
    expected_value[1] = ((q7 - q5) + (q3 - q1)) / (q6 - q2)
    # KR3: ratio of standardized conditional tail expectations.
    expected_value[2] = (2 * pdf(q_alpha) / alpha) / (2 * pdf(q_beta) / beta)
    # KR4: ratio of symmetric quantile spreads.
    expected_value[3] = (-2.0 * q_delta) / (-2.0 * q_gamma)

    return expected_value

285 

286 

def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True):
    """
    Calculates the four kurtosis measures in Kim & White

    Parameters
    ----------
    y : array_like
        Data to compute use in the estimator.
    axis : int or None, optional
        Axis along which the kurtosis are computed. If `None`, the
        entire array is used.
    ab : iterable, optional
        Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail
        quantile cut-off for measuring the extreme tail and beta is the central
        quantile cutoff for the standardization of the measure
    dg : iterable, optional
        Contains 100*(delta, gamma) in the kr4 measure where delta is the tail
        quantile for measuring extreme values and gamma is the central quantile
        used in the standardization of the measure
    excess : bool, optional
        If true (default), computed values are excess of those for a standard
        normal distribution.

    Returns
    -------
    kr1 : ndarray
        The standard kurtosis estimator.
    kr2 : ndarray
        Kurtosis estimator based on octiles.
    kr3 : ndarray
        Kurtosis estimators based on exceedance expectations.
    kr4 : ndarray
        Kurtosis measure based on the spread between high and low quantiles.

    Notes
    -----
    The robust kurtosis measures are defined

    .. math::

        KR_{2}=\\frac{\\left(\\hat{q}_{.875}-\\hat{q}_{.625}\\right)
        +\\left(\\hat{q}_{.375}-\\hat{q}_{.125}\\right)}
        {\\hat{q}_{.75}-\\hat{q}_{.25}}

    .. math::

        KR_{3}=\\frac{\\hat{E}\\left(y|y>\\hat{q}_{1-\\alpha}\\right)
        -\\hat{E}\\left(y|y<\\hat{q}_{\\alpha}\\right)}
        {\\hat{E}\\left(y|y>\\hat{q}_{1-\\beta}\\right)
        -\\hat{E}\\left(y|y<\\hat{q}_{\\beta}\\right)}

    .. math::

        KR_{4}=\\frac{\\hat{q}_{1-\\delta}-\\hat{q}_{\\delta}}
        {\\hat{q}_{1-\\gamma}-\\hat{q}_{\\gamma}}

    where :math:`\\hat{q}_{p}` is the estimated quantile at :math:`p`.

    .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of
       skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
       March 2004.
    """
    # Collapse to 1-d when asked (axis=None) or when y is effectively 1-d
    # but carries extra singleton dimensions.
    flatten = axis is None or (y.squeeze().ndim == 1 and y.ndim != 1)
    if flatten:
        y = y.ravel()
        axis = 0

    alpha, beta = ab
    delta, gamma = dg

    # Octiles for KR2 plus the delta/gamma quantiles for KR4, in one call.
    percentiles = (12.5, 25.0, 37.5, 62.5, 75.0, 87.5,
                   delta, 100.0 - delta, gamma, 100.0 - gamma)
    e1, e2, e3, e5, e6, e7, fd, f1md, fg, f1mg = np.percentile(y, percentiles,
                                                               axis=axis)

    # Subtract the normal-distribution expected values only when excess=True.
    if excess:
        expected_value = expected_robust_kurtosis(ab, dg)
    else:
        expected_value = np.zeros(4)

    kr1 = stats.kurtosis(y, axis, False) - expected_value[0]
    kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2) - expected_value[1]
    if y.ndim == 1:
        kr3 = _kr3(y, alpha, beta) - expected_value[2]
    else:
        # Apply the 1-d estimator along the requested axis.
        kr3 = np.apply_along_axis(_kr3, axis, y, alpha, beta)
        kr3 = kr3 - expected_value[2]
    kr4 = (f1md - fd) / (f1mg - fg) - expected_value[3]
    return kr1, kr2, kr3, kr4

374 

375 

def _medcouple_1d(y):
    """
    Calculates the medcouple robust measure of skew.

    Parameters
    ----------
    y : array_like, 1-d
        Data used in the estimator; must be squeezable to 1-d.

    Returns
    -------
    mc : float
        The medcouple statistic

    Notes
    -----
    The current algorithm requires a O(N**2) memory allocations, and so may
    not work for very large arrays (N>10000).

    .. [*] M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed
       distributions" Computational Statistics & Data Analysis, vol. 52, pp.
       5186-5201, August 2008.
    """

    # Parameter changes the algorithm to the slower for large n

    y = np.squeeze(np.asarray(y))
    if y.ndim != 1:
        raise ValueError("y must be squeezable to a 1-d array")

    y = np.sort(y)

    n = y.shape[0]
    # Sample median: midpoint of the two central order statistics for even
    # n, the middle order statistic for odd n.
    if n % 2 == 0:
        mf = (y[n // 2 - 1] + y[n // 2]) / 2
    else:
        mf = y[(n - 1) // 2]

    # Center on the median and split into the lower (<= 0) and upper (>= 0)
    # halves; points equal to the median appear in both.
    z = y - mf
    lower = z[z <= 0.0]
    upper = z[z >= 0.0]
    # Column vector so that (upper op lower) broadcasts to the full
    # len(upper) x len(lower) kernel matrix.
    upper = upper[:, None]
    standardization = upper - lower
    # Pairs where both points equal the median give 0/0; setting the
    # denominator to inf makes their kernel value 0 for now (fixed up below).
    is_zero = np.logical_and(lower == 0.0, upper == 0.0)
    standardization[is_zero] = np.inf
    spread = upper + lower
    # Kernel h(z_u, z_l) = ((z_u + z_l) / (z_u - z_l)) for all pairs.
    h = spread / standardization
    # GH5395
    # Ties at the median need the special kernel values -1/0/+1 rather
    # than 0 for every tied pair.
    num_ties = np.sum(lower == 0.0)
    if num_ties:
        # Replacements has -1 above the anti-diagonal, 0 on the anti-diagonal,
        # and 1 below the anti-diagonal
        replacements = np.ones((num_ties, num_ties)) - np.eye(num_ties)
        replacements -= 2 * np.triu(replacements)
        # Convert diagonal to anti-diagonal
        replacements = np.fliplr(replacements)
        # Always replace upper right block
        # (y is sorted, so the tied pairs occupy the first rows and the
        # last columns of h)
        h[:num_ties, -num_ties:] = replacements

    return np.median(h)

436 

437 

def medcouple(y, axis=0):
    """
    Calculate the medcouple robust measure of skew.

    Parameters
    ----------
    y : array_like
        Data to compute use in the estimator.
    axis : {int, None}
        Axis along which the medcouple statistic is computed. If `None`, the
        entire array is used.

    Returns
    -------
    mc : ndarray
        The medcouple statistic with the same shape as `y`, with the specified
        axis removed.

    Notes
    -----
    The current algorithm requires a O(N**2) memory allocations, and so may
    not work for very large arrays (N>10000).

    .. [*] M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed
       distributions" Computational Statistics & Data Analysis, vol. 52, pp.
       5186-5201, August 2008.
    """
    # Force float to avoid integer-division artifacts in the kernel (GH 4243).
    data = np.asarray(y, dtype=np.double)
    if axis is None:
        # Treat the flattened array as a single sample.
        return _medcouple_1d(data.ravel())
    # Otherwise compute the 1-d statistic along the requested axis.
    return np.apply_along_axis(_medcouple_1d, axis, data)