1""" 

2Robust linear models with support for the M-estimators listed under 

3:ref:`norms <norms>`. 

4 

5References 

6---------- 

7PJ Huber. 'Robust Statistics' John Wiley and Sons, Inc., New York. 1981. 

8 

9PJ Huber. 1973, 'The 1972 Wald Memorial Lectures: Robust Regression: 

10 Asymptotics, Conjectures, and Monte Carlo.' The Annals of Statistics, 

11 1.5, 799-821. 

12 

13R Venables, B Ripley. 'Modern Applied Statistics in S' Springer, New York, 

14 2002. 

15""" 

import numpy as np
import scipy.stats as stats

from statsmodels.tools.decorators import cache_readonly
from statsmodels.tools.sm_exceptions import ConvergenceWarning
import statsmodels.regression.linear_model as lm
import statsmodels.regression._tools as reg_tools
import statsmodels.robust.norms as norms
import statsmodels.robust.scale as scale
import statsmodels.base.model as base
import statsmodels.base.wrapper as wrap

__all__ = ['RLM']


def _check_convergence(criterion, iteration, tol, maxiter):
    # Converged when the change in the criterion between successive
    # iterations is within tol, or once maxiter has been reached.
    cond = np.abs(criterion[iteration] - criterion[iteration - 1])
    return not (np.any(cond > tol) and iteration < maxiter)


class RLM(base.LikelihoodModel):
    __doc__ = """
    Robust Linear Model

    Estimate a robust linear model via iteratively reweighted least squares
    given a robust criterion estimator.

    %(params)s
    M : statsmodels.robust.norms.RobustNorm, optional
        The robust criterion function for downweighting outliers.
        The current options are LeastSquares, HuberT, RamsayE, AndrewWave,
        TrimmedMean, Hampel, and TukeyBiweight. The default is HuberT().
        See statsmodels.robust.norms for more information.
    %(extra_params)s

    Attributes
    ----------
    df_model : float
        The degrees of freedom of the model. The number of regressors p
        less one for the intercept. Note that the reported model degrees
        of freedom does not count the intercept as a regressor, though
        the model is assumed to have an intercept.
    df_resid : float
        The residual degrees of freedom. The number of observations n
        less the number of regressors p. Note that here p does include
        the intercept as using a degree of freedom.
    endog : ndarray
        See above. Note that endog is a reference to the data so that if
        data is already an array and it is changed, then `endog` changes
        as well.
    exog : ndarray
        See above. Note that exog is a reference to the data so that if
        data is already an array and it is changed, then `exog` changes
        as well.
    M : statsmodels.robust.norms.RobustNorm
        See above. The instantiated robust norm used in estimation.
    nobs : float
        The number of observations n.
    pinv_wexog : ndarray
        The pseudoinverse of the design / exogenous data array. Note that
        RLM has no whiten method, so this is just the pseudoinverse of the
        design.
    normalized_cov_params : ndarray
        The p x p normalized covariance of the design / exogenous data.
        This is approximately equal to (X.T X)^(-1).

    Examples
    --------
    >>> import statsmodels.api as sm
    >>> data = sm.datasets.stackloss.load(as_pandas=False)
    >>> data.exog = sm.add_constant(data.exog)
    >>> rlm_model = sm.RLM(data.endog, data.exog,
    ...                    M=sm.robust.norms.HuberT())

    >>> rlm_results = rlm_model.fit()
    >>> rlm_results.params
    array([  0.82938433,   0.92606597,  -0.12784672, -41.02649835])
    >>> rlm_results.bse
    array([ 0.11100521,  0.30293016,  0.12864961,  9.79189854])
    >>> rlm_results_HC2 = rlm_model.fit(cov="H2")
    >>> rlm_results_HC2.params
    array([  0.82938433,   0.92606597,  -0.12784672, -41.02649835])
    >>> rlm_results_HC2.bse
    array([ 0.11945975,  0.32235497,  0.11796313,  9.08950419])
    >>> mod = sm.RLM(data.endog, data.exog, M=sm.robust.norms.Hampel())
    >>> rlm_hamp_hub = mod.fit(scale_est=sm.robust.scale.HuberScale())
    >>> rlm_hamp_hub.params
    array([  0.73175452,   1.25082038,  -0.14794399, -40.27122257])
    """ % {'params': base._model_params_doc,
           'extra_params': base._missing_param_doc}

    def __init__(self, endog, exog, M=None, missing='none',
                 **kwargs):
        self.M = M if M is not None else norms.HuberT()
        # Note: super() is bound to base.LikelihoodModel, so this calls the
        # next class in the MRO (base.Model) and deliberately skips
        # LikelihoodModel.__init__.
        super(base.LikelihoodModel, self).__init__(endog, exog,
                                                   missing=missing, **kwargs)
        self._initialize()
        # things to remove_data
        self._data_attr.extend(['weights', 'pinv_wexog'])

    def _initialize(self):
        """
        Initializes the model for the IRLS fit.

        Resets the history and number of iterations.
        """
        self.pinv_wexog = np.linalg.pinv(self.exog)
        self.normalized_cov_params = np.dot(self.pinv_wexog,
                                            np.transpose(self.pinv_wexog))
        self.df_resid = float(self.exog.shape[0] -
                              np.linalg.matrix_rank(self.exog))
        self.df_model = float(np.linalg.matrix_rank(self.exog) - 1)
        self.nobs = float(self.endog.shape[0])

    def score(self, params):
        raise NotImplementedError

    def information(self, params):
        raise NotImplementedError

    def predict(self, params, exog=None):
        """
        Return linear predicted values from a design matrix.

        Parameters
        ----------
        params : array_like
            Parameters of a linear model.
        exog : array_like, optional
            Design / exogenous data. Model exog is used if None.

        Returns
        -------
        ndarray
            An array of fitted values.
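
        Examples
        --------
        A minimal sketch of out-of-sample prediction; the fitted values
        depend on the data, so the output is skipped here.

        >>> import statsmodels.api as sm
        >>> data = sm.datasets.stackloss.load(as_pandas=False)
        >>> exog = sm.add_constant(data.exog)
        >>> res = sm.RLM(data.endog, exog).fit()
        >>> res.model.predict(res.params, exog[:2])  # doctest: +SKIP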

151 """ 

152 # copied from linear_model # TODO: then is it needed? 

153 if exog is None: 

154 exog = self.exog 

155 return np.dot(exog, params) 

156 

    def loglike(self, params):
        raise NotImplementedError

    def deviance(self, tmp_results):
        """
        Returns the (unnormalized) log-likelihood from the M estimator.
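
        Notes
        -----
        Computed as the sum of the norm rho evaluated at the scaled
        residuals, ``sum(M((endog - fittedvalues) / scale))``.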

163 """ 

164 tmp_resid = self.endog - tmp_results.fittedvalues 

165 return self.M(tmp_resid / tmp_results.scale).sum() 

166 

    def _update_history(self, tmp_results, history, conv):
        history['params'].append(tmp_results.params)
        history['scale'].append(tmp_results.scale)
        if conv == 'dev':
            history['deviance'].append(self.deviance(tmp_results))
        elif conv == 'sresid':
            history['sresid'].append(tmp_results.resid / tmp_results.scale)
        elif conv == 'weights':
            history['weights'].append(tmp_results.model.weights)
        return history

    def _estimate_scale(self, resid):
        """
        Estimates the scale based on the option provided to the fit method.
        """

        # scale_est is either the string 'mad' (median absolute deviation
        # about zero), a HuberScale instance (Huber's proposal 2), or is
        # handled by the fallback branch below.
        if isinstance(self.scale_est, str):
            if self.scale_est.lower() == 'mad':
                return scale.mad(resid, center=0)
            else:
                raise ValueError("Option %s for scale_est not understood" %
                                 self.scale_est)
        elif isinstance(self.scale_est, scale.HuberScale):
            return self.scale_est(self.df_resid, self.nobs, resid)
        else:
            return scale.scale_est(self, resid) ** 2

    def fit(self, maxiter=50, tol=1e-8, scale_est='mad', init=None, cov='H1',
            update_scale=True, conv='dev', start_params=None):
        """
        Fits the model using iteratively reweighted least squares.

        The IRLS routine runs until the specified objective converges to
        `tol` or `maxiter` has been reached.

        Parameters
        ----------
        conv : str
            Indicates the convergence criteria.
            Available options are "coefs" (the coefficients), "weights" (the
            weights in the iteration), "sresid" (the standardized residuals),
            and "dev" (the un-normalized log-likelihood for the M
            estimator). The default is "dev".
        cov : str, optional
            'H1', 'H2', or 'H3'
            Indicates how the covariance matrix is estimated. Default is
            'H1'. See rlm.RLMResults for more information.
        init : str
            Specifies the method for the initial estimates of the parameters.
            Default is None, which means that the least squares estimate
            is used. Currently it is the only available choice.
        maxiter : int
            The maximum number of iterations to try. Default is 50.
        scale_est : str or HuberScale()
            'mad' or HuberScale()
            Indicates the estimate to use for scaling the weights in the
            IRLS. The default is 'mad' (median absolute deviation). Other
            options are 'HuberScale' for Huber's proposal 2. Huber's
            proposal 2 has optional keyword arguments d, tol, and maxiter
            for specifying the tuning constant, the convergence tolerance,
            and the maximum number of iterations. See
            statsmodels.robust.scale for more information.
        tol : float
            The convergence tolerance of the estimate. Default is 1e-8.
        update_scale : bool
            If `update_scale` is False then the scale estimate for the
            weights is held constant over the iteration. Otherwise, it
            is updated for each fit in the iteration. Default is True.
        start_params : array_like, optional
            Initial guess of the solution of the optimizer. If not provided,
            the initial parameters are computed using OLS.

        Returns
        -------
        results : statsmodels.rlm.RLMResults
            Results instance
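
        Examples
        --------
        A minimal sketch of the fit options, using the stackloss data from
        the class docstring; outputs are skipped here.

        >>> import statsmodels.api as sm
        >>> data = sm.datasets.stackloss.load(as_pandas=False)
        >>> exog = sm.add_constant(data.exog)
        >>> mod = sm.RLM(data.endog, exog, M=sm.robust.norms.HuberT())
        >>> res = mod.fit(cov="H2", conv="sresid",
        ...               scale_est=sm.robust.scale.HuberScale())
        >>> res.fit_history['iteration']  # doctest: +SKIP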

241 """ 

242 if cov.upper() not in ["H1", "H2", "H3"]: 

243 raise ValueError("Covariance matrix %s not understood" % cov) 

244 else: 

245 self.cov = cov.upper() 

246 conv = conv.lower() 

247 if conv not in ["weights", "coefs", "dev", "sresid"]: 

248 raise ValueError("Convergence argument %s not understood" % conv) 

249 self.scale_est = scale_est 

250 

251 if start_params is None: 

252 wls_results = lm.WLS(self.endog, self.exog).fit() 

253 else: 

254 start_params = np.asarray(start_params, dtype=np.double).squeeze() 

255 if (start_params.shape[0] != self.exog.shape[1] or 

256 start_params.ndim != 1): 

257 raise ValueError('start_params must by a 1-d array with {0} ' 

258 'values'.format(self.exog.shape[1])) 

259 fake_wls = reg_tools._MinimalWLS(self.endog, self.exog, 

260 weights=np.ones_like(self.endog), 

261 check_weights=False) 

262 wls_results = fake_wls.results(start_params) 

263 

264 if not init: 

265 self.scale = self._estimate_scale(wls_results.resid) 

266 

267 history = dict(params=[np.inf], scale=[]) 

268 if conv == 'coefs': 

269 criterion = history['params'] 

270 elif conv == 'dev': 

271 history.update(dict(deviance=[np.inf])) 

272 criterion = history['deviance'] 

273 elif conv == 'sresid': 

274 history.update(dict(sresid=[np.inf])) 

275 criterion = history['sresid'] 

276 elif conv == 'weights': 

277 history.update(dict(weights=[np.inf])) 

278 criterion = history['weights'] 

279 

280 # done one iteration so update 

281 history = self._update_history(wls_results, history, conv) 

282 iteration = 1 

283 converged = 0 

284 while not converged: 

285 if self.scale == 0.0: 

286 import warnings 

287 warnings.warn('Estimated scale is 0.0 indicating that the most' 

288 ' last iteration produced a perfect fit of the ' 

289 'weighted data.', ConvergenceWarning) 

290 break 

291 self.weights = self.M.weights(wls_results.resid / self.scale) 

292 wls_results = reg_tools._MinimalWLS(self.endog, self.exog, 

293 weights=self.weights, 

294 check_weights=True).fit() 

295 if update_scale is True: 

296 self.scale = self._estimate_scale(wls_results.resid) 

297 history = self._update_history(wls_results, history, conv) 

298 iteration += 1 

299 converged = _check_convergence(criterion, iteration, tol, maxiter) 

300 results = RLMResults(self, wls_results.params, 

301 self.normalized_cov_params, self.scale) 

302 

303 history['iteration'] = iteration 

304 results.fit_history = history 

305 results.fit_options = dict(cov=cov.upper(), scale_est=scale_est, 

306 norm=self.M.__class__.__name__, conv=conv) 

307 # norm is not changed in fit, no old state 

308 

309 # doing the next causes exception 

310 # self.cov = self.scale_est = None #reset for additional fits 

311 # iteration and history could contain wrong state with repeated fit 

312 return RLMResultsWrapper(results) 

313 

314 

class RLMResults(base.LikelihoodModelResults):
    """
    Class to contain RLM results

    Attributes
    ----------
    bcov_scaled : ndarray
        p x p scaled covariance matrix specified in the model fit method.
        The default is H1. H1 is defined as
        ``k**2 * (1/df_resid*sum(M.psi(sresid)**2)*scale**2) /
        ((1/nobs*sum(M.psi_deriv(sresid)))**2) * (X.T X)^(-1)``

        where ``k = 1 + (df_model + 1)/nobs * var_psiprime/m**2``
        where ``m = mean(M.psi_deriv(sresid))`` and
        ``var_psiprime = var(M.psi_deriv(sresid))``

        H2 is defined as
        ``k * (1/df_resid) * sum(M.psi(sresid)**2) * scale**2 /
        ((1/nobs)*sum(M.psi_deriv(sresid))) * W_inv``

        H3 is defined as
        ``1/k * (1/df_resid * sum(M.psi(sresid)**2) * scale**2 *
        (W_inv X.T X W_inv))``

        where `k` is defined as above and
        ``W_inv = (M.psi_deriv(sresid) exog.T exog)^(-1)``

        See the technical documentation for cleaner formulae.
    bcov_unscaled : ndarray
        The usual p x p covariance matrix with scale set equal to 1. It
        is then just equivalent to normalized_cov_params.
    bse : ndarray
        An array of the standard errors of the parameters. The standard
        errors are taken from the robust covariance matrix specified in
        the argument to fit.
    chisq : ndarray
        An array of the chi-squared values of the parameter estimates.
    df_model
        See RLM.df_model
    df_resid
        See RLM.df_resid
    fit_history : dict
        Contains information about the iterations. Its keys are `deviance`,
        `params`, `iteration` and the convergence criteria specified in
        `RLM.fit`, if different from `deviance` or `params`.
    fit_options : dict
        Contains the options given to fit.
    fittedvalues : ndarray
        The linear predicted values. dot(exog, params)
    model : statsmodels.rlm.RLM
        A reference to the model instance
    nobs : float
        The number of observations n
    normalized_cov_params : ndarray
        See RLM.normalized_cov_params
    params : ndarray
        The coefficients of the fitted model
    pinv_wexog : ndarray
        See RLM.pinv_wexog
    pvalues : ndarray
        The p values associated with `tvalues`. Note that `tvalues` are
        assumed to be distributed standard normal rather than Student's t.
    resid : ndarray
        The residuals of the fitted model. endog - fittedvalues
    scale : float
        The type of scale is determined in the arguments to the fit method
        in RLM. The reported scale is taken from the residuals of the
        weighted least squares in the last IRLS iteration if update_scale
        is True. If update_scale is False, then it is the scale given by
        the first OLS fit before the IRLS iterations.
    sresid : ndarray
        The scaled residuals.
    tvalues : ndarray
        The "t-statistics" of params. These are defined as params/bse
        where bse are taken from the robust covariance matrix specified
        in the argument to fit.
    weights : ndarray
        The reported weights are determined by passing the scaled residuals
        from the last weighted least squares fit in the IRLS algorithm.

    See Also
    --------
    statsmodels.base.model.LikelihoodModelResults
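
    Examples
    --------
    A minimal sketch of inspecting a results instance; the printed values
    depend on the data and are skipped here.

    >>> import statsmodels.api as sm
    >>> data = sm.datasets.stackloss.load(as_pandas=False)
    >>> exog = sm.add_constant(data.exog)
    >>> res = sm.RLM(data.endog, exog).fit(cov="H2")
    >>> res.bse  # doctest: +SKIP
    >>> res.weights  # doctest: +SKIP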

399 """ 

400 

    def __init__(self, model, params, normalized_cov_params, scale):
        super(RLMResults, self).__init__(model, params,
                                         normalized_cov_params, scale)
        self.model = model
        self.df_model = model.df_model
        self.df_resid = model.df_resid
        self.nobs = model.nobs
        self._cache = {}
        # for remove_data
        self.data_in_cache = ['sresid']

        self.cov_params_default = self.bcov_scaled
        # TODO: "pvals" should come from chisq on bse?

    @cache_readonly
    def fittedvalues(self):
        return np.dot(self.model.exog, self.params)

    @cache_readonly
    def resid(self):
        return self.model.endog - self.fittedvalues  # before bcov

    @cache_readonly
    def sresid(self):
        if self.scale == 0.0:
            sresid = self.resid.copy()
            sresid[:] = 0.0
            return sresid
        return self.resid / self.scale

    @cache_readonly
    def bcov_unscaled(self):
        return self.normalized_cov_params

    @cache_readonly
    def weights(self):
        return self.model.weights

    @cache_readonly
    def bcov_scaled(self):
        model = self.model
        m = np.mean(model.M.psi_deriv(self.sresid))
        var_psiprime = np.var(model.M.psi_deriv(self.sresid))
        k = 1 + (self.df_model + 1) / self.nobs * var_psiprime / m ** 2
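        # k is the small-sample correction factor defined in the class
        # docstring: k = 1 + (df_model + 1)/nobs * var(psi') / mean(psi')**2.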

        if model.cov == "H1":
            ss_psi = np.sum(model.M.psi(self.sresid) ** 2)
            s_psi_deriv = np.sum(model.M.psi_deriv(self.sresid))
            return k ** 2 * (1 / self.df_resid * ss_psi * self.scale ** 2) /\
                ((1 / self.nobs * s_psi_deriv) ** 2) *\
                model.normalized_cov_params
        else:
            W = np.dot(model.M.psi_deriv(self.sresid) * model.exog.T,
                       model.exog)
            W_inv = np.linalg.inv(W)
            # [W_jk]^-1 = [SUM(psi_deriv(Sr_i)*x_ij*x_jk)]^-1
            # where Sr are the standardized residuals
            if model.cov == "H2":
                # These are correct, based on Huber (1973) 8.13
                return k * (1 / self.df_resid) * np.sum(
                    model.M.psi(self.sresid) ** 2) * self.scale ** 2 \
                    / ((1 / self.nobs) *
                       np.sum(model.M.psi_deriv(self.sresid))) * W_inv
            elif model.cov == "H3":
                return k ** -1 * 1 / self.df_resid * np.sum(
                    model.M.psi(self.sresid) ** 2) * self.scale ** 2 \
                    * np.dot(
                        np.dot(W_inv, np.dot(model.exog.T, model.exog)),
                        W_inv)

    @cache_readonly
    def pvalues(self):
        return stats.norm.sf(np.abs(self.tvalues)) * 2

    @cache_readonly
    def bse(self):
        return np.sqrt(np.diag(self.bcov_scaled))

    @cache_readonly
    def chisq(self):
        return (self.params / self.bse) ** 2

    def summary(self, yname=None, xname=None, title=None, alpha=.05,
                return_fmt='text'):
        """
        This is for testing the new summary setup
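
        Examples
        --------
        A minimal sketch; the rendered table depends on the data, so the
        output is skipped here.

        >>> import statsmodels.api as sm
        >>> data = sm.datasets.stackloss.load(as_pandas=False)
        >>> exog = sm.add_constant(data.exog)
        >>> res = sm.RLM(data.endog, exog).fit()
        >>> print(res.summary())  # doctest: +SKIP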

487 """ 

488 top_left = [('Dep. Variable:', None), 

489 ('Model:', None), 

490 ('Method:', ['IRLS']), 

491 ('Norm:', [self.fit_options['norm']]), 

492 ('Scale Est.:', [self.fit_options['scale_est']]), 

493 ('Cov Type:', [self.fit_options['cov']]), 

494 ('Date:', None), 

495 ('Time:', None), 

496 ('No. Iterations:', ["%d" % self.fit_history['iteration']]) 

497 ] 

498 top_right = [('No. Observations:', None), 

499 ('Df Residuals:', None), 

500 ('Df Model:', None) 

501 ] 

502 

503 if title is not None: 

504 title = "Robust linear Model Regression Results" 

505 

506 # boiler plate 

507 from statsmodels.iolib.summary import Summary 

508 smry = Summary() 

509 smry.add_table_2cols(self, gleft=top_left, gright=top_right, 

510 yname=yname, xname=xname, title=title) 

511 smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha, 

512 use_t=self.use_t) 

513 

514 # add warnings/notes, added to text format only 

515 etext = [] 

516 wstr = ("If the model instance has been used for another fit with " 

517 "different fit parameters, then the fit options might not be " 

518 "the correct ones anymore .") 

519 etext.append(wstr) 

520 

521 if etext: 

522 smry.add_extra_txt(etext) 

523 

524 return smry 

525 

    def summary2(self, xname=None, yname=None, title=None, alpha=.05,
                 float_format="%.4f"):
        """Experimental summary function for regression results

        Parameters
        ----------
        yname : str
            Name of the dependent variable (optional)
        xname : list[str], optional
            Names for the exogenous variables. Default is `var_##` where
            ## is the index of the regressor. Must match the number of
            parameters in the model.
        title : str, optional
            Title for the top table. If not None, then this replaces the
            default title.
        alpha : float
            Significance level for the confidence intervals.
        float_format : str
            Print format for floats in parameters summary.

        Returns
        -------
        smry : Summary instance
            This holds the summary tables and text, which can be printed
            or converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary2.Summary : class to hold summary results
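
        Examples
        --------
        A minimal sketch showing the optional arguments; output is skipped
        since it depends on the data.

        >>> import statsmodels.api as sm
        >>> data = sm.datasets.stackloss.load(as_pandas=False)
        >>> exog = sm.add_constant(data.exog)
        >>> res = sm.RLM(data.endog, exog).fit()
        >>> print(res.summary2(title="Stackloss RLM",
        ...                    float_format="%.2f"))  # doctest: +SKIP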

555 """ 

556 from statsmodels.iolib import summary2 

557 smry = summary2.Summary() 

558 smry.add_base(results=self, alpha=alpha, float_format=float_format, 

559 xname=xname, yname=yname, title=title) 

560 

561 return smry 

562 

class RLMResultsWrapper(lm.RegressionResultsWrapper):
    pass


wrap.populate_wrapper(RLMResultsWrapper, RLMResults)  # noqa:E305