1""" 

2Base tools for handling various kinds of data structures, attaching metadata to 

3results, and doing data cleaning 

4""" 

5from statsmodels.compat.python import iteritems, lmap 

6 

7from functools import reduce 

8 

9import numpy as np 

10from pandas import DataFrame, Series, isnull, MultiIndex 

11 

12import statsmodels.tools.data as data_util 

13from statsmodels.tools.decorators import cache_readonly, cache_writable 

14from statsmodels.tools.sm_exceptions import MissingDataError 

15 


def _asarray_2dcolumns(x):
    if np.asarray(x).ndim > 1 and np.asarray(x).squeeze().ndim == 1:
        return


def _asarray_2d_null_rows(x):
    """
    Makes sure input is an array and is 2d. Makes sure output is 2d. True
    indicates a null in the rows of 2d x.
    """
    # Have to have the asarrays because isnull does not account for array_like
    # input
    x = np.asarray(x)
    if x.ndim == 1:
        x = x[:, None]
    return np.any(isnull(x), axis=1)[:, None]


def _nan_rows(*arrs):
    """
    Returns a boolean array which is True where any of the rows in any
    of the _2d_ arrays in arrs are NaNs. Inputs can be any mixture of Series,
    DataFrames or array_like.
    """
    if len(arrs) == 1:
        arrs += ([[False]],)

    def _nan_row_maybe_two_inputs(x, y):
        # check for dtype bc dataframe has dtypes
        x_is_boolean_array = hasattr(x, 'dtype') and x.dtype == bool and x
        return np.logical_or(_asarray_2d_null_rows(x),
                             (x_is_boolean_array | _asarray_2d_null_rows(y)))
    return reduce(_nan_row_maybe_two_inputs, arrs).squeeze()
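
# Illustrative sketch (array values are made up for the example): _nan_rows
# flags a row whenever any of the inputs has a NaN in that row, e.g.
#
# >>> x = np.array([[1., 2.], [np.nan, 3.], [4., 5.]])
# >>> y = np.array([1., 2., np.nan])
# >>> _nan_rows(x, y)
# array([False,  True,  True])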


class ModelData(object):
    """
    Class responsible for handling input data and extracting metadata into the
    appropriate form
    """
    _param_names = None
    _cov_names = None

    def __init__(self, endog, exog=None, missing='none', hasconst=None,
                 **kwargs):
        if data_util._is_recarray(endog) or data_util._is_recarray(exog):
            import warnings
            from statsmodels.tools.sm_exceptions import recarray_warning
            warnings.warn(recarray_warning, FutureWarning)
        if 'design_info' in kwargs:
            self.design_info = kwargs.pop('design_info')
        if 'formula' in kwargs:
            self.formula = kwargs.pop('formula')
        if missing != 'none':
            arrays, nan_idx = self.handle_missing(endog, exog, missing,
                                                  **kwargs)
            self.missing_row_idx = nan_idx
            self.__dict__.update(arrays)  # attach all the data arrays
            self.orig_endog = self.endog
            self.orig_exog = self.exog
            self.endog, self.exog = self._convert_endog_exog(self.endog,
                                                             self.exog)
        else:
            self.__dict__.update(kwargs)  # attach the extra arrays anyway
            self.orig_endog = endog
            self.orig_exog = exog
            self.endog, self.exog = self._convert_endog_exog(endog, exog)

        self.const_idx = None
        self.k_constant = 0
        self._handle_constant(hasconst)
        self._check_integrity()
        self._cache = {}
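
    # Illustrative sketch (array values are made up): with missing='drop',
    # rows containing NaN in either endog or exog are removed and recorded.
    #
    # >>> y = np.array([1., 2., np.nan, 4.])
    # >>> X = np.array([[1., 0.], [1., np.nan], [1., 2.], [1., 3.]])
    # >>> data = ModelData(y, exog=X, missing='drop')
    # >>> data.endog.shape, data.exog.shape
    # ((2,), (2, 2))
    # >>> data.missing_row_idx
    # [1, 2]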

    def __getstate__(self):
        from copy import copy
        d = copy(self.__dict__)
        if "design_info" in d:
            del d["design_info"]
            d["restore_design_info"] = True
        return d

    def __setstate__(self, d):
        if "restore_design_info" in d:
            # NOTE: there may be a more performant way to do this
            from patsy import dmatrices, PatsyError
            exc = []
            try:
                data = d['frame']
            except KeyError:
                data = d['orig_endog'].join(d['orig_exog'])

            for depth in [2, 3, 1, 0, 4]:  # sequence is a guess where to likely find it
                try:
                    _, design = dmatrices(d['formula'], data, eval_env=depth,
                                          return_type='dataframe')
                    break
                except (NameError, PatsyError) as e:
                    exc.append(e)  # why do I need a reference from outside except block
                    pass
            else:
                raise exc[-1]

            self.design_info = design.design_info
            del d["restore_design_info"]
        self.__dict__.update(d)

    def _handle_constant(self, hasconst):
        if hasconst is False or self.exog is None:
            self.k_constant = 0
            self.const_idx = None
        else:
            # detect where the constant is
            check_implicit = False
            exog_max = np.max(self.exog, axis=0)
            if not np.isfinite(exog_max).all():
                raise MissingDataError('exog contains inf or nans')
            exog_min = np.min(self.exog, axis=0)
            const_idx = np.where(exog_max == exog_min)[0].squeeze()
            self.k_constant = const_idx.size

            if self.k_constant == 1:
                if self.exog[:, const_idx].mean() != 0:
                    self.const_idx = int(const_idx)
                else:
                    # we only have a zero column and no other constant
                    check_implicit = True
            elif self.k_constant > 1:
                # we have more than one constant column
                # look for ones
                values = []  # keep values if we need != 0
                for idx in const_idx:
                    value = self.exog[:, idx].mean()
                    if value == 1:
                        self.k_constant = 1
                        self.const_idx = int(idx)
                        break
                    values.append(value)
                else:
                    # we did not break, no column of ones
                    pos = (np.array(values) != 0)
                    if pos.any():
                        # take the first nonzero column
                        self.k_constant = 1
                        self.const_idx = int(const_idx[pos.argmax()])
                    else:
                        # only zero columns
                        check_implicit = True
            elif self.k_constant == 0:
                check_implicit = True
            else:
                # should not be here
                pass

            if check_implicit and not hasconst:
                # look for implicit constant
                # Compute rank of augmented matrix
                augmented_exog = np.column_stack(
                    (np.ones(self.exog.shape[0]), self.exog))
                rank_augm = np.linalg.matrix_rank(augmented_exog)
                rank_orig = np.linalg.matrix_rank(self.exog)
                self.k_constant = int(rank_orig == rank_augm)
                self.const_idx = None
            elif hasconst:
                # Ensure k_constant is 1 any time hasconst is True
                # even if one is not found
                self.k_constant = 1
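
    # Illustrative sketch (made-up exog): a column of ones is detected as an
    # explicit constant; two dummy columns that sum to one only show up
    # through the rank comparison as an implicit constant.
    #
    # >>> X = np.column_stack((np.ones(4), np.arange(4.)))
    # >>> d = ModelData(np.arange(4.), exog=X)
    # >>> d.k_constant, d.const_idx
    # (1, 0)
    # >>> D = np.array([[1., 0.], [0., 1.], [1., 0.], [0., 1.]])
    # >>> ModelData(np.arange(4.), exog=D).k_constant
    # 1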

    @classmethod
    def _drop_nans(cls, x, nan_mask):
        return x[nan_mask]

    @classmethod
    def _drop_nans_2d(cls, x, nan_mask):
        return x[nan_mask][:, nan_mask]

    @classmethod
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.
        """
        none_array_names = []

        # patsy's already dropped NaNs in y/X
        missing_idx = kwargs.pop('missing_idx', None)

        if missing_idx is not None:
            # y, X already handled by patsy. add back in later.
            combined = ()
            combined_names = []
            if exog is None:
                none_array_names += ['exog']
        elif exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog,)
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        if len(kwargs):
            for key, value_array in iteritems(kwargs):
                if value_array is None or value_array.ndim == 0:
                    none_array_names += [key]
                    continue
                # grab 1d arrays
                if value_array.ndim == 1:
                    combined += (np.asarray(value_array),)
                    combined_names += [key]
                elif value_array.squeeze().ndim == 1:
                    combined += (np.asarray(value_array),)
                    combined_names += [key]

                # grab 2d arrays that are _assumed_ to be symmetric
                elif value_array.ndim == 2:
                    combined_2d += (np.asarray(value_array),)
                    combined_2d_names += [key]
                else:
                    raise ValueError("Arrays with more than 2 dimensions "
                                     "are not yet handled")

        if missing_idx is not None:
            nan_mask = missing_idx
            updated_row_mask = None
            if combined:  # there were extra arrays not handled by patsy
                combined_nans = _nan_rows(*combined)
                if combined_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra arrays given to model.")
                # for going back and updating endog/exog
                updated_row_mask = combined_nans[~nan_mask]
                nan_mask |= combined_nans  # for updating extra arrays only
            if combined_2d:
                combined_2d_nans = _nan_rows(combined_2d)
                if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra 2d arrays given to model.")
                if updated_row_mask is not None:
                    updated_row_mask |= combined_2d_nans[~nan_mask]
                else:
                    updated_row_mask = combined_2d_nans[~nan_mask]
                nan_mask |= combined_2d_nans

        else:
            nan_mask = _nan_rows(*combined)
            if combined_2d:
                nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

        if not np.any(nan_mask):  # no missing, do not do anything
            combined = dict(zip(combined_names, combined))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names, combined_2d)))
            if none_array_names:
                combined.update(dict(zip(none_array_names,
                                         [None] * len(none_array_names))))

            if missing_idx is not None:
                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            return combined, []

        elif missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            nan_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))

            if missing_idx is not None:
                if updated_row_mask is not None:
                    updated_row_mask = ~updated_row_mask
                    # update endog/exog with this new information
                    endog = cls._drop_nans(endog, updated_row_mask)
                    if exog is not None:
                        exog = cls._drop_nans(exog, updated_row_mask)

                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            if combined_2d:
                combined.update(dict(zip(combined_2d_names,
                                         lmap(drop_nans_2d, combined_2d))))
            if none_array_names:
                combined.update(dict(zip(none_array_names,
                                         [None] * len(none_array_names))))

            return combined, np.where(~nan_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)
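
    # Illustrative sketch (made-up arrays): handle_missing returns the cleaned
    # arrays keyed by name plus the list of dropped row positions, and passes
    # None entries through untouched.
    #
    # >>> y = np.array([1., np.nan, 3.])
    # >>> X = np.array([[1., 2.], [3., 4.], [5., 6.]])
    # >>> arrays, dropped = ModelData.handle_missing(y, X, missing='drop')
    # >>> sorted(arrays), dropped
    # (['endog', 'exog'], [1])
    # >>> arrays['endog']
    # array([1., 3.])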

    def _convert_endog_exog(self, endog, exog):

        # for consistent outputs if endog is (n,1)
        yarr = self._get_yarr(endog)
        xarr = None
        if exog is not None:
            xarr = self._get_xarr(exog)
            if xarr.ndim == 1:
                xarr = xarr[:, None]
            if xarr.ndim != 2:
                raise ValueError("exog is not 1d or 2d")

        return yarr, xarr

    @cache_writable()
    def ynames(self):
        endog = self.orig_endog
        ynames = self._get_names(endog)
        if not ynames:
            ynames = _make_endog_names(self.endog)

        if len(ynames) == 1:
            return ynames[0]
        else:
            return list(ynames)

    @cache_writable()
    def xnames(self):
        exog = self.orig_exog
        if exog is not None:
            xnames = self._get_names(exog)
            if not xnames:
                xnames = _make_exog_names(self.exog)
            return list(xnames)
        return None

    @property
    def param_names(self):
        # for handling names of 'extra' parameters in summary, etc.
        return self._param_names or self.xnames

    @param_names.setter
    def param_names(self, values):
        self._param_names = values

    @property
    def cov_names(self):
        """
        Labels for covariance matrices

        In multidimensional models, the dimensions of the covariance matrices
        can differ from the number of param_names.

        If not set, returns param_names
        """
        # for handling names of covariance names in multidimensional models
        if self._cov_names is not None:
            return self._cov_names
        return self.param_names

    @cov_names.setter
    def cov_names(self, value):
        # for handling names of covariance names in multidimensional models
        self._cov_names = value

    @cache_readonly
    def row_labels(self):
        exog = self.orig_exog
        if exog is not None:
            row_labels = self._get_row_labels(exog)
        else:
            endog = self.orig_endog
            row_labels = self._get_row_labels(endog)
        return row_labels

    def _get_row_labels(self, arr):
        return None

    def _get_names(self, arr):
        if isinstance(arr, DataFrame):
            if isinstance(arr.columns, MultiIndex):
                # Flatten MultiIndexes into "simple" column names
                return ['_'.join((level for level in c if level))
                        for c in arr.columns]
            else:
                return list(arr.columns)
        elif isinstance(arr, Series):
            if arr.name:
                return [arr.name]
            else:
                return
        else:
            try:
                return arr.dtype.names
            except AttributeError:
                pass

        return None

    def _get_yarr(self, endog):
        if data_util._is_structured_ndarray(endog):
            endog = data_util.struct_to_ndarray(endog)
        endog = np.asarray(endog)
        if len(endog) == 1:  # never squeeze to a scalar
            if endog.ndim == 1:
                return endog
            elif endog.ndim > 1:
                return np.asarray([endog.squeeze()])

        return endog.squeeze()

    def _get_xarr(self, exog):
        if data_util._is_structured_ndarray(exog):
            exog = data_util.struct_to_ndarray(exog)
        return np.asarray(exog)

    def _check_integrity(self):
        if self.exog is not None:
            if len(self.exog) != len(self.endog):
                raise ValueError("endog and exog matrices are different sizes")

    def wrap_output(self, obj, how='columns', names=None):
        if how == 'columns':
            return self.attach_columns(obj)
        elif how == 'rows':
            return self.attach_rows(obj)
        elif how == 'cov':
            return self.attach_cov(obj)
        elif how == 'dates':
            return self.attach_dates(obj)
        elif how == 'columns_eq':
            return self.attach_columns_eq(obj)
        elif how == 'cov_eq':
            return self.attach_cov_eq(obj)
        elif how == 'generic_columns':
            return self.attach_generic_columns(obj, names)
        elif how == 'generic_columns_2d':
            return self.attach_generic_columns_2d(obj, names)
        elif how == 'ynames':
            return self.attach_ynames(obj)
        elif how == 'multivariate_confint':
            return self.attach_mv_confint(obj)
        else:
            return obj

    def attach_columns(self, result):
        return result

    def attach_columns_eq(self, result):
        return result

    def attach_cov(self, result):
        return result

    def attach_cov_eq(self, result):
        return result

    def attach_rows(self, result):
        return result

    def attach_dates(self, result):
        return result

    def attach_mv_confint(self, result):
        return result

    def attach_generic_columns(self, result, *args, **kwargs):
        return result

    def attach_generic_columns_2d(self, result, *args, **kwargs):
        return result

    def attach_ynames(self, result):
        return result


class PatsyData(ModelData):
    def _get_names(self, arr):
        return arr.design_info.column_names


class PandasData(ModelData):
    """
    Data handling class which knows how to reattach pandas metadata to model
    results
    """

    def _convert_endog_exog(self, endog, exog=None):
        # TODO: remove this when we handle dtype systematically
        endog = np.asarray(endog)
        exog = exog if exog is None else np.asarray(exog)
        if endog.dtype == object or exog is not None and exog.dtype == object:
            raise ValueError("Pandas data cast to numpy dtype of object. "
                             "Check input data with np.asarray(data).")
        return super(PandasData, self)._convert_endog_exog(endog, exog)
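
    # Illustrative sketch (made-up frame): pandas metadata survives the trip
    # through PandasData, while object-dtype input is rejected early.
    #
    # >>> df = DataFrame({'y': [1., 2.], 'x1': [3., 4.]})
    # >>> data = PandasData(df['y'], exog=df[['x1']])
    # >>> data.ynames, data.xnames, list(data.row_labels)
    # ('y', ['x1'], [0, 1])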

    @classmethod
    def _drop_nans(cls, x, nan_mask):
        if isinstance(x, (Series, DataFrame)):
            return x.loc[nan_mask]
        else:  # extra arguments could be plain ndarrays
            return super(PandasData, cls)._drop_nans(x, nan_mask)

    @classmethod
    def _drop_nans_2d(cls, x, nan_mask):
        if isinstance(x, (Series, DataFrame)):
            return x.loc[nan_mask].loc[:, nan_mask]
        else:  # extra arguments could be plain ndarrays
            return super(PandasData, cls)._drop_nans_2d(x, nan_mask)

    def _check_integrity(self):
        endog, exog = self.orig_endog, self.orig_exog
        # exog can be None and we could be upcasting one or the other
        if (exog is not None and
                (hasattr(endog, 'index') and hasattr(exog, 'index')) and
                not self.orig_endog.index.equals(self.orig_exog.index)):
            raise ValueError("The indices for endog and exog are not aligned")
        super(PandasData, self)._check_integrity()

    def _get_row_labels(self, arr):
        try:
            return arr.index
        except AttributeError:
            # if we've gotten here it's because endog is pandas and
            # exog is not, so just return the row labels from endog
            return self.orig_endog.index

543 def attach_generic_columns(self, result, names): 

544 # get the attribute to use 

545 column_names = getattr(self, names, None) 

546 return Series(result, index=column_names) 

547 

548 def attach_generic_columns_2d(self, result, rownames, colnames=None): 

549 colnames = colnames or rownames 

550 rownames = getattr(self, rownames, None) 

551 colnames = getattr(self, colnames, None) 

552 return DataFrame(result, index=rownames, columns=colnames) 

553 

554 def attach_columns(self, result): 

555 # this can either be a 1d array or a scalar 

556 # do not squeeze because it might be a 2d row array 

557 # if it needs a squeeze, the bug is elsewhere 

558 if result.ndim <= 1: 

559 return Series(result, index=self.param_names) 

560 else: # for e.g., confidence intervals 

561 return DataFrame(result, index=self.param_names) 

562 

563 def attach_columns_eq(self, result): 

564 return DataFrame(result, index=self.xnames, columns=self.ynames) 

565 

566 def attach_cov(self, result): 

567 return DataFrame(result, index=self.cov_names, columns=self.cov_names) 

568 

569 def attach_cov_eq(self, result): 

570 return DataFrame(result, index=self.ynames, columns=self.ynames) 

571 

572 def attach_rows(self, result): 

573 # assumes if len(row_labels) > len(result) it's bc it was truncated 

574 # at the front, for AR lags, for example 

575 squeezed = result.squeeze() 

576 k_endog = np.array(self.ynames, ndmin=1).shape[0] 

577 if k_endog > 1 and squeezed.shape == (k_endog,): 

578 squeezed = squeezed[None, :] 

579 # May be zero-dim, for example in the case of forecast one step in tsa 

580 if squeezed.ndim < 2: 

581 return Series(squeezed, index=self.row_labels[-len(result):]) 

582 else: 

583 return DataFrame(result, index=self.row_labels[-len(result):], 

584 columns=self.ynames) 

    def attach_dates(self, result):
        squeezed = result.squeeze()
        k_endog = np.array(self.ynames, ndmin=1).shape[0]
        if k_endog > 1 and squeezed.shape == (k_endog,):
            squeezed = np.asarray(squeezed)[None, :]
        # May be zero-dim, for example in the case of forecast one step in tsa
        if squeezed.ndim < 2:
            return Series(squeezed, index=self.predict_dates)
        else:
            return DataFrame(result, index=self.predict_dates,
                             columns=self.ynames)

    def attach_mv_confint(self, result):
        return DataFrame(result.reshape((-1, 2)),
                         index=self.cov_names,
                         columns=['lower', 'upper'])

    def attach_ynames(self, result):
        squeezed = result.squeeze()
        # May be zero-dim, for example in the case of forecast one step in tsa
        if squeezed.ndim < 2:
            return Series(squeezed, name=self.ynames)
        else:
            return DataFrame(result, columns=self.ynames)

611 

612def _make_endog_names(endog): 

613 if endog.ndim == 1 or endog.shape[1] == 1: 

614 ynames = ['y'] 

615 else: # for VAR 

616 ynames = ['y%d' % (i+1) for i in range(endog.shape[1])] 

617 

618 return ynames 

619 

620 

621def _make_exog_names(exog): 

622 exog_var = exog.var(0) 

623 if (exog_var == 0).any(): 

624 # assumes one constant in first or last position 

625 # avoid exception if more than one constant 

626 const_idx = exog_var.argmin() 

627 exog_names = ['x%d' % i for i in range(1, exog.shape[1])] 

628 exog_names.insert(const_idx, 'const') 

629 else: 

630 exog_names = ['x%d' % i for i in range(1, exog.shape[1]+1)] 

631 

632 return exog_names 
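
# Illustrative sketch (made-up design matrices): a zero-variance column is
# labelled 'const' and the remaining columns are numbered from x1.
#
# >>> _make_exog_names(np.column_stack((np.ones(3), np.arange(3.))))
# ['const', 'x1']
# >>> _make_exog_names(np.arange(6.).reshape(3, 2))
# ['x1', 'x2']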


def handle_missing(endog, exog=None, missing='none', **kwargs):
    klass = handle_data_class_factory(endog, exog)
    if missing == 'none':
        ret_dict = dict(endog=endog, exog=exog)
        ret_dict.update(kwargs)
        return ret_dict, None
    return klass.handle_missing(endog, exog, missing=missing, **kwargs)


def handle_data_class_factory(endog, exog):
    """
    Given the type of endog and exog, return the ModelData subclass that
    knows how to handle them.
    """
    if data_util._is_using_ndarray_type(endog, exog):
        klass = ModelData
    elif data_util._is_using_pandas(endog, exog):
        klass = PandasData
    elif data_util._is_using_patsy(endog, exog):
        klass = PatsyData
    # keep this check last
    elif data_util._is_using_ndarray(endog, exog):
        klass = ModelData
    else:
        raise ValueError('unrecognized data structures: %s / %s' %
                         (type(endog), type(exog)))
    return klass


def handle_data(endog, exog, missing='none', hasconst=None, **kwargs):
    # deal with lists and tuples up-front
    if isinstance(endog, (list, tuple)):
        endog = np.asarray(endog)
    if isinstance(exog, (list, tuple)):
        exog = np.asarray(exog)

    klass = handle_data_class_factory(endog, exog)
    return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
                 **kwargs)
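
# Illustrative sketch (made-up inputs): handle_data dispatches on input type,
# so pandas objects get the metadata-aware PandasData handler while plain
# lists and ndarrays fall back to ModelData.
#
# >>> handle_data_class_factory(Series([1., 2.]), None) is PandasData
# True
# >>> data = handle_data([1., 2., 3.], [[1.], [2.], [3.]])
# >>> type(data).__name__, data.exog.shape
# ('ModelData', (3, 1))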