
1""" 

2Utility functions models code 

3""" 

4import numpy as np 

5import numpy.lib.recfunctions as nprf 

6import pandas as pd 

7 

8from statsmodels.compat.python import lzip, lmap 

9 

10from statsmodels.tools.data import _is_using_pandas, _is_recarray 

11from statsmodels.tools.validation import array_like 


def asstr2(s):
    if isinstance(s, str):
        return s
    elif isinstance(s, bytes):
        return s.decode('latin1')
    else:
        return str(s)


def _make_dictnames(tmp_arr, offset=0):
    """
    Helper function to create a dictionary mapping a column number
    to the name in tmp_arr.
    """
    col_map = {}
    for i, col_name in enumerate(tmp_arr):
        col_map[i + offset] = col_name
    return col_map
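
# A minimal illustration of _make_dictnames (hypothetical values, shown only
# as a sketch of the mapping it builds):
#
#     >>> _make_dictnames(['vote_yes', 'vote_no'], offset=2)
#     {2: 'vote_yes', 3: 'vote_no'}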


def drop_missing(Y, X=None, axis=1):
    """
    Returns views on the arrays Y and X where missing observations are dropped.

    Parameters
    ----------
    Y : array_like
    X : array_like, optional
    axis : int
        Axis along which to look for missing observations.  Default is 1,
        i.e., observations are in rows.

    Returns
    -------
    Y : ndarray
        The rows of Y where neither Y nor X has a missing observation.
    X : ndarray
        The corresponding rows of X, if X was provided.

    Notes
    -----
    If either Y or X is 1d, it is reshaped to be 2d.
    """
    Y = np.asarray(Y)
    if Y.ndim == 1:
        Y = Y[:, None]
    if X is not None:
        X = np.array(X)
        if X.ndim == 1:
            X = X[:, None]
        keepidx = np.logical_and(~np.isnan(Y).any(axis),
                                 ~np.isnan(X).any(axis))
        return Y[keepidx], X[keepidx]
    else:
        keepidx = ~np.isnan(Y).any(axis)
        return Y[keepidx]
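
# Illustrative sketch of drop_missing on made-up data (not part of the public
# examples): rows containing a NaN in either array are removed.
#
#     >>> y = np.array([1.0, np.nan, 3.0])
#     >>> x = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, 6.0]])
#     >>> y_kept, x_kept = drop_missing(y, x)
#     >>> y_kept.shape, x_kept.shape
#     ((1, 1), (1, 2))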


# TODO: needs to better preserve dtype and be more flexible
# ie., if you still have a string variable in your array you do not
# want to cast it to float
# TODO: add name validator (ie., bad names for datasets.grunfeld)
def categorical(data, col=None, dictnames=False, drop=False):
    """
    Construct a dummy matrix from categorical variables

    Parameters
    ----------
    data : array_like
        A structured array, recarray, array, Series or DataFrame.  This can be
        either a 1d vector of the categorical variable or a 2d array with
        the column specifying the categorical variable specified by the col
        argument.
    col : {str, int, None}
        If data is a DataFrame, col must be a column of data.  If data is a
        Series, col must be either the name of the Series or None.  If data is
        a structured array or a recarray, `col` can be a string that is the
        name of the column that contains the variable.  For all other
        arrays `col` can be an int that is the (zero-based) column index
        number.  `col` can only be None for a 1d array.  The default is None.
    dictnames : bool, optional
        If True, a dictionary mapping the column number to the categorical
        name is returned.  Used to have information about plain arrays.
    drop : bool
        Whether or not to keep the categorical variable in the returned
        matrix.

    Returns
    -------
    dummy_matrix : array_like
        A matrix of dummy (indicator/binary) float variables for the
        categorical data.
    dictnames : dict[int, str], optional
        Mapping between column numbers and categorical names.

    Notes
    -----
    This returns a dummy variable for *each* distinct value of the
    categorical variable.  If a structured or recarray is provided, the names
    for the new variables are the old variable name, an underscore, and the
    category name.  So if a variable 'vote' had answers 'yes' or 'no', the
    returned array would have two new variables -- 'vote_yes' and 'vote_no'.
    There is currently no name checking.

    Examples
    --------
    >>> import numpy as np
    >>> import statsmodels.api as sm

    Univariate examples

    >>> import string
    >>> string_var = [string.ascii_lowercase[0:5],
    ...               string.ascii_lowercase[5:10],
    ...               string.ascii_lowercase[10:15],
    ...               string.ascii_lowercase[15:20],
    ...               string.ascii_lowercase[20:25]]
    >>> string_var *= 5
    >>> string_var = np.asarray(sorted(string_var))
    >>> design = sm.tools.categorical(string_var, drop=True)

    Or for a numerical categorical variable

    >>> instr = np.floor(np.arange(10, 60, step=2) / 10)
    >>> design = sm.tools.categorical(instr, drop=True)

    With a structured array

    >>> num = np.random.randn(25, 2)
    >>> struct_ar = np.zeros((25, 1),
    ...                      dtype=[('var1', 'f4'), ('var2', 'f4'),
    ...                             ('instrument', 'f4'), ('str_instr', 'a5')])
    >>> struct_ar['var1'] = num[:, 0][:, None]
    >>> struct_ar['var2'] = num[:, 1][:, None]
    >>> struct_ar['instrument'] = instr[:, None]
    >>> struct_ar['str_instr'] = string_var[:, None]
    >>> design = sm.tools.categorical(struct_ar, col='instrument', drop=True)

    Or

    >>> design2 = sm.tools.categorical(struct_ar, col='str_instr', drop=True)
    """

    # TODO: add a NameValidator function
    if isinstance(col, (list, tuple)):
        if len(col) == 1:
            col = col[0]
        else:
            raise ValueError("Can only convert one column at a time")
    if (not isinstance(data, (pd.DataFrame, pd.Series)) and
            not isinstance(col, (str, int)) and
            col is not None):
        raise TypeError('col must be a str, int or None')

    # Pull out a Series from a DataFrame if provided
    if isinstance(data, pd.DataFrame):
        if col is None:
            raise TypeError('col must be a str or int when using a DataFrame')
        elif col not in data:
            raise ValueError('Column \'{0}\' not found in data'.format(col))
        data = data[col]
        # Set col to None since we now have a Series
        col = None

    if isinstance(data, pd.Series):
        if col is not None and data.name != col:
            raise ValueError('data.name does not match col '
                             '\'{0}\''.format(col))
        data_cat = pd.Categorical(data)
        dummies = pd.get_dummies(data_cat)
        col_map = {i: cat for i, cat in enumerate(data_cat.categories) if
                   cat in dummies}
        if not drop:
            dummies.columns = list(dummies.columns)
            dummies = pd.concat([dummies, data], axis=1)
        if dictnames:
            return dummies, col_map
        return dummies

    # catch recarrays and structured arrays
    elif data.dtype.names or data.__class__ is np.recarray:
        # deprecated: remove path after 0.12
        import warnings
        from statsmodels.tools.sm_exceptions import recarray_warning
        warnings.warn(recarray_warning, FutureWarning)
        if not col and np.squeeze(data).ndim > 1:
            raise IndexError("col is None and the input array is not 1d")
        if isinstance(col, int):
            col = data.dtype.names[col]
        if col is None and data.dtype.names and len(data.dtype.names) == 1:
            col = data.dtype.names[0]

        tmp_arr = np.unique(data[col])

        # if the cols are shape (#,) vs (#,1) need to add an axis and flip
        _swap = True
        if data[col].ndim == 1:
            tmp_arr = tmp_arr[:, None]
            _swap = False
        tmp_dummy = (tmp_arr == data[col]).astype(float)
        if _swap:
            tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)

        if not tmp_arr.dtype.names:  # how do we get to this code path?
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr)]
        elif tmp_arr.dtype.names:
            tmp_arr = [asstr2(item) for item in np.squeeze(tmp_arr.tolist())]

        # prepend the varname and underscore; if col is numeric, attribute
        # lookup is lost for recarrays...
        if col is None:
            try:
                col = data.dtype.names[0]
            except (TypeError, IndexError):
                col = 'var'
        # TODO: the above needs to be made robust because there could be many
        # var_yes, var_no variables for instance.
        tmp_arr = [col + '_' + item for item in tmp_arr]
        # TODO: test this for rec and structured arrays!!!

        if drop is True:
            if len(data.dtype) <= 1:
                if tmp_dummy.shape[0] < tmp_dummy.shape[1]:
                    tmp_dummy = np.squeeze(tmp_dummy).swapaxes(1, 0)
                dt = lzip(tmp_arr, [tmp_dummy.dtype.str] * len(tmp_arr))
                # preserve array type
                return np.array(lmap(tuple, tmp_dummy.tolist()),
                                dtype=dt).view(type(data))

            data = nprf.drop_fields(data, col, usemask=False,
                                    asrecarray=type(data) is np.recarray)
        data = nprf.append_fields(data, tmp_arr, data=tmp_dummy,
                                  usemask=False,
                                  asrecarray=type(data) is np.recarray)
        return data

    # Catch array_like for an error
    elif not isinstance(data, np.ndarray):
        raise NotImplementedError("array_like objects are not supported")
    else:
        if isinstance(col, int):
            offset = data.shape[1]  # need error catching here?
            tmp_arr = np.unique(data[:, col])
            tmp_dummy = (tmp_arr[:, np.newaxis] == data[:, col]).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                offset -= 1
                data = np.delete(data, col, axis=1).astype(float)
            data = np.column_stack((data, tmp_dummy))
            if dictnames is True:
                col_map = _make_dictnames(tmp_arr, offset)
                return data, col_map
            return data
        elif col is None and np.squeeze(data).ndim == 1:
            tmp_arr = np.unique(data)
            tmp_dummy = (tmp_arr[:, None] == data).astype(float)
            tmp_dummy = tmp_dummy.swapaxes(1, 0)
            if drop is True:
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr)
                    return tmp_dummy, col_map
                return tmp_dummy
            else:
                data = np.column_stack((data, tmp_dummy))
                if dictnames is True:
                    col_map = _make_dictnames(tmp_arr, offset=1)
                    return data, col_map
                return data
        else:
            raise IndexError("The index %s is not understood" % col)


# TODO: add an axis argument to this for sysreg
def add_constant(data, prepend=True, has_constant='skip'):
    """
    Add a column of ones to an array.

    Parameters
    ----------
    data : array_like
        A column-ordered design matrix.
    prepend : bool
        If True, the constant is in the first column.  Else the constant is
        appended (last column).
    has_constant : str {'raise', 'add', 'skip'}
        Behavior if ``data`` already has a constant.  The default will return
        data without adding another constant.  If 'raise', will raise an
        error if any column has a constant value.  Using 'add' will add a
        column of 1s if a constant column is present.

    Returns
    -------
    array_like
        The original values with a constant (column of ones) as the first or
        last column.  Returned value type depends on input type.

    Notes
    -----
    When the input is a recarray or a pandas Series or DataFrame, the added
    column's name is 'const'.
    """
    if _is_using_pandas(data, None) or _is_recarray(data):
        if _is_recarray(data):
            # deprecated: remove recarray support after 0.12
            import warnings
            from statsmodels.tools.sm_exceptions import recarray_warning
            warnings.warn(recarray_warning, FutureWarning)
        from statsmodels.tsa.tsatools import add_trend
        return add_trend(data, trend='c', prepend=prepend,
                         has_constant=has_constant)

    # Special case for NumPy
    x = np.asanyarray(data)
    if x.ndim == 1:
        x = x[:, None]
    elif x.ndim > 2:
        raise ValueError('Only implemented for 2-dimensional arrays')

    is_nonzero_const = np.ptp(x, axis=0) == 0
    is_nonzero_const &= np.all(x != 0.0, axis=0)
    if is_nonzero_const.any():
        if has_constant == 'skip':
            return x
        elif has_constant == 'raise':
            raise ValueError("data already contains a constant")

    x = [np.ones(x.shape[0]), x]
    x = x if prepend else x[::-1]
    return np.column_stack(x)
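
# A small usage sketch for add_constant (illustrative values only):
#
#     >>> x = np.arange(1.0, 4.0)
#     >>> add_constant(x)
#     array([[1., 1.],
#            [1., 2.],
#            [1., 3.]])
#     >>> add_constant(x, prepend=False)
#     array([[1., 1.],
#            [2., 1.],
#            [3., 1.]])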


def isestimable(c, d):
    """
    True if (Q, P) contrast `c` is estimable for (N, P) design `d`.

    From a Q x P contrast matrix `C` and an N x P design matrix `D`, checks if
    the contrast `C` is estimable by looking at the rank of ``vstack([C, D])``
    and verifying it is the same as the rank of `D`.

    Parameters
    ----------
    c : array_like
        A contrast matrix with shape (Q, P).  If 1 dimensional assume shape is
        (1, P).
    d : array_like
        The design matrix, (N, P).

    Returns
    -------
    bool
        True if the contrast `c` is estimable on design `d`.

    Examples
    --------
    >>> d = np.array([[1, 1, 1, 0, 0, 0],
    ...               [0, 0, 0, 1, 1, 1],
    ...               [1, 1, 1, 1, 1, 1]]).T
    >>> isestimable([1, 0, 0], d)
    False
    >>> isestimable([1, -1, 0], d)
    True
    """
    c = array_like(c, 'c', maxdim=2)
    d = array_like(d, 'd', ndim=2)
    c = c[None, :] if c.ndim == 1 else c
    if c.shape[1] != d.shape[1]:
        raise ValueError('Contrast should have %d columns' % d.shape[1])
    new = np.vstack([c, d])
    if np.linalg.matrix_rank(new) != np.linalg.matrix_rank(d):
        return False
    return True


def pinv_extended(X, rcond=1e-15):
    """
    Return the pinv of an array X as well as the singular values
    used in computation.

    Code adapted from numpy.
    """
    X = np.asarray(X)
    X = X.conjugate()
    u, s, vt = np.linalg.svd(X, 0)
    s_orig = np.copy(s)
    m = u.shape[0]
    n = vt.shape[1]
    cutoff = rcond * np.maximum.reduce(s)
    for i in range(min(n, m)):
        if s[i] > cutoff:
            s[i] = 1. / s[i]
        else:
            s[i] = 0.
    res = np.dot(np.transpose(vt), np.multiply(s[:, np.newaxis],
                                               np.transpose(u)))
    return res, s_orig
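
# Sketch of how pinv_extended relates to numpy's pinv (illustrative values):
# the first return value should match np.linalg.pinv, and the second holds
# the original singular values of X.
#
#     >>> X = np.array([[1.0, 0.0], [0.0, 2.0], [0.0, 0.0]])
#     >>> res, sv = pinv_extended(X)
#     >>> np.allclose(res, np.linalg.pinv(X))
#     True
#     >>> sv
#     array([2., 1.])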


def recipr(x):
    """
    Reciprocal of an array with entries less than or equal to 0 set to 0.

    Parameters
    ----------
    x : array_like
        The input array.

    Returns
    -------
    ndarray
        The array with 0-filled reciprocals.
    """
    x = np.asarray(x)
    out = np.zeros_like(x, dtype=np.float64)
    nans = np.isnan(x.flat)
    pos = ~nans
    pos[pos] = pos[pos] & (x.flat[pos] > 0)
    out.flat[pos] = 1.0 / x.flat[pos]
    out.flat[nans] = np.nan
    return out
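
# Illustrative behavior of recipr (made-up values): positive entries are
# inverted, zero and negative entries map to 0, and NaN is preserved.
#
#     >>> recipr(np.array([2.0, 0.0, -3.0, np.nan]))
#     array([0.5, 0. , 0. , nan])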


def recipr0(x):
    """
    Reciprocal of an array with entries equal to 0 set to 0.

    Parameters
    ----------
    x : array_like
        The input array.

    Returns
    -------
    ndarray
        The array with 0-filled reciprocals.
    """
    x = np.asarray(x)
    out = np.zeros_like(x, dtype=np.float64)
    nans = np.isnan(x.flat)
    non_zero = ~nans
    non_zero[non_zero] = non_zero[non_zero] & (x.flat[non_zero] != 0)
    out.flat[non_zero] = 1.0 / x.flat[non_zero]
    out.flat[nans] = np.nan
    return out
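
# Quick sketch contrasting recipr0 with recipr (made-up values): negative
# entries are inverted here, and only exact zeros are mapped to 0.
#
#     >>> recipr0(np.array([2.0, 0.0, -4.0]))
#     array([ 0.5 ,  0.  , -0.25])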


def clean0(matrix):
    """
    Erase columns of zeros: can save some time in pseudoinverse.

    Parameters
    ----------
    matrix : ndarray
        The array to clean.

    Returns
    -------
    ndarray
        The cleaned array.
    """
    colsum = np.add.reduce(matrix**2, 0)
    val = [matrix[:, i] for i in np.flatnonzero(colsum)]
    return np.array(np.transpose(val))
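
# Illustrative sketch of clean0 (made-up matrix): the all-zero middle column
# is dropped and the other columns are kept in order.
#
#     >>> m = np.array([[1.0, 0.0, 2.0],
#     ...               [3.0, 0.0, 4.0]])
#     >>> clean0(m)
#     array([[1., 2.],
#            [3., 4.]])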


def fullrank(x, r=None):
    """
    Return an array whose column span is the same as x.

    Parameters
    ----------
    x : ndarray
        The array to adjust, 2d.
    r : int, optional
        The rank of x.  If not provided, determined by `np.linalg.matrix_rank`.

    Returns
    -------
    ndarray
        The array adjusted to have full rank.

    Notes
    -----
    If the rank of x is known it can be specified as r -- no check
    is made to ensure that this really is the rank of x.
    """
    if r is None:
        r = np.linalg.matrix_rank(x)

    v, d, u = np.linalg.svd(x, full_matrices=False)
    order = np.argsort(d)
    order = order[::-1]
    value = []
    for i in range(r):
        value.append(v[:, order[i]])
    return np.asarray(np.transpose(value)).astype(np.float64)
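
# Sketch of fullrank on a rank-deficient matrix (illustrative values): the
# third column below is the sum of the first two, so the returned basis has
# only two columns but spans the same column space.
#
#     >>> x = np.array([[1.0, 0.0, 1.0],
#     ...               [0.0, 1.0, 1.0],
#     ...               [1.0, 1.0, 2.0]])
#     >>> fullrank(x).shape
#     (3, 2)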


def unsqueeze(data, axis, oldshape):
    """
    Unsqueeze a collapsed array.

    Parameters
    ----------
    data : ndarray
        The data to unsqueeze.
    axis : int
        The axis to unsqueeze.
    oldshape : tuple[int]
        The original shape before the squeeze or reduce operation.

    Returns
    -------
    ndarray
        The unsqueezed array.

    Examples
    --------
    >>> from numpy import mean
    >>> from numpy.random import standard_normal
    >>> x = standard_normal((3, 4, 5))
    >>> m = mean(x, axis=1)
    >>> m.shape
    (3, 5)
    >>> m = unsqueeze(m, 1, x.shape)
    >>> m.shape
    (3, 1, 5)
    """
    newshape = list(oldshape)
    newshape[axis] = 1
    return data.reshape(newshape)


def nan_dot(A, B):
    """
    Returns np.dot(A, B) with the convention that
    nan * 0 = 0 and nan * x = nan if x != 0.

    Parameters
    ----------
    A, B : ndarray
    """
    # Find out who should be nan due to nan * nonzero
    should_be_nan_1 = np.dot(np.isnan(A), (B != 0))
    should_be_nan_2 = np.dot((A != 0), np.isnan(B))
    should_be_nan = should_be_nan_1 + should_be_nan_2

    # Multiply after setting all nan to 0
    # This is what happens if there were no nan * nonzero conflicts
    C = np.dot(np.nan_to_num(A), np.nan_to_num(B))

    C[should_be_nan] = np.nan

    return C
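
# Small sketch of the nan_dot convention (made-up matrices): a NaN only
# propagates into the product where it multiplies a nonzero entry.
#
#     >>> A = np.array([[np.nan, 1.0],
#     ...               [2.0, 3.0]])
#     >>> B = np.array([[0.0, 1.0],
#     ...               [1.0, 1.0]])
#     >>> nan_dot(A, B)
#     array([[ 1., nan],
#            [ 3.,  5.]])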


def maybe_unwrap_results(results):
    """
    Gets raw results back from wrapped results.

    Can be used in plotting functions or other post-estimation type
    routines.
    """
    return getattr(results, '_results', results)
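
# Minimal sketch of maybe_unwrap_results (hypothetical objects): wrapped
# results expose the underlying results instance via `_results`; anything
# else is passed through unchanged.
#
#     >>> class FakeWrapper:
#     ...     _results = 'raw results'
#     >>> maybe_unwrap_results(FakeWrapper())
#     'raw results'
#     >>> maybe_unwrap_results('already raw')
#     'already raw'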


class Bunch(dict):
    """
    Returns a dict-like object with keys accessible via attribute lookup.

    Parameters
    ----------
    *args
        Arguments passed to dict constructor, tuples (key, value).
    **kwargs
        Keyword arguments passed to dict constructor, key=value.
    """
    def __init__(self, *args, **kwargs):
        super(Bunch, self).__init__(*args, **kwargs)
        self.__dict__ = self
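
# Quick illustration of Bunch (made-up keys): items are reachable both as
# dict entries and as attributes.
#
#     >>> b = Bunch(params=[1.0, 2.0], converged=True)
#     >>> b['converged'] is b.converged
#     True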


def _ensure_2d(x, ndarray=False):
    """
    Ensure that the input is 2-dimensional.

    Parameters
    ----------
    x : ndarray, Series, DataFrame or None
        Input to verify dimensions, and to transform as necessary
    ndarray : bool
        Flag indicating whether to always return a NumPy array. Setting False
        will return a pandas DataFrame when the input is a Series or a
        DataFrame.

    Returns
    -------
    out : ndarray, DataFrame or None
        array or DataFrame with 2 dimensions.  One dimensional arrays are
        returned as nobs by 1. None is returned if x is None.
    names : list of str or None
        list containing variables names when the input is a pandas datatype.
        Returns None if the input is an ndarray.

    Notes
    -----
    Accepts None for simplicity
    """
    if x is None:
        return x
    is_pandas = _is_using_pandas(x, None)
    if x.ndim == 2:
        if is_pandas:
            return x, x.columns
        else:
            return x, None
    elif x.ndim > 2:
        raise ValueError('x must be 1 or 2-dimensional.')

    name = x.name if is_pandas else None
    if ndarray:
        return np.asarray(x)[:, None], name
    else:
        return pd.DataFrame(x), name
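
# Sketch of _ensure_2d on a 1-d Series (illustrative data): a Series named
# 'y' comes back as an nobs-by-1 array plus its name when ndarray=True.
#
#     >>> s = pd.Series([1.0, 2.0, 3.0], name='y')
#     >>> out, name = _ensure_2d(s, ndarray=True)
#     >>> out.shape, name
#     ((3, 1), 'y')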