Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1from statsmodels.compat.python import (lrange, iterkeys, iteritems, lzip, 

2 itervalues) 

3 

4from collections import OrderedDict 

5import datetime 

6from functools import reduce 

7import re 

8import textwrap 

9 

10import numpy as np 

11import pandas as pd 

12 

13from .table import SimpleTable 

14from .tableformatting import fmt_latex, fmt_txt 

15 

16 

class Summary(object):
    '''Collect tables, free-text notes and a title, and render them together.

    Tables are stored as DataFrames in ``self.tables``; the display options
    given for each one are stored at the same position in ``self.settings``.
    Rendering is delegated to the module-level helpers ``_simple_tables`` and
    ``_measure_tables``.
    '''

    def __init__(self):
        self.tables = []
        self.settings = []
        self.extra_txt = []
        self.title = None
        # When True, as_latex() fuses consecutive tabular environments.
        self._merge_latex = False

    def __str__(self):
        return self.as_text()

    def __repr__(self):
        return '\n'.join([str(type(self)), '"""', self.__str__(), '"""'])

    def _repr_html_(self):
        '''Display as HTML in IPython notebook.'''
        return self.as_html()

    def add_df(self, df, index=True, header=True, float_format='%.4f',
               align='r'):
        '''Register a DataFrame as one table of the summary

        Parameters
        ----------
        df : DataFrame
        index : bool
            Reproduce the DataFrame row labels in summary table
        header : bool
            Reproduce the DataFrame column labels in summary table
        float_format : str
            Format string recorded for the float data of this table
        align : str
            Data alignment (l/c/r)
        '''
        self.tables.append(df)
        self.settings.append({'index': index, 'header': header,
                              'float_format': float_format, 'align': align})

    def add_array(self, array, align='r', float_format="%.4f"):
        '''Register a Numpy array as one table of the summary

        The array is wrapped in a DataFrame and stored without row or
        column labels.

        Parameters
        ----------
        array : numpy array (2D)
        align : str
            Data alignment (l/c/r)
        float_format : str
            Format string recorded for the float data of this table
        '''
        self.add_df(pd.DataFrame(array), index=False, header=False,
                    float_format=float_format, align=align)

    def add_dict(self, d, ncols=2, align='l', float_format="%.4f"):
        '''Register the key/value pairs of a dict as one table

        Parameters
        ----------
        d : dict
            Keys and values are converted to strings (numbers through
            ``float_format``, everything else through str()). Users are
            encouraged to format them before using add_dict.
        ncols : int
            Number of key/value column pairs of the output table
        align : str
            Data alignment (l/c/r)
        float_format : str
            Format string applied to numeric keys and values
        '''
        keys = [_formatter(k, float_format) for k in d.keys()]
        vals = [_formatter(v, float_format) for v in d.values()]
        pairs = np.array(list(zip(keys, vals)))

        # Pad with empty pairs so the rows divide evenly into ncols blocks.
        remainder = pairs.shape[0] % ncols
        if remainder != 0:
            filler = np.array((ncols - remainder) * [['', '']])
            pairs = np.vstack([pairs, filler])

        # Lay the blocks side by side: each block contributes two columns.
        blocks = np.split(pairs, ncols)
        self.add_array(np.hstack(blocks), align=align)

    def add_text(self, string):
        '''Append a note to the bottom of the summary table. In ASCII tables,
        the note will be wrapped to table width. Notes are not indented.
        '''
        self.extra_txt.append(string)

    def add_title(self, title=None, results=None):
        '''Set the title shown on top of the summary table.

        A string passed as ``title`` is used verbatim. Otherwise, if a
        results instance is given, the title is built from the class name of
        ``results.model`` (expanded through ``_model_types`` when the name is
        listed there). With neither, the title becomes the empty string.
        '''
        if isinstance(title, str):
            self.title = title
            return
        if results is None:
            self.title = ''
            return
        name = results.model.__class__.__name__
        self.title = 'Results: ' + _model_types.get(name, name)

    def add_base(self, results, alpha=0.05, float_format="%.4f", title=None,
                 xname=None, yname=None):
        '''Try to construct a basic summary instance.

        Adds a model-information table, a parameter table and a title.

        Parameters
        ----------
        results : Model results instance
        alpha : float
            significance level for the confidence intervals (optional)
        float_format : str
            Float formatting for summary of parameters (optional)
        title : str
            Title of the summary table (optional)
        xname : list[str] of length equal to the number of parameters
            Names of the independent variables (optional)
        yname : str
            Name of the dependent variable (optional)
        '''
        params_frame = summary_params(results, alpha=alpha,
                                      use_t=results.use_t)
        model_info = summary_model(results)
        if xname is not None:
            params_frame.index = xname
        if yname is not None:
            model_info['Dependent Variable:'] = yname
        self.add_dict(model_info, align='l')
        self.add_df(params_frame, float_format=float_format)
        self.add_title(title=title, results=results)

    def as_text(self):
        '''Generate ASCII Summary Table
        '''
        pad_col, pad_index, widest = _measure_tables(self.tables,
                                                     self.settings)
        rule = '=' * widest

        rendered = _simple_tables(self.tables, self.settings, pad_col,
                                  pad_index)
        lines = '\n'.join(t.as_text() for t in rendered).split('\n')
        # The first line is overwritten by a rule of '=' and a matching
        # rule closes the block.
        lines[0] = rule
        lines.append(rule)
        body = '\n'.join(lines)

        heading = self.title
        if heading is None:
            heading = ''
        elif len(heading) < widest:
            # Centre the title over the tables.
            heading = ' ' * int(widest/2 - len(heading)/2) + heading

        notes = '\n'.join('\n'.join(textwrap.wrap(note, widest))
                          for note in self.extra_txt)

        return '\n'.join([heading, body, notes])

    def as_html(self):
        '''Generate HTML Summary Table
        '''
        rendered = _simple_tables(self.tables, self.settings)
        return '\n'.join(t.as_html() for t in rendered)

    def as_latex(self):
        '''Generate LaTeX Summary Table
        '''
        if self.title is None:
            caption = '\\caption{}'
        else:
            caption = '\\caption{' + self.title + '}'

        rendered = _simple_tables(self.tables, self.settings)
        body = '\n\\hline\n'.join(t.as_latex_tabular() for t in rendered)

        if self._merge_latex:
            # create single tabular object for summary_col
            pattern = ('\\\\hline\\n\\\\hline\\n\\\\'
                       'end{tabular}\\n\\\\begin{tabular}{.*}\\n')
            body = re.sub(pattern, r'\\midrule\n', body)

        return '\n'.join(['\\begin{table}', caption, body, '\\end{table}'])

227 

228 

def _measure_tables(tables, settings):
    '''Compare width of ascii tables in a list and calculate padding values.
    We add space to each col_sep to get us as close as possible to the
    width of the largest table. Then, we add a few spaces to the first
    column to pad the rest.

    Parameters
    ----------
    tables : list of DataFrame
    settings : list of dict
        Display options for each table, as expected by ``_simple_tables``.

    Returns
    -------
    pad_sep : list of int
        Extra spaces to add to every column separator, one entry per table.
    pad_index : list of int
        Spaces still missing after widening the separators, one entry per
        table.
    widest : int
        Length of the first text line of the widest table; 0 when there are
        no tables.
    '''

    simple_tables = _simple_tables(tables, settings)
    tab = [x.as_text() for x in simple_tables]

    # Nothing to measure: avoid calling max() on an empty sequence.
    if not tab:
        return [], [], 0

    length = [len(x.splitlines()[0]) for x in tab]
    len_max = max(length)
    pad_sep = []
    pad_index = []

    for table, width in zip(tables, length):
        # A single-column table is still treated as having one separator.
        nsep = max(table.shape[1] - 1, 1)
        pad = (len_max - width) // nsep
        pad_sep.append(pad)
        # Whatever the separators could not absorb goes to the first column.
        pad_index.append(len_max - (width + nsep * pad))

    return pad_sep, pad_index, len_max

252 

253 

# Map a model class name to the descriptive name used in summary titles.
# TODO: be more specific
_model_types = {
    'OLS': 'Ordinary least squares',
    'GLS': 'Generalized least squares',
    'GLSAR': 'Generalized least squares with AR(p)',
    'WLS': 'Weighted least squares',
    'RLM': 'Robust linear model',
    'NBin': 'Negative binomial model',
    'GLM': 'Generalized linear model',
}

263 

264 

def summary_model(results):
    '''Create a dict with information about the model

    Every entry is produced by a small extractor applied to ``results``.
    An entry is left out when its extractor raises AttributeError, KeyError
    or NotImplementedError, so the returned dict only holds what the given
    results instance can provide.

    Parameters
    ----------
    results : Model results instance
        Must expose ``k_constant``; all other attributes are optional.

    Returns
    -------
    OrderedDict
        Maps a label (e.g. 'Model:', 'AIC:') to its value, in a fixed order.
    '''

    def time_now(*args, **kwds):
        now = datetime.datetime.now()
        return now.strftime('%Y-%m-%d %H:%M')

    info = OrderedDict()
    info['Model:'] = lambda x: x.model.__class__.__name__
    # Was ``__class.__name__``: that always raised AttributeError, which is
    # swallowed below, so the family could never be reported.
    info['Model Family:'] = lambda x: x.family.__class__.__name__
    info['Link Function:'] = lambda x: x.family.link.__class__.__name__
    info['Dependent Variable:'] = lambda x: x.model.endog_names
    info['Date:'] = time_now
    info['No. Observations:'] = lambda x: "%#6d" % x.nobs
    info['Df Model:'] = lambda x: "%#6d" % x.df_model
    info['Df Residuals:'] = lambda x: "%#6d" % x.df_resid
    info['Converged:'] = lambda x: x.mle_retvals['converged']
    info['No. Iterations:'] = lambda x: x.mle_retvals['iterations']
    info['Method:'] = lambda x: x.method
    info['Norm:'] = lambda x: x.fit_options['norm']
    info['Scale Est.:'] = lambda x: x.fit_options['scale_est']
    info['Cov. Type:'] = lambda x: x.fit_options['cov']

    rsquared_type = '' if results.k_constant else ' (uncentered)'
    info['R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared
    info['Adj. R-squared' + rsquared_type + ':'] = lambda x: "%#8.3f" % x.rsquared_adj  # noqa:E501
    info['Pseudo R-squared:'] = lambda x: "%#8.3f" % x.prsquared
    info['AIC:'] = lambda x: "%8.4f" % x.aic
    info['BIC:'] = lambda x: "%8.4f" % x.bic
    info['Log-Likelihood:'] = lambda x: "%#8.5g" % x.llf
    info['LL-Null:'] = lambda x: "%#8.5g" % x.llnull
    info['LLR p-value:'] = lambda x: "%#8.5g" % x.llr_pvalue
    info['Deviance:'] = lambda x: "%#8.5g" % x.deviance
    info['Pearson chi2:'] = lambda x: "%#6.3g" % x.pearson_chi2
    info['F-statistic:'] = lambda x: "%#8.4g" % x.fvalue
    info['Prob (F-statistic):'] = lambda x: "%#6.3g" % x.f_pvalue
    info['Scale:'] = lambda x: "%#8.5g" % x.scale

    out = OrderedDict()
    for key, func in info.items():
        try:
            out[key] = func(results)
        except (AttributeError, KeyError, NotImplementedError):
            # NOTE: some models do not have loglike defined (RLM),
            # so raise NotImplementedError
            pass
    return out

312 

313 

def summary_params(results, yname=None, xname=None, alpha=.05, use_t=True,
                   skip_header=False, float_format="%.4f"):
    '''Build a table of parameter estimates from a results instance

    Parameters
    ----------
    results : results instance or tuple
        Either a results instance providing ``params``, ``bse``,
        ``tvalues``, ``pvalues`` and ``conf_int(alpha)``, or a 6-tuple
        ``(results, params, bse, tvalues, pvalues, conf_int)`` carrying
        these values explicitly.
    yname : {str, None}
        Accepted for interface compatibility; not used here.
    xname : {list[str], None}
        Row labels for the parameters. When empty or None, the labels are
        taken from ``results.model.data.param_names``, falling back to
        ``results.model.exog_names``.
    alpha : float
        significance level for the confidence intervals
    use_t : bool
        If True the statistic columns are labelled 't' and 'P>|t|',
        otherwise 'z' and 'P>|z|'.
    skip_header : bool
        Accepted for interface compatibility; not used here.
    float_format : str
        Accepted for interface compatibility; not used here.

    Returns
    -------
    DataFrame
        One row per parameter with the columns coefficient, standard error,
        test statistic, p-value and the two confidence bounds.
    '''
    if isinstance(results, tuple):
        results, params, bse, tvalues, pvalues, conf_int = results
    else:
        params, bse = results.params, results.bse
        tvalues, pvalues = results.tvalues, results.pvalues
        conf_int = results.conf_int(alpha)

    # One row per parameter: the four statistics followed by the interval.
    estimates = np.array([params, bse, tvalues, pvalues]).T
    frame = pd.DataFrame(np.hstack([estimates, conf_int]))

    stat = 't' if use_t else 'z'
    frame.columns = ['Coef.', 'Std.Err.', stat, 'P>|' + stat + '|',
                     '[' + str(alpha/2), str(1-alpha/2) + ']']

    if xname:
        frame.index = xname
        return frame

    try:
        frame.index = results.model.data.param_names
    except AttributeError:
        frame.index = results.model.exog_names
    return frame

372 

373 

374# Vertical summary instance for multiple models 

def _col_params(result, float_format='%.4f', stars=True):
    '''Stack coefficients and standard errors in single column

    Parameters
    ----------
    result : results instance
        Passed to ``summary_params``; ``rsquared`` and ``rsquared_adj`` are
        added as an extra 'R-squared' row when available.
    float_format : str
        Format applied to coefficients, standard errors and R-squared.
    stars : bool
        If True, append one '*' to the coefficient for each of the
        thresholds p < .1, p < .05 and p < .01 that is met.

    Returns
    -------
    DataFrame
        A single column named after ``result.model.endog_names`` holding
        the stacked, formatted values.
    '''

    # Extract parameters
    res = summary_params(result)
    # Format float
    for col in res.columns[:2]:
        res[col] = res[col].apply(lambda x: float_format % x)
    # Std.Errors in parentheses
    res.iloc[:, 1] = '(' + res.iloc[:, 1] + ')'
    # Significance stars: each threshold that is met adds one more star
    if stars:
        for threshold in (.1, .05, .01):
            idx = res.iloc[:, 3] < threshold
            res.loc[idx, res.columns[0]] = res.loc[idx, res.columns[0]] + '*'
    # Stack Coefs and Std.Errors
    res = res.iloc[:, :2]
    rsquared = getattr(result, 'rsquared', np.nan)
    rsquared_adj = getattr(result, 'rsquared_adj', np.nan)
    r_result = pd.DataFrame({'Basic': [rsquared], 'Adj.': [rsquared_adj]},
                            index=['R-squared'])
    if not np.all(np.isnan(np.asarray(r_result))):
        for col in r_result:
            r_result[col] = r_result[col].apply(lambda x: float_format % x)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent row-wise append.
        try:
            res = pd.concat([res, r_result], sort=True)
        except TypeError:
            # TODO: Remove when min pandas >= 0.23 (no ``sort`` keyword)
            res = pd.concat([res, r_result])
    res = res.stack()
    res = pd.DataFrame(res)
    res.columns = [str(result.model.endog_names)]
    return res

416 

417 

def _col_info(result, info_dict=None):
    '''Stack model info in a column

    Each callable in ``info_dict`` is applied to ``result``; a callable that
    raises AttributeError contributes an empty string. Entries whose value
    is itself a dict are skipped.

    Returns a one-column DataFrame named after ``result.model.endog_names``
    and indexed by the keys that were used.
    '''
    getters = {} if info_dict is None else info_dict

    labels = []
    values = []
    for label in getters:
        getter = getters[label]
        if isinstance(getter, dict):
            # this is a specific model info_dict, but not for this result...
            continue
        try:
            value = getter(result)
        except AttributeError:
            value = ''
        values.append(value)
        labels.append(label)

    column = str(result.model.endog_names)
    return pd.DataFrame({column: values}, index=labels)

437 

438 

def _make_unique(list_of_names):
    '''Return names that are safe to use as distinct column labels.

    If all names are already distinct the input is returned unchanged.
    Otherwise every name gets a suffix made of a space and as many "I"
    characters as the number of times that name has occurred so far
    ("a I", "a II", ...).
    '''
    if len(list_of_names) == len(set(list_of_names)):
        return list_of_names

    # pandas does not like it if multiple columns have the same names
    tallies = {}
    labelled = []
    for name in list_of_names:
        tallies[name] = tallies.get(name, "") + "I"
        labelled.append(name + " " + tallies[name])
    return labelled

450 

451 

def summary_col(results, float_format='%.4f', model_names=(), stars=False,
                info_dict=None, regressor_order=(), drop_omitted=False):
    """
    Summarize multiple results instances side-by-side (coefs and SEs)

    Parameters
    ----------
    results : statsmodels results instance or list of result instances
    float_format : str, optional
        float format for coefficients and standard errors
        Default : '%.4f'
    model_names : list[str], optional
        Must have same length as the number of results. If the names are not
        unique, a roman number will be appended to all model names
    stars : bool
        print significance stars
    info_dict : dict
        dict of functions to be applied to results instances to retrieve
        model info. To use specific information for different models, add a
        (nested) info_dict with model name as the key.
        Example: `info_dict = {"N":lambda x:(x.nobs), "R2": ..., "OLS":{
        "R2":...}}` would only show `R2` for OLS regression models, but
        additionally `N` for all other results.
        Default : None (use the info_dict specified in
        result.default_model_infos, if this property exists)
    regressor_order : list[str] or tuple[str], optional
        names of the regressors in the desired order. All regressors
        not specified will be appended to the end of the list.
    drop_omitted : bool, optional
        Includes regressors that are not specified in regressor_order. If
        False, regressors not specified will be appended to end of the list.
        If True, only regressors in regressor_order will be included.

    Returns
    -------
    Summary
        Summary instance holding one table with a column per model.
    """

    if not isinstance(results, list):
        results = [results]

    cols = [_col_params(x, stars=stars, float_format=float_format) for x in
            results]

    # Unique column names (pandas has problems merging otherwise)
    if model_names:
        colnames = _make_unique(model_names)
    else:
        colnames = _make_unique([x.columns[0] for x in cols])
    for col, name in zip(cols, colnames):
        col.columns = [name]

    def merg(x, y):
        return x.merge(y, how='outer', right_index=True,
                       left_index=True)

    summ = reduce(merg, cols)

    if regressor_order:
        # Accept any sequence: a tuple (the default's type) cannot be
        # concatenated with a list and is read by .loc as a multi-axis key.
        regressor_order = list(regressor_order)
        varnames = summ.index.get_level_values(0).tolist()
        ordered = [x for x in regressor_order if x in varnames]
        excluded = regressor_order + ['']
        unordered = [x for x in varnames if x not in excluded]
        order = ordered + list(np.unique(unordered))

        def f(idx):
            return sum([[x + 'coef', x + 'stde'] for x in idx], [])

        # Temporarily tag the rows so coefficient and standard-error rows
        # of one regressor stay together while reordering.
        summ.index = f(pd.unique(varnames))
        summ = summ.reindex(f(order))
        summ.index = [x[:-4] for x in summ.index]
        if drop_omitted:
            summ = summ.loc[regressor_order]

    # Blank the label on every second row (the standard-error rows).
    idx = np.arange(summ.shape[0]) % 2 == 1
    summ.index = np.where(idx, '', summ.index.get_level_values(0))

    # add infos about the models.
    if info_dict:
        cols = [_col_info(x, info_dict.get(x.model.__class__.__name__,
                                           info_dict)) for x in results]
    else:
        cols = [_col_info(x, getattr(x, "default_model_infos", None)) for x in
                results]
    # use unique column names, otherwise the merge will not succeed
    for df, name in zip(cols, _make_unique([df.columns[0] for df in cols])):
        df.columns = [name]

    info = reduce(merg, cols)
    dat = pd.DataFrame(np.vstack([summ, info]))  # pd.concat better, but error
    dat.columns = summ.columns
    dat.index = pd.Index(summ.index.tolist() + info.index.tolist())
    summ = dat

    summ = summ.fillna('')

    smry = Summary()
    smry._merge_latex = True
    smry.add_df(summ, header=True, align='l')
    smry.add_text('Standard errors in parentheses.')
    if stars:
        smry.add_text('* p<.1, ** p<.05, ***p<.01')

    return smry

555 

556 

def _formatter(element, float_format='%.4f'):
    '''Return ``element`` as a stripped string.

    ``float_format % element`` is tried first; if that raises ValueError or
    TypeError, ``str(element)`` is used instead.
    '''
    try:
        return (float_format % element).strip()
    except (ValueError, TypeError):
        return str(element).strip()

563 

564 

def _df_to_simpletable(df, align='r', float_format="%.4f", header=True,
                       index=True, table_dec_above='-', table_dec_below=None,
                       header_dec_below='-', pad_col=0, pad_index=0):
    '''Convert a DataFrame into a SimpleTable.

    Every cell of a copy of ``df`` is turned into a string with
    ``_formatter``. Column labels become the headers when ``header`` is
    true, and row labels become the stubs when ``index`` is true. The
    ``pad_index`` spaces are appended to the stubs or, when there are no
    stubs, to the entries of the first data column. ``pad_col`` widens the
    text column separator; the remaining arguments are copied into the
    table's text and LaTeX output formats.
    '''
    cells = df.copy().applymap(lambda x: _formatter(x, float_format))

    headers = [str(x) for x in cells.columns.tolist()] if header else None

    if index:
        stubs = [str(x) + int(pad_index) * ' ' for x in cells.index.tolist()]
    else:
        # No stub column: pad the first data column instead.
        cells.iloc[:, 0] = [str(x) + int(pad_index) * ' '
                            for x in cells.iloc[:, 0]]
        stubs = None

    table = SimpleTable(np.array(cells), headers=headers, stubs=stubs,
                        ltx_fmt=fmt_latex, txt_fmt=fmt_txt)

    table.output_formats['latex']['data_aligns'] = align
    txt_options = table.output_formats['txt']
    txt_options['data_aligns'] = align
    txt_options['table_dec_above'] = table_dec_above
    txt_options['table_dec_below'] = table_dec_below
    txt_options['header_dec_below'] = header_dec_below
    txt_options['colsep'] = ' ' * int(pad_col + 1)
    return table

589 

590 

def _simple_tables(tables, settings, pad_col=None, pad_index=None):
    '''Convert each stored DataFrame into a SimpleTable.

    Parameters
    ----------
    tables : list of DataFrame
    settings : list of dict
        One dict per table with the keys 'index', 'header', 'align' and
        'float_format'.
    pad_col : list of int, optional
        Per-table column padding; zeros when omitted.
    pad_index : list of int, optional
        Per-table first-column padding; zeros when omitted.

    Returns
    -------
    list
        One converted table per entry in ``tables``, in the same order.
    '''
    if pad_col is None:
        pad_col = [0] * len(tables)
    if pad_index is None:
        pad_index = [0] * len(tables)

    simple_tables = []
    for i, v in enumerate(tables):
        options = settings[i]
        # Use the float format stored for this table. Previously the format
        # of the first table was applied to all of them, so a format given
        # for a later table was silently ignored.
        simple_tables.append(_df_to_simpletable(
            v, align=options['align'],
            float_format=options['float_format'],
            header=options['header'], index=options['index'],
            pad_col=pad_col[i],
            pad_index=pad_index[i]))
    return simple_tables