Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import numpy as np 

2import pandas as pd 

3from collections import OrderedDict 

4from statsmodels.base.model import LikelihoodModelResults 

5 

6 

7class BayesGaussMI(object): 

8 """ 

9 Bayesian Imputation using a Gaussian model. 

10 

11 The approach is Bayesian. The goal is to sample from the joint 

12 distribution of the mean vector, covariance matrix, and missing 

13 data values given the observed data values. Conjugate priors for 

14 the population mean and covariance matrix are used. Gibbs 

15 sampling is used to update the mean vector, covariance matrix, and 

16 missing data values in turn. After burn-in, the imputed complete 

17 data sets from the Gibbs chain can be used in multiple imputation 

18 analyses (MI). 

19 

20 Parameters 

21 ---------- 

22 data : ndarray 

23 The array of data to be imputed. Values in the array equal to 

24 NaN are imputed. 

25 mean_prior : ndarray, optional 

26 The covariance matrix of the Gaussian prior distribution for 

27 the mean vector. If not provided, the identity matrix is 

28 used. 

29 cov_prior : ndarray, optional 

30 The center matrix for the inverse Wishart prior distribution 

31 for the covariance matrix. If not provided, the identity 

32 matrix is used. 

33 cov_prior_df : positive float 

34 The degrees of freedom of the inverse Wishart prior 

35 distribution for the covariance matrix. Defaults to 1. 

36 

37 Examples 

38 -------- 

39 A basic example with OLS. Data is generated assuming 10% is missing at 

40 random. 

41 

42 >>> import numpy as np 

43 >>> x = np.random.standard_normal((1000, 2)) 

44 >>> x.flat[np.random.sample(2000) < 0.1] = np.nan 

45 

46 The imputer is used with ``MI``. 

47 

48 >>> import statsmodels.api as sm 

49 >>> def model_args_fn(x): 

50 ... # Return endog, exog from x 

51 ... return x[:, 0], x[:, 1:] 

52 >>> imp = sm.BayesGaussMI(x) 

53 >>> mi = sm.MI(imp, sm.OLS, model_args_fn) 

54 """ 

55 

56 def __init__(self, data, mean_prior=None, cov_prior=None, cov_prior_df=1): 

57 

58 self.exog_names = None 

59 if type(data) is pd.DataFrame: 

60 self.exog_names = data.columns 

61 

62 data = np.asarray(data) 

63 self.data = data 

64 self._data = data 

65 self.mask = np.isnan(data) 

66 self.nobs = self.mask.shape[0] 

67 self.nvar = self.mask.shape[1] 

68 

69 # Identify all distinct missing data patterns 

70 z = 1 + np.log(1 + np.arange(self.mask.shape[1])) 

71 c = np.dot(self.mask, z) 

72 rowmap = OrderedDict() 

73 for i, v in enumerate(c): 

74 if v == 0: 

75 # No missing values 

76 continue 

77 if v not in rowmap: 

78 rowmap[v] = [] 

79 rowmap[v].append(i) 

80 self.patterns = [np.asarray(v) for v in rowmap.values()] 

81 

82 # Simple starting values for mean and covariance 

83 p = self._data.shape[1] 

84 self.cov = np.eye(p) 

85 mean = [] 

86 for i in range(p): 

87 v = self._data[:, i] 

88 v = v[np.isfinite(v)] 

89 if len(v) == 0: 

90 msg = "Column %d has no observed values" % i 

91 raise ValueError(msg) 

92 mean.append(v.mean()) 

93 self.mean = np.asarray(mean) 

94 

95 # Default covariance matrix of the (Gaussian) mean prior 

96 if mean_prior is None: 

97 mean_prior = np.eye(p) 

98 self.mean_prior = mean_prior 

99 

100 # Default center matrix of the (inverse Wishart) covariance prior 

101 if cov_prior is None: 

102 cov_prior = np.eye(p) 

103 self.cov_prior = cov_prior 

104 

105 # Degrees of freedom for the (inverse Wishart) covariance prior 

106 self.cov_prior_df = cov_prior_df 

107 

108 def update(self): 

109 """ 

110 Cycle through all Gibbs updates. 

111 """ 

112 

113 self.update_data() 

114 

115 # Need to update data first 

116 self.update_mean() 

117 self.update_cov() 

118 

119 def update_data(self): 

120 """ 

121 Gibbs update of the missing data values. 

122 """ 

123 

124 for ix in self.patterns: 

125 

126 i = ix[0] 

127 ix_miss = np.flatnonzero(self.mask[i, :]) 

128 ix_obs = np.flatnonzero(~self.mask[i, :]) 

129 

130 mm = self.mean[ix_miss] 

131 mo = self.mean[ix_obs] 

132 

133 voo = self.cov[ix_obs, :][:, ix_obs] 

134 vmm = self.cov[ix_miss, :][:, ix_miss] 

135 vmo = self.cov[ix_miss, :][:, ix_obs] 

136 

137 r = self._data[ix, :][:, ix_obs] - mo 

138 cm = mm + np.dot(vmo, np.linalg.solve(voo, r.T)).T 

139 cv = vmm - np.dot(vmo, np.linalg.solve(voo, vmo.T)) 

140 

141 cs = np.linalg.cholesky(cv) 

142 u = np.random.normal(size=(len(ix), len(ix_miss))) 

143 self._data[np.ix_(ix, ix_miss)] = cm + np.dot(u, cs.T) 

144 

145 # Set the user-visible data set. 

146 if self.exog_names is not None: 

147 self.data = pd.DataFrame( 

148 self._data, 

149 columns=self.exog_names, 

150 copy=False) 

151 else: 

152 self.data = self._data 

153 

154 def update_mean(self): 

155 """ 

156 Gibbs update of the mean vector. 

157 

158 Do not call until update_data has been called once. 

159 """ 

160 # https://stats.stackexchange.com/questions/28744/multivariate-normal-posterior 

161 

162 # Posterior covariance matrix of the mean 

163 cm = np.linalg.solve(self.cov/self.nobs + self.mean_prior, 

164 self.mean_prior / self.nobs) 

165 cm = np.dot(self.cov, cm) 

166 

167 # Posterior mean of the mean 

168 vm = np.linalg.solve(self.cov, self._data.sum(0)) 

169 vm = np.dot(cm, vm) 

170 

171 # Sample 

172 r = np.linalg.cholesky(cm) 

173 self.mean = vm + np.dot(r, np.random.normal(0, 1, self.nvar)) 

174 

175 def update_cov(self): 

176 """ 

177 Gibbs update of the covariance matrix. 

178 

179 Do not call until update_data has been called once. 

180 """ 

181 # https://stats.stackexchange.com/questions/50844/estimating-the-covariance-posterior-distribution-of-a-multivariate-gaussian 

182 

183 r = self._data - self.mean 

184 gr = np.dot(r.T, r) 

185 a = gr + self.cov_prior 

186 df = int(np.ceil(self.nobs + self.cov_prior_df)) 

187 

188 r = np.linalg.cholesky(np.linalg.inv(a)) 

189 x = np.dot(np.random.normal(size=(df, self.nvar)), r.T) 

190 ma = np.dot(x.T, x) 

191 self.cov = np.linalg.inv(ma) 

192 

193 

194class MI(object): 

195 """ 

196 MI performs multiple imputation using a provided imputer object. 

197 

198 Parameters 

199 ---------- 

200 imp : object 

201 An imputer class, such as BayesGaussMI. 

202 model : model class 

203 Any statsmodels model class. 

204 model_args_fn : function 

205 A function taking an imputed dataset as input and returning 

206 endog, exog. If the model is fit using a formula, returns 

207 a DataFrame used to build the model. Optional when a formula 

208 is used. 

209 model_kwds_fn : function, optional 

210 A function taking an imputed dataset as input and returning 

211 a dictionary of model keyword arguments. 

212 formula : str, optional 

213 If provided, the model is constructed using the `from_formula` 

214 class method, otherwise the `__init__` method is used. 

215 fit_args : list-like, optional 

216 List of arguments to be passed to the fit method 

217 fit_kwds : dict-like, optional 

218 Keyword arguments to be passed to the fit method 

219 xfunc : function mapping ndarray to ndarray 

220 A function that is applied to the complete data matrix 

221 prior to fitting the model 

222 burn : int 

223 Number of burn-in iterations 

224 nrep : int 

225 Number of imputed data sets to use in the analysis 

226 skip : int 

227 Number of Gibbs iterations to skip between successive 

228 multiple imputation fits. 

229 

230 Notes 

231 ----- 

232 The imputer object must have an 'update' method, and a 'data' 

233 attribute that contains the current imputed dataset. 

234 

235 xfunc can be used to introduce domain constraints, e.g. when 

236 imputing binary data the imputed continuous values can be rounded 

237 to 0/1. 

238 """ 

239 

240 def __init__(self, imp, model, model_args_fn=None, model_kwds_fn=None, 

241 formula=None, fit_args=None, fit_kwds=None, xfunc=None, 

242 burn=100, nrep=20, skip=10): 

243 

244 # The imputer 

245 self.imp = imp 

246 

247 # The number of imputed data sets to skip between each imputed 

248 # data set tha that is used in the analysis. 

249 self.skip = skip 

250 

251 # The model class 

252 self.model = model 

253 self.formula = formula 

254 

255 if model_args_fn is None: 

256 def f(x): 

257 return [] 

258 model_args_fn = f 

259 self.model_args_fn = model_args_fn 

260 

261 if model_kwds_fn is None: 

262 def f(x): 

263 return {} 

264 model_kwds_fn = f 

265 self.model_kwds_fn = model_kwds_fn 

266 

267 if fit_args is None: 

268 def f(x): 

269 return [] 

270 fit_args = f 

271 self.fit_args = fit_args 

272 

273 if fit_kwds is None: 

274 def f(x): 

275 return {} 

276 fit_kwds = f 

277 self.fit_kwds = fit_kwds 

278 

279 self.xfunc = xfunc 

280 self.nrep = nrep 

281 self.skip = skip 

282 

283 # Burn-in 

284 for k in range(burn): 

285 imp.update() 

286 

287 def fit(self, results_cb=None): 

288 """ 

289 Impute datasets, fit models, and pool results. 

290 

291 Parameters 

292 ---------- 

293 results_cb : function, optional 

294 If provided, each results instance r is passed through `results_cb`, 

295 then appended to the `results` attribute of the MIResults object. 

296 To save complete results, use `results_cb=lambda x: x`. The default 

297 behavior is to save no results. 

298 

299 Returns 

300 ------- 

301 A MIResults object. 

302 """ 

303 

304 par, cov = [], [] 

305 all_results = [] 

306 

307 for k in range(self.nrep): 

308 

309 for k in range(self.skip+1): 

310 self.imp.update() 

311 

312 da = self.imp.data 

313 

314 if self.xfunc is not None: 

315 da = self.xfunc(da) 

316 

317 if self.formula is None: 

318 model = self.model(*self.model_args_fn(da), 

319 **self.model_kwds_fn(da)) 

320 else: 

321 model = self.model.from_formula( 

322 self.formula, *self.model_args_fn(da), 

323 **self.model_kwds_fn(da)) 

324 

325 result = model.fit(*self.fit_args(da), **self.fit_kwds(da)) 

326 

327 if results_cb is not None: 

328 all_results.append(results_cb(result)) 

329 

330 par.append(np.asarray(result.params.copy())) 

331 cov.append(np.asarray(result.cov_params().copy())) 

332 

333 params, cov_params, fmi = self._combine(par, cov) 

334 

335 r = MIResults(self, model, params, cov_params) 

336 r.fmi = fmi 

337 

338 r.results = all_results 

339 

340 return r 

341 

342 def _combine(self, par, cov): 

343 # Helper function to apply "Rubin's combining rule" 

344 

345 par = np.asarray(par) 

346 

347 # Number of imputations 

348 m = par.shape[0] 

349 

350 # Point estimate 

351 params = par.mean(0) 

352 

353 # Within-imputation covariance 

354 wcov = sum(cov) / len(cov) 

355 

356 # Between-imputation covariance 

357 bcov = np.cov(par.T) 

358 bcov = np.atleast_2d(bcov) 

359 

360 # Overall covariance 

361 covp = wcov + (1 + 1/float(m))*bcov 

362 

363 # Fraction of missing information 

364 fmi = (1 + 1/float(m)) * np.diag(bcov) / np.diag(covp) 

365 

366 return params, covp, fmi 

367 

368 

369class MIResults(LikelihoodModelResults): 

370 """ 

371 A results class for multiple imputation (MI). 

372 

373 Parameters 

374 ---------- 

375 mi : MI instance 

376 The MI object that produced the results 

377 model : instance of statsmodels model class 

378 This can be any instance from the multiple imputation runs. 

379 It is used to get class information, the specific parameter 

380 and data values are not used. 

381 params : array_like 

382 The overall multiple imputation parameter estimates. 

383 normalized_cov_params : array_like (2d) 

384 The overall variance covariance matrix of the estimates. 

385 """ 

386 

387 def __init__(self, mi, model, params, normalized_cov_params): 

388 

389 super(MIResults, self).__init__(model, params, normalized_cov_params) 

390 self.mi = mi 

391 self._model = model 

392 

393 def summary(self, title=None, alpha=.05): 

394 """ 

395 Summarize the results of running multiple imputation. 

396 

397 Parameters 

398 ---------- 

399 title : str, optional 

400 Title for the top table. If not None, then this replaces 

401 the default title 

402 alpha : float 

403 Significance level for the confidence intervals 

404 

405 Returns 

406 ------- 

407 smry : Summary instance 

408 This holds the summary tables and text, which can be 

409 printed or converted to various output formats. 

410 """ 

411 

412 from statsmodels.iolib import summary2 

413 

414 smry = summary2.Summary() 

415 float_format = "%8.3f" 

416 

417 info = OrderedDict() 

418 info["Method:"] = "MI" 

419 info["Model:"] = self.mi.model.__name__ 

420 info["Dependent variable:"] = self._model.endog_names 

421 info["Sample size:"] = "%d" % self.mi.imp.data.shape[0] 

422 info["Num. imputations"] = "%d" % self.mi.nrep 

423 

424 smry.add_dict(info, align='l', float_format=float_format) 

425 

426 param = summary2.summary_params(self, alpha=alpha) 

427 param["FMI"] = self.fmi 

428 

429 smry.add_df(param, float_format=float_format) 

430 smry.add_title(title=title, results=self) 

431 

432 return smry