Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/imputation/bayes_mi.py : 10%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import numpy as np
2import pandas as pd
3from collections import OrderedDict
4from statsmodels.base.model import LikelihoodModelResults
7class BayesGaussMI(object):
8 """
9 Bayesian Imputation using a Gaussian model.
11 The approach is Bayesian. The goal is to sample from the joint
12 distribution of the mean vector, covariance matrix, and missing
13 data values given the observed data values. Conjugate priors for
14 the population mean and covariance matrix are used. Gibbs
15 sampling is used to update the mean vector, covariance matrix, and
16 missing data values in turn. After burn-in, the imputed complete
17 data sets from the Gibbs chain can be used in multiple imputation
18 analyses (MI).
20 Parameters
21 ----------
22 data : ndarray
23 The array of data to be imputed. Values in the array equal to
24 NaN are imputed.
25 mean_prior : ndarray, optional
26 The covariance matrix of the Gaussian prior distribution for
27 the mean vector. If not provided, the identity matrix is
28 used.
29 cov_prior : ndarray, optional
30 The center matrix for the inverse Wishart prior distribution
31 for the covariance matrix. If not provided, the identity
32 matrix is used.
33 cov_prior_df : positive float
34 The degrees of freedom of the inverse Wishart prior
35 distribution for the covariance matrix. Defaults to 1.
37 Examples
38 --------
39 A basic example with OLS. Data is generated assuming 10% is missing at
40 random.
42 >>> import numpy as np
43 >>> x = np.random.standard_normal((1000, 2))
44 >>> x.flat[np.random.sample(2000) < 0.1] = np.nan
46 The imputer is used with ``MI``.
48 >>> import statsmodels.api as sm
49 >>> def model_args_fn(x):
50 ... # Return endog, exog from x
51 ... return x[:, 0], x[:, 1:]
52 >>> imp = sm.BayesGaussMI(x)
53 >>> mi = sm.MI(imp, sm.OLS, model_args_fn)
54 """
56 def __init__(self, data, mean_prior=None, cov_prior=None, cov_prior_df=1):
58 self.exog_names = None
59 if type(data) is pd.DataFrame:
60 self.exog_names = data.columns
62 data = np.asarray(data)
63 self.data = data
64 self._data = data
65 self.mask = np.isnan(data)
66 self.nobs = self.mask.shape[0]
67 self.nvar = self.mask.shape[1]
69 # Identify all distinct missing data patterns
70 z = 1 + np.log(1 + np.arange(self.mask.shape[1]))
71 c = np.dot(self.mask, z)
72 rowmap = OrderedDict()
73 for i, v in enumerate(c):
74 if v == 0:
75 # No missing values
76 continue
77 if v not in rowmap:
78 rowmap[v] = []
79 rowmap[v].append(i)
80 self.patterns = [np.asarray(v) for v in rowmap.values()]
82 # Simple starting values for mean and covariance
83 p = self._data.shape[1]
84 self.cov = np.eye(p)
85 mean = []
86 for i in range(p):
87 v = self._data[:, i]
88 v = v[np.isfinite(v)]
89 if len(v) == 0:
90 msg = "Column %d has no observed values" % i
91 raise ValueError(msg)
92 mean.append(v.mean())
93 self.mean = np.asarray(mean)
95 # Default covariance matrix of the (Gaussian) mean prior
96 if mean_prior is None:
97 mean_prior = np.eye(p)
98 self.mean_prior = mean_prior
100 # Default center matrix of the (inverse Wishart) covariance prior
101 if cov_prior is None:
102 cov_prior = np.eye(p)
103 self.cov_prior = cov_prior
105 # Degrees of freedom for the (inverse Wishart) covariance prior
106 self.cov_prior_df = cov_prior_df
108 def update(self):
109 """
110 Cycle through all Gibbs updates.
111 """
113 self.update_data()
115 # Need to update data first
116 self.update_mean()
117 self.update_cov()
119 def update_data(self):
120 """
121 Gibbs update of the missing data values.
122 """
124 for ix in self.patterns:
126 i = ix[0]
127 ix_miss = np.flatnonzero(self.mask[i, :])
128 ix_obs = np.flatnonzero(~self.mask[i, :])
130 mm = self.mean[ix_miss]
131 mo = self.mean[ix_obs]
133 voo = self.cov[ix_obs, :][:, ix_obs]
134 vmm = self.cov[ix_miss, :][:, ix_miss]
135 vmo = self.cov[ix_miss, :][:, ix_obs]
137 r = self._data[ix, :][:, ix_obs] - mo
138 cm = mm + np.dot(vmo, np.linalg.solve(voo, r.T)).T
139 cv = vmm - np.dot(vmo, np.linalg.solve(voo, vmo.T))
141 cs = np.linalg.cholesky(cv)
142 u = np.random.normal(size=(len(ix), len(ix_miss)))
143 self._data[np.ix_(ix, ix_miss)] = cm + np.dot(u, cs.T)
145 # Set the user-visible data set.
146 if self.exog_names is not None:
147 self.data = pd.DataFrame(
148 self._data,
149 columns=self.exog_names,
150 copy=False)
151 else:
152 self.data = self._data
154 def update_mean(self):
155 """
156 Gibbs update of the mean vector.
158 Do not call until update_data has been called once.
159 """
160 # https://stats.stackexchange.com/questions/28744/multivariate-normal-posterior
162 # Posterior covariance matrix of the mean
163 cm = np.linalg.solve(self.cov/self.nobs + self.mean_prior,
164 self.mean_prior / self.nobs)
165 cm = np.dot(self.cov, cm)
167 # Posterior mean of the mean
168 vm = np.linalg.solve(self.cov, self._data.sum(0))
169 vm = np.dot(cm, vm)
171 # Sample
172 r = np.linalg.cholesky(cm)
173 self.mean = vm + np.dot(r, np.random.normal(0, 1, self.nvar))
175 def update_cov(self):
176 """
177 Gibbs update of the covariance matrix.
179 Do not call until update_data has been called once.
180 """
181 # https://stats.stackexchange.com/questions/50844/estimating-the-covariance-posterior-distribution-of-a-multivariate-gaussian
183 r = self._data - self.mean
184 gr = np.dot(r.T, r)
185 a = gr + self.cov_prior
186 df = int(np.ceil(self.nobs + self.cov_prior_df))
188 r = np.linalg.cholesky(np.linalg.inv(a))
189 x = np.dot(np.random.normal(size=(df, self.nvar)), r.T)
190 ma = np.dot(x.T, x)
191 self.cov = np.linalg.inv(ma)
194class MI(object):
195 """
196 MI performs multiple imputation using a provided imputer object.
198 Parameters
199 ----------
200 imp : object
201 An imputer class, such as BayesGaussMI.
202 model : model class
203 Any statsmodels model class.
204 model_args_fn : function
205 A function taking an imputed dataset as input and returning
206 endog, exog. If the model is fit using a formula, returns
207 a DataFrame used to build the model. Optional when a formula
208 is used.
209 model_kwds_fn : function, optional
210 A function taking an imputed dataset as input and returning
211 a dictionary of model keyword arguments.
212 formula : str, optional
213 If provided, the model is constructed using the `from_formula`
214 class method, otherwise the `__init__` method is used.
215 fit_args : list-like, optional
216 List of arguments to be passed to the fit method
217 fit_kwds : dict-like, optional
218 Keyword arguments to be passed to the fit method
219 xfunc : function mapping ndarray to ndarray
220 A function that is applied to the complete data matrix
221 prior to fitting the model
222 burn : int
223 Number of burn-in iterations
224 nrep : int
225 Number of imputed data sets to use in the analysis
226 skip : int
227 Number of Gibbs iterations to skip between successive
228 multiple imputation fits.
230 Notes
231 -----
232 The imputer object must have an 'update' method, and a 'data'
233 attribute that contains the current imputed dataset.
235 xfunc can be used to introduce domain constraints, e.g. when
236 imputing binary data the imputed continuous values can be rounded
237 to 0/1.
238 """
240 def __init__(self, imp, model, model_args_fn=None, model_kwds_fn=None,
241 formula=None, fit_args=None, fit_kwds=None, xfunc=None,
242 burn=100, nrep=20, skip=10):
244 # The imputer
245 self.imp = imp
247 # The number of imputed data sets to skip between each imputed
248 # data set tha that is used in the analysis.
249 self.skip = skip
251 # The model class
252 self.model = model
253 self.formula = formula
255 if model_args_fn is None:
256 def f(x):
257 return []
258 model_args_fn = f
259 self.model_args_fn = model_args_fn
261 if model_kwds_fn is None:
262 def f(x):
263 return {}
264 model_kwds_fn = f
265 self.model_kwds_fn = model_kwds_fn
267 if fit_args is None:
268 def f(x):
269 return []
270 fit_args = f
271 self.fit_args = fit_args
273 if fit_kwds is None:
274 def f(x):
275 return {}
276 fit_kwds = f
277 self.fit_kwds = fit_kwds
279 self.xfunc = xfunc
280 self.nrep = nrep
281 self.skip = skip
283 # Burn-in
284 for k in range(burn):
285 imp.update()
287 def fit(self, results_cb=None):
288 """
289 Impute datasets, fit models, and pool results.
291 Parameters
292 ----------
293 results_cb : function, optional
294 If provided, each results instance r is passed through `results_cb`,
295 then appended to the `results` attribute of the MIResults object.
296 To save complete results, use `results_cb=lambda x: x`. The default
297 behavior is to save no results.
299 Returns
300 -------
301 A MIResults object.
302 """
304 par, cov = [], []
305 all_results = []
307 for k in range(self.nrep):
309 for k in range(self.skip+1):
310 self.imp.update()
312 da = self.imp.data
314 if self.xfunc is not None:
315 da = self.xfunc(da)
317 if self.formula is None:
318 model = self.model(*self.model_args_fn(da),
319 **self.model_kwds_fn(da))
320 else:
321 model = self.model.from_formula(
322 self.formula, *self.model_args_fn(da),
323 **self.model_kwds_fn(da))
325 result = model.fit(*self.fit_args(da), **self.fit_kwds(da))
327 if results_cb is not None:
328 all_results.append(results_cb(result))
330 par.append(np.asarray(result.params.copy()))
331 cov.append(np.asarray(result.cov_params().copy()))
333 params, cov_params, fmi = self._combine(par, cov)
335 r = MIResults(self, model, params, cov_params)
336 r.fmi = fmi
338 r.results = all_results
340 return r
342 def _combine(self, par, cov):
343 # Helper function to apply "Rubin's combining rule"
345 par = np.asarray(par)
347 # Number of imputations
348 m = par.shape[0]
350 # Point estimate
351 params = par.mean(0)
353 # Within-imputation covariance
354 wcov = sum(cov) / len(cov)
356 # Between-imputation covariance
357 bcov = np.cov(par.T)
358 bcov = np.atleast_2d(bcov)
360 # Overall covariance
361 covp = wcov + (1 + 1/float(m))*bcov
363 # Fraction of missing information
364 fmi = (1 + 1/float(m)) * np.diag(bcov) / np.diag(covp)
366 return params, covp, fmi
369class MIResults(LikelihoodModelResults):
370 """
371 A results class for multiple imputation (MI).
373 Parameters
374 ----------
375 mi : MI instance
376 The MI object that produced the results
377 model : instance of statsmodels model class
378 This can be any instance from the multiple imputation runs.
379 It is used to get class information, the specific parameter
380 and data values are not used.
381 params : array_like
382 The overall multiple imputation parameter estimates.
383 normalized_cov_params : array_like (2d)
384 The overall variance covariance matrix of the estimates.
385 """
387 def __init__(self, mi, model, params, normalized_cov_params):
389 super(MIResults, self).__init__(model, params, normalized_cov_params)
390 self.mi = mi
391 self._model = model
393 def summary(self, title=None, alpha=.05):
394 """
395 Summarize the results of running multiple imputation.
397 Parameters
398 ----------
399 title : str, optional
400 Title for the top table. If not None, then this replaces
401 the default title
402 alpha : float
403 Significance level for the confidence intervals
405 Returns
406 -------
407 smry : Summary instance
408 This holds the summary tables and text, which can be
409 printed or converted to various output formats.
410 """
412 from statsmodels.iolib import summary2
414 smry = summary2.Summary()
415 float_format = "%8.3f"
417 info = OrderedDict()
418 info["Method:"] = "MI"
419 info["Model:"] = self.mi.model.__name__
420 info["Dependent variable:"] = self._model.endog_names
421 info["Sample size:"] = "%d" % self.mi.imp.data.shape[0]
422 info["Num. imputations"] = "%d" % self.mi.nrep
424 smry.add_dict(info, align='l', float_format=float_format)
426 param = summary2.summary_params(self, alpha=alpha)
427 param["FMI"] = self.fmi
429 smry.add_df(param, float_format=float_format)
430 smry.add_title(title=title, results=self)
432 return smry