Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of Patsy 

2# Copyright (C) 2011-2013 Nathaniel Smith <njs@pobox.com> 

3# See file LICENSE.txt for license information. 

4 

5# These are made available in the patsy.* namespace: 

6__all__ = ["dmatrix", "dmatrices", 

7 "incr_dbuilder", "incr_dbuilders"] 

8 

9# problems: 

10# statsmodels reluctant to pass around separate eval environment, suggesting 

11# that design_and_matrices-equivalent should return a formula_like 

12# is ModelDesc really the high-level thing? 

13# ModelDesign doesn't work -- need to work with the builder set 

14# want to be able to return either a matrix or a pandas dataframe 

15 

16import six 

17import numpy as np 

18from patsy import PatsyError 

19from patsy.design_info import DesignMatrix, DesignInfo 

20from patsy.eval import EvalEnvironment 

21from patsy.desc import ModelDesc 

22from patsy.build import (design_matrix_builders, 

23 build_design_matrices) 

24from patsy.util import (have_pandas, asarray_or_pandas, 

25 atleast_2d_column_default) 

26 

27if have_pandas: 

28 import pandas 

29 

30# Tries to build a (lhs, rhs) design given a formula_like and an incremental 

31# data source. If formula_like is not capable of doing this, then returns 

32# None. 

def _try_incr_builders(formula_like, data_iter_maker, eval_env,
                       NA_action):
    """Try to turn `formula_like` into a ``(lhs, rhs)`` pair of design infos.

    :arg formula_like: One of: a :class:`DesignInfo`, a 2-tuple of
      :class:`DesignInfo` objects, an object with a
      ``__patsy_get_model_desc__`` method, a formula string, or a
      :class:`ModelDesc`.
    :arg data_iter_maker: Zero-argument callable producing an iterator over
      data chunks; forwarded unchanged to :func:`design_matrix_builders`.
    :arg eval_env: Handed to ``__patsy_get_model_desc__`` and to
      :func:`design_matrix_builders`. Must be an :class:`EvalEnvironment`
      whenever a :class:`ModelDesc` ends up being built.
    :arg NA_action: Forwarded unchanged to :func:`design_matrix_builders`.
    :returns: A length-2 sequence ``(lhs, rhs)``, or ``None`` when
      `formula_like` is none of the supported kinds (e.g. an explicit
      matrix), so the caller can try something else.
    :raises PatsyError: If ``__patsy_get_model_desc__`` returns something
      that is not a :class:`ModelDesc`, or (Python 2 only) if a unicode
      formula contains non-ascii characters.
    """
    if isinstance(formula_like, DesignInfo):
        # A lone DesignInfo is the right-hand side; pair it with a design
        # info built from an empty term list standing in for the left side.
        return (design_matrix_builders([[]], data_iter_maker, eval_env,
                                       NA_action)[0],
                formula_like)
    if (isinstance(formula_like, tuple)
            and len(formula_like) == 2
            and isinstance(formula_like[0], DesignInfo)
            and isinstance(formula_like[1], DesignInfo)):
        # Already a finished (lhs, rhs) pair.
        return formula_like
    if hasattr(formula_like, "__patsy_get_model_desc__"):
        # Keep a handle on the provider: the error message must name the
        # object whose method misbehaved, not the bad value it returned.
        provider = formula_like
        formula_like = provider.__patsy_get_model_desc__(eval_env)
        if not isinstance(formula_like, ModelDesc):
            raise PatsyError("bad value from %r.__patsy_get_model_desc__"
                             % (provider,))
        # fallthrough
    # `unicode` only exists on Python 2; the six.PY3 check short-circuits
    # before the name is looked up on Python 3.
    if not six.PY3 and isinstance(formula_like, unicode):  # noqa: F821
        # Included for the convenience of people who are using py2 with
        # __future__.unicode_literals.
        try:
            formula_like = formula_like.encode("ascii")
        except UnicodeEncodeError:
            raise PatsyError(
                "On Python 2, formula strings must be either 'str' objects, "
                "or else 'unicode' objects containing only ascii "
                "characters. You passed a unicode string with non-ascii "
                "characters. I'm afraid you'll have to either switch to "
                "ascii-only, or else upgrade to Python 3.")
    if isinstance(formula_like, str):
        formula_like = ModelDesc.from_formula(formula_like)
        # fallthrough
    if isinstance(formula_like, ModelDesc):
        assert isinstance(eval_env, EvalEnvironment)
        return design_matrix_builders([formula_like.lhs_termlist,
                                       formula_like.rhs_termlist],
                                      data_iter_maker,
                                      eval_env,
                                      NA_action)
    else:
        return None

73 

def incr_dbuilder(formula_like, data_iter_maker, eval_env=0, NA_action="drop"):
    """Incrementally build the design information for a single design
    matrix from a data set that is too large to process in one piece.

    :arg formula_like: Accepts the same things as :func:`dmatrix`, with the
      exception of explicit matrices. That leaves: a formula string, a
      :class:`ModelDesc`, a :class:`DesignInfo`, or any object exposing a
      ``__patsy_get_model_desc__`` method.
    :arg data_iter_maker: A callable taking no arguments and returning an
      iterator over dict-like data objects. A plain iterator is not enough:
      a sufficiently complex formula (e.g. one with nested stateful
      transforms) may need to walk over the data more than once.
    :arg eval_env: Where to look up variables mentioned in `formula_like`
      that are not found in `data`. Either an :class:`EvalEnvironment`, or
      an integer depth that is handed to :meth:`EvalEnvironment.capture`.
      ``eval_env=0`` uses the context of whoever called
      :func:`incr_dbuilder`. Library code usually wants ``eval_env=1`` so
      that names resolve in the namespace of *its* caller.
    :arg NA_action: An :class:`NAAction` object or string. It decides which
      values count as 'missing' when the levels of categorical factors are
      being determined.
    :returns: A :class:`DesignInfo`
    :raises PatsyError: If `formula_like` is not a supported kind of object,
      or if it specifies outcome (left-hand side) variables.

    Tip: for `data_iter_maker`, write a generator like::

      def iter_maker():
          for data_chunk in my_data_store:
              yield data_chunk

    and pass `iter_maker` (*not* `iter_maker()`).

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    """
    # reference=1 makes an integer depth relative to our caller, matching
    # the eval_env contract above. NOTE(review): keep this call directly in
    # this function; moving it into a helper would presumably shift the
    # frame that gets captured -- confirm before refactoring.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    built = _try_incr_builders(formula_like, data_iter_maker, eval_env,
                               NA_action)
    if built is None:
        raise PatsyError("bad formula-like object")
    outcome_info = built[0]
    # A one-sided builder must not come with any left-hand side columns.
    if len(outcome_info.column_names) != 0:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return built[1]

119 

def incr_dbuilders(formula_like, data_iter_maker, eval_env=0,
                   NA_action="drop"):
    """Incrementally build the design information for a pair of design
    matrices (outcome and predictors) from a large data set.

    This is the two-sided counterpart of :func:`incr_dbuilder`, in the same
    way that :func:`dmatrices` is the two-sided counterpart of
    :func:`dmatrix`. The arguments are documented on :func:`incr_dbuilder`.

    :raises PatsyError: If `formula_like` is not a supported kind of object,
      or if it does not specify any outcome (left-hand side) variables.
    """
    # reference=1 makes an integer depth relative to our caller.
    # NOTE(review): keep this call directly in this function; moving it into
    # a helper would presumably shift the captured frame -- confirm.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    built = _try_incr_builders(formula_like, data_iter_maker, eval_env,
                               NA_action)
    if built is None:
        raise PatsyError("bad formula-like object")
    outcome_info = built[0]
    # A two-sided builder needs at least one left-hand side column.
    if len(outcome_info.column_names) == 0:
        raise PatsyError("model is missing required outcome variables")
    return built

136 

137# This always returns a length-two tuple, 

138# response, predictors 

139# where 

140# response is a DesignMatrix (possibly with 0 columns) 

141# predictors is a DesignMatrix 

142# The input 'formula_like' could be like: 

143# (np.ndarray, np.ndarray) 

144# (DesignMatrix, DesignMatrix) 

145# (None, DesignMatrix) 

146# np.ndarray # for predictor-only models 

147# DesignMatrix 

148# (None, np.ndarray) 

149# "y ~ x" 

150# ModelDesc(...) 

151# DesignInfo 

152# (DesignInfo, DesignInfo) 

153# any object with a special method __patsy_get_model_desc__ 

def _do_highlevel_design(formula_like, data, eval_env,
                         NA_action, return_type):
    """Shared implementation behind :func:`dmatrix` and :func:`dmatrices`.

    First tries to interpret `formula_like` as something design infos can be
    built from (formula string, :class:`ModelDesc`, :class:`DesignInfo`,
    ...). If that is not possible, `formula_like` is treated as an explicit
    matrix, or as a 2-tuple ``(lhs, rhs)`` of explicit matrices, and is
    normalized into the requested output format.

    :arg formula_like: See :func:`dmatrix`.
    :arg data: The data to build from; wrapped as a one-chunk data source.
    :arg eval_env: Forwarded to the builder machinery.
    :arg NA_action: Forwarded to the builder machinery.
    :arg return_type: ``"matrix"`` or ``"dataframe"``.
    :returns: A pair ``(lhs, rhs)``; for explicit matrices without a
      left-hand side, ``lhs`` has zero columns.
    :raises PatsyError: On an unusable `return_type`, a tuple whose length
      is not 2, or explicit matrices whose row counts or pandas indexes
      disagree.
    """
    want_frame = return_type == "dataframe"
    if want_frame and not have_pandas:
        raise PatsyError("pandas.DataFrame was requested, but pandas "
                         "is not installed")
    if return_type not in ("matrix", "dataframe"):
        raise PatsyError("unrecognized output type %r, should be "
                         "'matrix' or 'dataframe'" % (return_type,))

    def single_chunk_iter_maker():
        # Present the in-memory data as a data source with exactly one chunk.
        return iter([data])

    design_infos = _try_incr_builders(formula_like, single_chunk_iter_maker,
                                      eval_env, NA_action)
    if design_infos is not None:
        return build_design_matrices(design_infos, data,
                                     NA_action=NA_action,
                                     return_type=return_type)

    # No builders, but maybe we can still get matrices.
    if isinstance(formula_like, tuple):
        if len(formula_like) != 2:
            raise PatsyError("don't know what to do with a length %s "
                             "matrices tuple"
                             % (len(formula_like),))
        outcome, predictors = formula_like
    else:
        outcome = None
        # subok=True is necessary here to allow DesignMatrixes to pass
        # through.
        predictors = asarray_or_pandas(formula_like, subok=True)

    # At this point we hold explicit matrices in one of these forms:
    #   -- an ndarray or subclass
    #   -- a DesignMatrix
    #   -- a pandas.Series
    #   -- a pandas.DataFrame
    # and each has to be converted to the standard output format.
    def _standardize(matrix, column_prefix):
        # Returns (converted matrix, original pandas index or None).
        info = DesignInfo.from_array(matrix, column_prefix)
        index = None
        if have_pandas and isinstance(matrix,
                                      (pandas.Series, pandas.DataFrame)):
            index = matrix.index
        if not want_frame:
            return (DesignMatrix(matrix, info), index)
        frame = pandas.DataFrame(
            atleast_2d_column_default(matrix, preserve_pandas=True))
        frame.columns = info.column_names
        frame.design_info = info
        return (frame, index)

    predictors, predictors_index = _standardize(predictors, "x")
    if outcome is None:
        # No left-hand side given: use a zero-column matrix with matching
        # row count.
        outcome = np.zeros((predictors.shape[0], 0), dtype=float)
    outcome, outcome_index = _standardize(outcome, "y")

    assert isinstance(getattr(outcome, "design_info", None), DesignInfo)
    assert isinstance(getattr(predictors, "design_info", None), DesignInfo)
    if outcome.shape[0] != predictors.shape[0]:
        raise PatsyError("shape mismatch: outcome matrix has %s rows, "
                         "predictor matrix has %s rows"
                         % (outcome.shape[0], predictors.shape[0]))
    if (predictors_index is not None
            and outcome_index is not None
            and not predictors_index.equals(outcome_index)):
        raise PatsyError("index mismatch: outcome and "
                         "predictor have incompatible indexes")
    if want_frame:
        # When only one side arrived with a pandas index, copy it onto the
        # other side so both outputs share it.
        if predictors_index is not None and outcome_index is None:
            outcome.index = predictors.index
        elif predictors_index is None and outcome_index is not None:
            predictors.index = outcome.index
    return (outcome, predictors)

224 

def dmatrix(formula_like, data={}, eval_env=0,
            NA_action="drop", return_type="matrix"):
    """Build one design matrix from a formula_like and some data.

    :arg formula_like: Any object a design matrix can be constructed from;
      the accepted kinds are listed below.
    :arg data: A dict-like object in which variables referenced by
      `formula_like` are looked up.
    :arg eval_env: Where to look up variables mentioned in `formula_like`
      that are not found in `data`. Either an :class:`EvalEnvironment`, or
      an integer depth that is handed to :meth:`EvalEnvironment.capture`.
      ``eval_env=0`` uses the context of whoever called :func:`dmatrix`.
      Library code usually wants ``eval_env=1`` so that names resolve in the
      namespace of *its* caller.
    :arg NA_action: How to treat rows containing missing values: ``"drop"``
      them, ``"raise"`` an error, or pass an :class:`NAAction` object for
      full control. :class:`NAAction` also documents which values count as
      'missing' and how to change that.
    :arg return_type: ``"matrix"`` or ``"dataframe"``; explained below.

    `formula_like` may be any one of the following:

    * (The usual choice) A formula string such as ``"x1 + x2"`` (for
      :func:`dmatrix`) or ``"y ~ x1 + x2"`` (for :func:`dmatrices`). See
      :ref:`formulas`.
    * A :class:`ModelDesc`, i.e. a formula expressed as a Python object. See
      :ref:`formulas` and :ref:`expert-model-specification`.
    * A :class:`DesignInfo`.
    * An object with a :meth:`__patsy_get_model_desc__` method. See
      :ref:`expert-model-specification`.
    * A numpy array_like (for :func:`dmatrix`), or a tuple
      (array_like, array_like) (for :func:`dmatrices`). Such input gets
      metadata attached and its representation normalized, and is then
      returned directly; `data` and `eval_env` are ignored. Two kinds of
      input are treated specially:

      * A :class:`DesignMatrix` keeps its :class:`DesignInfo`, so custom
        column names and term information survive even when the rest of
        the patsy machinery is not used.
      * A :class:`pandas.DataFrame` or :class:`pandas.Series` has its (row)
        index checked. When two are given, their indexes must be aligned.
        With ``return_type="dataframe"`` the indexes are carried over to
        the output.

    Whatever the input, the result is always one of:

    * A :class:`DesignMatrix`, if ``return_type="matrix"`` (the default)
    * A :class:`pandas.DataFrame`, if ``return_type="dataframe"``.

    Both carry identical matrix contents, and both expose a
    :class:`DesignInfo` through a ``.design_info`` attribute. The difference
    is that ``return_type="dataframe"`` preserves any pandas indexes found
    on the input (whether in `data` or passed directly as `formula_like`),
    which can be handy for e.g. time-series models.

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    """
    # NOTE(review): `data={}` is a shared mutable default; that is harmless
    # only while nothing downstream mutates it -- confirm before relying on
    # it.
    # reference=1 makes an integer depth relative to our caller; keep this
    # call directly in this function (a helper would presumably shift the
    # captured frame -- confirm before refactoring).
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    outcome, predictors = _do_highlevel_design(formula_like, data, eval_env,
                                               NA_action, return_type)
    # A one-sided design must not produce any outcome columns.
    if outcome.shape[1] != 0:
        raise PatsyError("encountered outcome variables for a model "
                         "that does not expect them")
    return predictors

296 

def dmatrices(formula_like, data={}, eval_env=0,
              NA_action="drop", return_type="matrix"):
    """Build a pair of design matrices from a formula_like and some data.

    Works exactly like :func:`dmatrix`, except that two matrices are
    required and returned rather than one. By convention the first holds the
    "outcome" or "y" data and the second the "predictor" or "x" data.

    All arguments are documented on :func:`dmatrix`.

    :raises PatsyError: If the model specifies no outcome variables.
    """
    # NOTE(review): `data={}` is a shared mutable default; that is harmless
    # only while nothing downstream mutates it -- confirm before relying on
    # it.
    # reference=1 makes an integer depth relative to our caller; keep this
    # call directly in this function (a helper would presumably shift the
    # captured frame -- confirm before refactoring).
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    outcome, predictors = _do_highlevel_design(formula_like, data, eval_env,
                                               NA_action, return_type)
    # A two-sided design needs at least one outcome column.
    if outcome.shape[1] == 0:
        raise PatsyError("model is missing required outcome variables")
    return (outcome, predictors)