Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of Patsy 

2# Copyright (C) 2011-2015 Nathaniel Smith <njs@pobox.com> 

3# See file LICENSE.txt for license information. 

4 

5# This file defines the core design matrix building functions. 

6 

7# These are made available in the patsy.* namespace 

8__all__ = ["design_matrix_builders", "build_design_matrices"] 

9 

10import itertools 

11import six 

12 

13import numpy as np 

14from patsy import PatsyError 

15from patsy.categorical import (guess_categorical, 

16 CategoricalSniffer, 

17 categorical_to_int) 

18from patsy.util import (atleast_2d_column_default, 

19 have_pandas, asarray_or_pandas, 

20 safe_issubdtype) 

21from patsy.design_info import (DesignMatrix, DesignInfo, 

22 FactorInfo, SubtermInfo) 

23from patsy.redundancy import pick_contrasts_for_term 

24from patsy.eval import EvalEnvironment 

25from patsy.contrasts import code_contrast_matrix, Treatment 

26from patsy.compat import OrderedDict 

27from patsy.missing import NAAction 

28 

29if have_pandas: 

30 import pandas 

31 

32class _MockFactor(object): 

33 def __init__(self, name="MOCKMOCK"): 

34 self._name = name 

35 

36 def eval(self, state, env): 

37 return env["mock"] 

38 

39 def name(self): 

40 return self._name 

41 

42def _max_allowed_dim(dim, arr, factor): 

43 if arr.ndim > dim: 

44 msg = ("factor '%s' evaluates to an %s-dimensional array; I only " 

45 "handle arrays with dimension <= %s" 

46 % (factor.name(), arr.ndim, dim)) 

47 raise PatsyError(msg, factor) 

48 

49def test__max_allowed_dim(): 

50 from nose.tools import assert_raises 

51 f = _MockFactor() 

52 _max_allowed_dim(1, np.array(1), f) 

53 _max_allowed_dim(1, np.array([1]), f) 

54 assert_raises(PatsyError, _max_allowed_dim, 1, np.array([[1]]), f) 

55 assert_raises(PatsyError, _max_allowed_dim, 1, np.array([[[1]]]), f) 

56 _max_allowed_dim(2, np.array(1), f) 

57 _max_allowed_dim(2, np.array([1]), f) 

58 _max_allowed_dim(2, np.array([[1]]), f) 

59 assert_raises(PatsyError, _max_allowed_dim, 2, np.array([[[1]]]), f) 

60 

61def _eval_factor(factor_info, data, NA_action): 

62 factor = factor_info.factor 

63 result = factor.eval(factor_info.state, data) 

64 # Returns either a 2d ndarray, or a DataFrame, plus is_NA mask 

65 if factor_info.type == "numerical": 

66 result = atleast_2d_column_default(result, preserve_pandas=True) 

67 _max_allowed_dim(2, result, factor) 

68 if result.shape[1] != factor_info.num_columns: 

69 raise PatsyError("when evaluating factor %s, I got %s columns " 

70 "instead of the %s I was expecting" 

71 % (factor.name(), 

72 factor_info.num_columns, 

73 result.shape[1]), 

74 factor) 

75 if not safe_issubdtype(np.asarray(result).dtype, np.number): 

76 raise PatsyError("when evaluating numeric factor %s, " 

77 "I got non-numeric data of type '%s'" 

78 % (factor.name(), result.dtype), 

79 factor) 

80 return result, NA_action.is_numerical_NA(result) 

81 # returns either a 1d ndarray or a pandas.Series, plus is_NA mask 

82 else: 

83 assert factor_info.type == "categorical" 

84 result = categorical_to_int(result, factor_info.categories, NA_action, 

85 origin=factor_info.factor) 

86 assert result.ndim == 1 

87 return result, np.asarray(result == -1) 

88 

89def test__eval_factor_numerical(): 

90 from nose.tools import assert_raises 

91 naa = NAAction() 

92 f = _MockFactor() 

93 

94 fi1 = FactorInfo(f, "numerical", {}, num_columns=1, categories=None) 

95 

96 assert fi1.factor is f 

97 eval123, is_NA = _eval_factor(fi1, {"mock": [1, 2, 3]}, naa) 

98 assert eval123.shape == (3, 1) 

99 assert np.all(eval123 == [[1], [2], [3]]) 

100 assert is_NA.shape == (3,) 

101 assert np.all(~is_NA) 

102 assert_raises(PatsyError, _eval_factor, fi1, {"mock": [[[1]]]}, naa) 

103 assert_raises(PatsyError, _eval_factor, fi1, {"mock": [[1, 2]]}, naa) 

104 assert_raises(PatsyError, _eval_factor, fi1, {"mock": ["a", "b"]}, naa) 

105 assert_raises(PatsyError, _eval_factor, fi1, {"mock": [True, False]}, naa) 

106 fi2 = FactorInfo(_MockFactor(), "numerical", 

107 {}, num_columns=2, categories=None) 

108 eval123321, is_NA = _eval_factor(fi2, 

109 {"mock": [[1, 3], [2, 2], [3, 1]]}, 

110 naa) 

111 assert eval123321.shape == (3, 2) 

112 assert np.all(eval123321 == [[1, 3], [2, 2], [3, 1]]) 

113 assert is_NA.shape == (3,) 

114 assert np.all(~is_NA) 

115 assert_raises(PatsyError, _eval_factor, fi2, {"mock": [1, 2, 3]}, naa) 

116 assert_raises(PatsyError, _eval_factor, fi2, {"mock": [[1, 2, 3]]}, naa) 

117 

118 ev_nan, is_NA = _eval_factor(fi1, {"mock": [1, 2, np.nan]}, 

119 NAAction(NA_types=["NaN"])) 

120 assert np.array_equal(is_NA, [False, False, True]) 

121 ev_nan, is_NA = _eval_factor(fi1, {"mock": [1, 2, np.nan]}, 

122 NAAction(NA_types=[])) 

123 assert np.array_equal(is_NA, [False, False, False]) 

124 

125 if have_pandas: 

126 eval_ser, _ = _eval_factor(fi1, 

127 {"mock": 

128 pandas.Series([1, 2, 3], 

129 index=[10, 20, 30])}, 

130 naa) 

131 assert isinstance(eval_ser, pandas.DataFrame) 

132 assert np.array_equal(eval_ser, [[1], [2], [3]]) 

133 assert np.array_equal(eval_ser.index, [10, 20, 30]) 

134 eval_df1, _ = _eval_factor(fi1, 

135 {"mock": 

136 pandas.DataFrame([[2], [1], [3]], 

137 index=[20, 10, 30])}, 

138 naa) 

139 assert isinstance(eval_df1, pandas.DataFrame) 

140 assert np.array_equal(eval_df1, [[2], [1], [3]]) 

141 assert np.array_equal(eval_df1.index, [20, 10, 30]) 

142 eval_df2, _ = _eval_factor(fi2, 

143 {"mock": 

144 pandas.DataFrame([[2, 3], [1, 4], [3, -1]], 

145 index=[20, 30, 10])}, 

146 naa) 

147 assert isinstance(eval_df2, pandas.DataFrame) 

148 assert np.array_equal(eval_df2, [[2, 3], [1, 4], [3, -1]]) 

149 assert np.array_equal(eval_df2.index, [20, 30, 10]) 

150 

151 assert_raises(PatsyError, 

152 _eval_factor, fi2, 

153 {"mock": pandas.Series([1, 2, 3], index=[10, 20, 30])}, 

154 naa) 

155 assert_raises(PatsyError, 

156 _eval_factor, fi1, 

157 {"mock": 

158 pandas.DataFrame([[2, 3], [1, 4], [3, -1]], 

159 index=[20, 30, 10])}, 

160 naa) 

161 

162def test__eval_factor_categorical(): 

163 from nose.tools import assert_raises 

164 from patsy.categorical import C 

165 naa = NAAction() 

166 f = _MockFactor() 

167 fi1 = FactorInfo(f, "categorical", 

168 {}, num_columns=None, categories=("a", "b")) 

169 assert fi1.factor is f 

170 cat1, _ = _eval_factor(fi1, {"mock": ["b", "a", "b"]}, naa) 

171 assert cat1.shape == (3,) 

172 assert np.all(cat1 == [1, 0, 1]) 

173 assert_raises(PatsyError, _eval_factor, fi1, {"mock": ["c"]}, naa) 

174 assert_raises(PatsyError, _eval_factor, fi1, {"mock": C(["a", "c"])}, naa) 

175 assert_raises(PatsyError, _eval_factor, fi1, 

176 {"mock": C(["a", "b"], levels=["b", "a"])}, naa) 

177 assert_raises(PatsyError, _eval_factor, fi1, {"mock": [1, 0, 1]}, naa) 

178 bad_cat = np.asarray(["b", "a", "a", "b"]) 

179 bad_cat.resize((2, 2)) 

180 assert_raises(PatsyError, _eval_factor, fi1, {"mock": bad_cat}, naa) 

181 

182 cat1_NA, is_NA = _eval_factor(fi1, {"mock": ["a", None, "b"]}, 

183 NAAction(NA_types=["None"])) 

184 assert np.array_equal(is_NA, [False, True, False]) 

185 assert np.array_equal(cat1_NA, [0, -1, 1]) 

186 assert_raises(PatsyError, _eval_factor, fi1, 

187 {"mock": ["a", None, "b"]}, NAAction(NA_types=[])) 

188 

189 fi2 = FactorInfo(_MockFactor(), "categorical", {}, 

190 num_columns=None, categories=[False, True]) 

191 cat2, _ = _eval_factor(fi2, {"mock": [True, False, False, True]}, naa) 

192 assert cat2.shape == (4,) 

193 assert np.all(cat2 == [1, 0, 0, 1]) 

194 

195 if have_pandas: 

196 s = pandas.Series(["b", "a"], index=[10, 20]) 

197 cat_s, _ = _eval_factor(fi1, {"mock": s}, naa) 

198 assert isinstance(cat_s, pandas.Series) 

199 assert np.array_equal(cat_s, [1, 0]) 

200 assert np.array_equal(cat_s.index, [10, 20]) 

201 sbool = pandas.Series([True, False], index=[11, 21]) 

202 cat_sbool, _ = _eval_factor(fi2, {"mock": sbool}, naa) 

203 assert isinstance(cat_sbool, pandas.Series) 

204 assert np.array_equal(cat_sbool, [1, 0]) 

205 assert np.array_equal(cat_sbool.index, [11, 21]) 

206 

207def _column_combinations(columns_per_factor): 

208 # For consistency with R, the left-most item iterates fastest: 

209 iterators = [range(n) for n in reversed(columns_per_factor)] 

210 for reversed_combo in itertools.product(*iterators): 

211 yield reversed_combo[::-1] 

212 

213def test__column_combinations(): 

214 assert list(_column_combinations([2, 3])) == [(0, 0), 

215 (1, 0), 

216 (0, 1), 

217 (1, 1), 

218 (0, 2), 

219 (1, 2)] 

220 assert list(_column_combinations([3])) == [(0,), (1,), (2,)] 

221 assert list(_column_combinations([])) == [()] 

222 

223def _subterm_column_combinations(factor_infos, subterm): 

224 columns_per_factor = [] 

225 for factor in subterm.factors: 

226 if factor in subterm.contrast_matrices: 

227 columns = subterm.contrast_matrices[factor].matrix.shape[1] 

228 else: 

229 columns = factor_infos[factor].num_columns 

230 columns_per_factor.append(columns) 

231 return _column_combinations(columns_per_factor) 

232 

233def _subterm_column_names_iter(factor_infos, subterm): 

234 total = 0 

235 for i, column_idxs in enumerate( 

236 _subterm_column_combinations(factor_infos, subterm)): 

237 name_pieces = [] 

238 for factor, column_idx in zip(subterm.factors, column_idxs): 

239 fi = factor_infos[factor] 

240 if fi.type == "numerical": 

241 if fi.num_columns > 1: 

242 name_pieces.append("%s[%s]" 

243 % (factor.name(), column_idx)) 

244 else: 

245 assert column_idx == 0 

246 name_pieces.append(factor.name()) 

247 else: 

248 assert fi.type == "categorical" 

249 contrast = subterm.contrast_matrices[factor] 

250 suffix = contrast.column_suffixes[column_idx] 

251 name_pieces.append("%s%s" % (factor.name(), suffix)) 

252 if not name_pieces: 

253 yield "Intercept" 

254 else: 

255 yield ":".join(name_pieces) 

256 total += 1 

257 assert total == subterm.num_columns 

258 

259def _build_subterm(subterm, factor_infos, factor_values, out): 

260 assert subterm.num_columns == out.shape[1] 

261 out[...] = 1 

262 for i, column_idxs in enumerate( 

263 _subterm_column_combinations(factor_infos, subterm)): 

264 for factor, column_idx in zip(subterm.factors, column_idxs): 

265 if factor_infos[factor].type == "categorical": 

266 contrast = subterm.contrast_matrices[factor] 

267 if np.any(factor_values[factor] < 0): 

268 raise PatsyError("can't build a design matrix " 

269 "containing missing values", factor) 

270 out[:, i] *= contrast.matrix[factor_values[factor], 

271 column_idx] 

272 else: 

273 assert factor_infos[factor].type == "numerical" 

274 assert (factor_values[factor].shape[1] 

275 == factor_infos[factor].num_columns) 

276 out[:, i] *= factor_values[factor][:, column_idx] 

277 

278def test__subterm_column_names_iter_and__build_subterm(): 

279 from nose.tools import assert_raises 

280 from patsy.contrasts import ContrastMatrix 

281 from patsy.categorical import C 

282 f1 = _MockFactor("f1") 

283 f2 = _MockFactor("f2") 

284 f3 = _MockFactor("f3") 

285 contrast = ContrastMatrix(np.array([[0, 0.5], 

286 [3, 0]]), 

287 ["[c1]", "[c2]"]) 

288 

289 factor_infos1 = {f1: FactorInfo(f1, "numerical", {}, 

290 num_columns=1, categories=None), 

291 f2: FactorInfo(f2, "categorical", {}, 

292 num_columns=None, categories=["a", "b"]), 

293 f3: FactorInfo(f3, "numerical", {}, 

294 num_columns=1, categories=None), 

295 } 

296 contrast_matrices = {f2: contrast} 

297 subterm1 = SubtermInfo([f1, f2, f3], contrast_matrices, 2) 

298 assert (list(_subterm_column_names_iter(factor_infos1, subterm1)) 

299 == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"]) 

300 

301 mat = np.empty((3, 2)) 

302 _build_subterm(subterm1, factor_infos1, 

303 {f1: atleast_2d_column_default([1, 2, 3]), 

304 f2: np.asarray([0, 0, 1]), 

305 f3: atleast_2d_column_default([7.5, 2, -12])}, 

306 mat) 

307 assert np.allclose(mat, [[0, 0.5 * 1 * 7.5], 

308 [0, 0.5 * 2 * 2], 

309 [3 * 3 * -12, 0]]) 

310 # Check that missing categorical values blow up 

311 assert_raises(PatsyError, _build_subterm, subterm1, factor_infos1, 

312 {f1: atleast_2d_column_default([1, 2, 3]), 

313 f2: np.asarray([0, -1, 1]), 

314 f3: atleast_2d_column_default([7.5, 2, -12])}, 

315 mat) 

316 

317 factor_infos2 = dict(factor_infos1) 

318 factor_infos2[f1] = FactorInfo(f1, "numerical", {}, 

319 num_columns=2, categories=None) 

320 subterm2 = SubtermInfo([f1, f2, f3], contrast_matrices, 4) 

321 assert (list(_subterm_column_names_iter(factor_infos2, subterm2)) 

322 == ["f1[0]:f2[c1]:f3", 

323 "f1[1]:f2[c1]:f3", 

324 "f1[0]:f2[c2]:f3", 

325 "f1[1]:f2[c2]:f3"]) 

326 

327 mat2 = np.empty((3, 4)) 

328 _build_subterm(subterm2, factor_infos2, 

329 {f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]), 

330 f2: np.asarray([0, 0, 1]), 

331 f3: atleast_2d_column_default([7.5, 2, -12])}, 

332 mat2) 

333 assert np.allclose(mat2, [[0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5], 

334 [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2], 

335 [3 * 5 * -12, 3 * 6 * -12, 0, 0]]) 

336 

337 

338 subterm_int = SubtermInfo([], {}, 1) 

339 assert list(_subterm_column_names_iter({}, subterm_int)) == ["Intercept"] 

340 

341 mat3 = np.empty((3, 1)) 

342 _build_subterm(subterm_int, {}, 

343 {f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]}, 

344 mat3) 

345 assert np.allclose(mat3, 1) 

346 

347def _factors_memorize(factors, data_iter_maker, eval_env): 

348 # First, start off the memorization process by setting up each factor's 

349 # state and finding out how many passes it will need: 

350 factor_states = {} 

351 passes_needed = {} 

352 for factor in factors: 

353 state = {} 

354 which_pass = factor.memorize_passes_needed(state, eval_env) 

355 factor_states[factor] = state 

356 passes_needed[factor] = which_pass 

357 # Now, cycle through the data until all the factors have finished 

358 # memorizing everything: 

359 memorize_needed = set() 

360 for factor, passes in six.iteritems(passes_needed): 

361 if passes > 0: 

362 memorize_needed.add(factor) 

363 which_pass = 0 

364 while memorize_needed: 

365 for data in data_iter_maker(): 

366 for factor in memorize_needed: 

367 state = factor_states[factor] 

368 factor.memorize_chunk(state, which_pass, data) 

369 for factor in list(memorize_needed): 

370 factor.memorize_finish(factor_states[factor], which_pass) 

371 if which_pass == passes_needed[factor] - 1: 

372 memorize_needed.remove(factor) 

373 which_pass += 1 

374 return factor_states 

375 

376def test__factors_memorize(): 

377 class MockFactor(object): 

378 def __init__(self, requested_passes, token): 

379 self._requested_passes = requested_passes 

380 self._token = token 

381 self._chunk_in_pass = 0 

382 self._seen_passes = 0 

383 

384 def memorize_passes_needed(self, state, eval_env): 

385 state["calls"] = [] 

386 state["token"] = self._token 

387 return self._requested_passes 

388 

389 def memorize_chunk(self, state, which_pass, data): 

390 state["calls"].append(("memorize_chunk", which_pass)) 

391 assert data["chunk"] == self._chunk_in_pass 

392 self._chunk_in_pass += 1 

393 

394 def memorize_finish(self, state, which_pass): 

395 state["calls"].append(("memorize_finish", which_pass)) 

396 self._chunk_in_pass = 0 

397 

398 class Data(object): 

399 CHUNKS = 3 

400 def __init__(self): 

401 self.calls = 0 

402 self.data = [{"chunk": i} for i in range(self.CHUNKS)] 

403 def __call__(self): 

404 self.calls += 1 

405 return iter(self.data) 

406 data = Data() 

407 f0 = MockFactor(0, "f0") 

408 f1 = MockFactor(1, "f1") 

409 f2a = MockFactor(2, "f2a") 

410 f2b = MockFactor(2, "f2b") 

411 factor_states = _factors_memorize(set([f0, f1, f2a, f2b]), data, {}) 

412 assert data.calls == 2 

413 mem_chunks0 = [("memorize_chunk", 0)] * data.CHUNKS 

414 mem_chunks1 = [("memorize_chunk", 1)] * data.CHUNKS 

415 expected = { 

416 f0: { 

417 "calls": [], 

418 "token": "f0", 

419 }, 

420 f1: { 

421 "calls": mem_chunks0 + [("memorize_finish", 0)], 

422 "token": "f1", 

423 }, 

424 f2a: { 

425 "calls": mem_chunks0 + [("memorize_finish", 0)] 

426 + mem_chunks1 + [("memorize_finish", 1)], 

427 "token": "f2a", 

428 }, 

429 f2b: { 

430 "calls": mem_chunks0 + [("memorize_finish", 0)] 

431 + mem_chunks1 + [("memorize_finish", 1)], 

432 "token": "f2b", 

433 }, 

434 } 

435 assert factor_states == expected 

436 

437def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action): 

438 num_column_counts = {} 

439 cat_sniffers = {} 

440 examine_needed = set(factors) 

441 for data in data_iter_maker(): 

442 for factor in list(examine_needed): 

443 value = factor.eval(factor_states[factor], data) 

444 if factor in cat_sniffers or guess_categorical(value): 

445 if factor not in cat_sniffers: 

446 cat_sniffers[factor] = CategoricalSniffer(NA_action, 

447 factor.origin) 

448 done = cat_sniffers[factor].sniff(value) 

449 if done: 

450 examine_needed.remove(factor) 

451 else: 

452 # Numeric 

453 value = atleast_2d_column_default(value) 

454 _max_allowed_dim(2, value, factor) 

455 column_count = value.shape[1] 

456 num_column_counts[factor] = column_count 

457 examine_needed.remove(factor) 

458 if not examine_needed: 

459 break 

460 # Pull out the levels 

461 cat_levels_contrasts = {} 

462 for factor, sniffer in six.iteritems(cat_sniffers): 

463 cat_levels_contrasts[factor] = sniffer.levels_contrast() 

464 return (num_column_counts, cat_levels_contrasts) 

465 

466def test__examine_factor_types(): 

467 from patsy.categorical import C 

468 class MockFactor(object): 

469 def __init__(self): 

470 # You should check this using 'is', not '==' 

471 from patsy.origin import Origin 

472 self.origin = Origin("MOCK", 1, 2) 

473 

474 def eval(self, state, data): 

475 return state[data] 

476 

477 def name(self): 

478 return "MOCK MOCK" 

479 

480 # This hacky class can only be iterated over once, but it keeps track of 

481 # how far it got. 

482 class DataIterMaker(object): 

483 def __init__(self): 

484 self.i = -1 

485 

486 def __call__(self): 

487 return self 

488 

489 def __iter__(self): 

490 return self 

491 

492 def next(self): 

493 self.i += 1 

494 if self.i > 1: 

495 raise StopIteration 

496 return self.i 

497 __next__ = next 

498 

499 num_1dim = MockFactor() 

500 num_1col = MockFactor() 

501 num_4col = MockFactor() 

502 categ_1col = MockFactor() 

503 bool_1col = MockFactor() 

504 string_1col = MockFactor() 

505 object_1col = MockFactor() 

506 object_levels = (object(), object(), object()) 

507 factor_states = { 

508 num_1dim: ([1, 2, 3], [4, 5, 6]), 

509 num_1col: ([[1], [2], [3]], [[4], [5], [6]]), 

510 num_4col: (np.zeros((3, 4)), np.ones((3, 4))), 

511 categ_1col: (C(["a", "b", "c"], levels=("a", "b", "c"), 

512 contrast="MOCK CONTRAST"), 

513 C(["c", "b", "a"], levels=("a", "b", "c"), 

514 contrast="MOCK CONTRAST")), 

515 bool_1col: ([True, True, False], [False, True, True]), 

516 # It has to read through all the data to see all the possible levels: 

517 string_1col: (["a", "a", "a"], ["c", "b", "a"]), 

518 object_1col: ([object_levels[0]] * 3, object_levels), 

519 } 

520 

521 it = DataIterMaker() 

522 (num_column_counts, cat_levels_contrasts, 

523 ) = _examine_factor_types(factor_states.keys(), factor_states, it, 

524 NAAction()) 

525 assert it.i == 2 

526 iterations = 0 

527 assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4} 

528 assert cat_levels_contrasts == { 

529 categ_1col: (("a", "b", "c"), "MOCK CONTRAST"), 

530 bool_1col: ((False, True), None), 

531 string_1col: (("a", "b", "c"), None), 

532 object_1col: (tuple(sorted(object_levels, key=id)), None), 

533 } 

534 

535 # Check that it doesn't read through all the data if that's not necessary: 

536 it = DataIterMaker() 

537 no_read_necessary = [num_1dim, num_1col, num_4col, categ_1col, bool_1col] 

538 (num_column_counts, cat_levels_contrasts, 

539 ) = _examine_factor_types(no_read_necessary, factor_states, it, 

540 NAAction()) 

541 assert it.i == 0 

542 assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4} 

543 assert cat_levels_contrasts == { 

544 categ_1col: (("a", "b", "c"), "MOCK CONTRAST"), 

545 bool_1col: ((False, True), None), 

546 } 

547 

548 # Illegal inputs: 

549 bool_3col = MockFactor() 

550 num_3dim = MockFactor() 

551 # no such thing as a multi-dimensional Categorical 

552 # categ_3dim = MockFactor() 

553 string_3col = MockFactor() 

554 object_3col = MockFactor() 

555 illegal_factor_states = { 

556 num_3dim: (np.zeros((3, 3, 3)), np.ones((3, 3, 3))), 

557 string_3col: ([["a", "b", "c"]], [["b", "c", "a"]]), 

558 object_3col: ([[[object()]]], [[[object()]]]), 

559 } 

560 from nose.tools import assert_raises 

561 for illegal_factor in illegal_factor_states: 

562 it = DataIterMaker() 

563 try: 

564 _examine_factor_types([illegal_factor], illegal_factor_states, it, 

565 NAAction()) 

566 except PatsyError as e: 

567 assert e.origin is illegal_factor.origin 

568 else: 

569 assert False 

570 

571def _make_subterm_infos(terms, 

572 num_column_counts, 

573 cat_levels_contrasts): 

574 # Sort each term into a bucket based on the set of numeric factors it 

575 # contains: 

576 term_buckets = OrderedDict() 

577 bucket_ordering = [] 

578 for term in terms: 

579 num_factors = [] 

580 for factor in term.factors: 

581 if factor in num_column_counts: 

582 num_factors.append(factor) 

583 bucket = frozenset(num_factors) 

584 if bucket not in term_buckets: 

585 bucket_ordering.append(bucket) 

586 term_buckets.setdefault(bucket, []).append(term) 

587 # Special rule: if there is a no-numerics bucket, then it always comes 

588 # first: 

589 if frozenset() in term_buckets: 

590 bucket_ordering.remove(frozenset()) 

591 bucket_ordering.insert(0, frozenset()) 

592 term_to_subterm_infos = OrderedDict() 

593 new_term_order = [] 

594 # Then within each bucket, work out which sort of contrasts we want to use 

595 # for each term to avoid redundancy 

596 for bucket in bucket_ordering: 

597 bucket_terms = term_buckets[bucket] 

598 # Sort by degree of interaction 

599 bucket_terms.sort(key=lambda t: len(t.factors)) 

600 new_term_order += bucket_terms 

601 used_subterms = set() 

602 for term in bucket_terms: 

603 subterm_infos = [] 

604 factor_codings = pick_contrasts_for_term(term, 

605 num_column_counts, 

606 used_subterms) 

607 # Construct one SubtermInfo for each subterm 

608 for factor_coding in factor_codings: 

609 subterm_factors = [] 

610 contrast_matrices = {} 

611 subterm_columns = 1 

612 # In order to preserve factor ordering information, the 

613 # coding_for_term just returns dicts, and we refer to 

614 # the original factors to figure out which are included in 

615 # each subterm, and in what order 

616 for factor in term.factors: 

617 # Numeric factors are included in every subterm 

618 if factor in num_column_counts: 

619 subterm_factors.append(factor) 

620 subterm_columns *= num_column_counts[factor] 

621 elif factor in factor_coding: 

622 subterm_factors.append(factor) 

623 levels, contrast = cat_levels_contrasts[factor] 

624 # This is where the default coding is set to 

625 # Treatment: 

626 coded = code_contrast_matrix(factor_coding[factor], 

627 levels, contrast, 

628 default=Treatment) 

629 contrast_matrices[factor] = coded 

630 subterm_columns *= coded.matrix.shape[1] 

631 subterm_infos.append(SubtermInfo(subterm_factors, 

632 contrast_matrices, 

633 subterm_columns)) 

634 term_to_subterm_infos[term] = subterm_infos 

635 assert new_term_order == list(term_to_subterm_infos) 

636 return term_to_subterm_infos 

637 

638def design_matrix_builders(termlists, data_iter_maker, eval_env, 

639 NA_action="drop"): 

640 """Construct several :class:`DesignInfo` objects from termlists. 

641 

642 This is one of Patsy's fundamental functions. This function and 

643 :func:`build_design_matrices` together form the API to the core formula 

644 interpretation machinery. 

645 

646 :arg termlists: A list of termlists, where each termlist is a list of 

647 :class:`Term` objects which together specify a design matrix. 

648 :arg data_iter_maker: A zero-argument callable which returns an iterator 

649 over dict-like data objects. This must be a callable rather than a 

650 simple iterator because sufficiently complex formulas may require 

651 multiple passes over the data (e.g. if there are nested stateful 

652 transforms). 

653 :arg eval_env: Either a :class:`EvalEnvironment` which will be used to 

654 look up any variables referenced in `termlists` that cannot be 

655 found in `data_iter_maker`, or else a depth represented as an 

656 integer which will be passed to :meth:`EvalEnvironment.capture`. 

657 ``eval_env=0`` means to use the context of the function calling 

658 :func:`design_matrix_builders` for lookups. If calling this function 

659 from a library, you probably want ``eval_env=1``, which means that 

660 variables should be resolved in *your* caller's namespace. 

661 :arg NA_action: An :class:`NAAction` object or string, used to determine 

662 what values count as 'missing' for purposes of determining the levels of 

663 categorical factors. 

664 :returns: A list of :class:`DesignInfo` objects, one for each 

665 termlist passed in. 

666 

667 This function performs zero or more iterations over the data in order to 

668 sniff out any necessary information about factor types, set up stateful 

669 transforms, pick column names, etc. 

670 

671 See :ref:`formulas` for details. 

672 

673 .. versionadded:: 0.2.0 

674 The ``NA_action`` argument. 

675 .. versionadded:: 0.4.0 

676 The ``eval_env`` argument. 

677 """ 

678 # People upgrading from versions prior to 0.4.0 could potentially have 

679 # passed NA_action as the 3rd positional argument. Fortunately 

680 # EvalEnvironment.capture only accepts int and EvalEnvironment objects, 

681 # and we improved its error messages to make this clear. 

682 eval_env = EvalEnvironment.capture(eval_env, reference=1) 

683 if isinstance(NA_action, str): 

684 NA_action = NAAction(NA_action) 

685 all_factors = set() 

686 for termlist in termlists: 

687 for term in termlist: 

688 all_factors.update(term.factors) 

689 factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env) 

690 # Now all the factors have working eval methods, so we can evaluate them 

691 # on some data to find out what type of data they return. 

692 (num_column_counts, 

693 cat_levels_contrasts) = _examine_factor_types(all_factors, 

694 factor_states, 

695 data_iter_maker, 

696 NA_action) 

697 # Now we need the factor infos, which encapsulate the knowledge of 

698 # how to turn any given factor into a chunk of data: 

699 factor_infos = {} 

700 for factor in all_factors: 

701 if factor in num_column_counts: 

702 fi = FactorInfo(factor, 

703 "numerical", 

704 factor_states[factor], 

705 num_columns=num_column_counts[factor], 

706 categories=None) 

707 else: 

708 assert factor in cat_levels_contrasts 

709 categories = cat_levels_contrasts[factor][0] 

710 fi = FactorInfo(factor, 

711 "categorical", 

712 factor_states[factor], 

713 num_columns=None, 

714 categories=categories) 

715 factor_infos[factor] = fi 

716 # And now we can construct the DesignInfo for each termlist: 

717 design_infos = [] 

718 for termlist in termlists: 

719 term_to_subterm_infos = _make_subterm_infos(termlist, 

720 num_column_counts, 

721 cat_levels_contrasts) 

722 assert isinstance(term_to_subterm_infos, OrderedDict) 

723 assert frozenset(term_to_subterm_infos) == frozenset(termlist) 

724 this_design_factor_infos = {} 

725 for term in termlist: 

726 for factor in term.factors: 

727 this_design_factor_infos[factor] = factor_infos[factor] 

728 column_names = [] 

729 for subterms in six.itervalues(term_to_subterm_infos): 

730 for subterm in subterms: 

731 for column_name in _subterm_column_names_iter( 

732 factor_infos, subterm): 

733 column_names.append(column_name) 

734 design_infos.append(DesignInfo(column_names, 

735 factor_infos=this_design_factor_infos, 

736 term_codings=term_to_subterm_infos)) 

737 return design_infos 

738 

739def _build_design_matrix(design_info, factor_info_to_values, dtype): 

740 factor_to_values = {} 

741 need_reshape = False 

742 num_rows = None 

743 for factor_info, value in six.iteritems(factor_info_to_values): 

744 # It's possible that the same factor appears in multiple different 

745 # FactorInfo objects (e.g. if someone is simultaneously building two 

746 # DesignInfo objects that started out as part of different 

747 # formulas). Skip any factor_info that is not our expected 

748 # factor_info. 

749 if design_info.factor_infos.get(factor_info.factor) is not factor_info: 

750 continue 

751 factor_to_values[factor_info.factor] = value 

752 if num_rows is not None: 

753 assert num_rows == value.shape[0] 

754 else: 

755 num_rows = value.shape[0] 

756 if num_rows is None: 

757 # We have no dependence on the data -- e.g. an empty termlist, or 

758 # only an intercept term. 

759 num_rows = 1 

760 need_reshape = True 

761 shape = (num_rows, len(design_info.column_names)) 

762 m = DesignMatrix(np.empty(shape, dtype=dtype), design_info) 

763 start_column = 0 

764 for term, subterms in six.iteritems(design_info.term_codings): 

765 for subterm in subterms: 

766 end_column = start_column + subterm.num_columns 

767 m_slice = m[:, start_column:end_column] 

768 _build_subterm(subterm, design_info.factor_infos, 

769 factor_to_values, m_slice) 

770 start_column = end_column 

771 assert start_column == m.shape[1] 

772 return need_reshape, m 

773 

774class _CheckMatch(object): 

775 def __init__(self, name, eq_fn): 

776 self._name = name 

777 self._eq_fn = eq_fn 

778 self.value = None 

779 self._value_desc = None 

780 self._value_origin = None 

781 

782 def check(self, seen_value, desc, origin): 

783 if self.value is None: 

784 self.value = seen_value 

785 self._value_desc = desc 

786 self._value_origin = origin 

787 else: 

788 if not self._eq_fn(self.value, seen_value): 

789 msg = ("%s mismatch between %s and %s" 

790 % (self._name, self._value_desc, desc)) 

791 if isinstance(self.value, int): 

792 msg += " (%r versus %r)" % (self.value, seen_value) 

793 # XX FIXME: this is a case where having discontiguous Origins 

794 # would be useful... 

795 raise PatsyError(msg, origin) 

796 

797def build_design_matrices(design_infos, data, 

798 NA_action="drop", 

799 return_type="matrix", 

800 dtype=np.dtype(float)): 

801 """Construct several design matrices from :class:`DesignMatrixBuilder` 

802 objects. 

803 

804 This is one of Patsy's fundamental functions. This function and 

805 :func:`design_matrix_builders` together form the API to the core formula 

806 interpretation machinery. 

807 

808 :arg design_infos: A list of :class:`DesignInfo` objects describing the 

809 design matrices to be built. 

810 :arg data: A dict-like object which will be used to look up data. 

811 :arg NA_action: What to do with rows that contain missing values. You can 

812 ``"drop"`` them, ``"raise"`` an error, or for customization, pass an 

813 :class:`NAAction` object. See :class:`NAAction` for details on what 

814 values count as 'missing' (and how to alter this). 

815 :arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below. 

816 :arg dtype: The dtype of the returned matrix. Useful if you want to use 

817 single-precision or extended-precision. 

818 

819 This function returns either a list of :class:`DesignMatrix` objects (for 

820 ``return_type="matrix"``) or a list of :class:`pandas.DataFrame` objects 

821 (for ``return_type="dataframe"``). In both cases, all returned design 

822 matrices will have ``.design_info`` attributes containing the appropriate 

823 :class:`DesignInfo` objects. 

824 

825 Note that unlike :func:`design_matrix_builders`, this function takes only 

826 a simple data argument, not any kind of iterator. That's because this 

827 function doesn't need a global view of the data -- everything that depends 

828 on the whole data set is already encapsulated in the ``design_infos``. If 

829 you are incrementally processing a large data set, simply call this 

830 function for each chunk. 

831 

832 Index handling: This function always checks for indexes in the following 

833 places: 

834 

835 * If ``data`` is a :class:`pandas.DataFrame`, its ``.index`` attribute. 

836 * If any factors evaluate to a :class:`pandas.Series` or 

837 :class:`pandas.DataFrame`, then their ``.index`` attributes. 

838 

839 If multiple indexes are found, they must be identical (same values in the 

840 same order). If no indexes are found, then a default index is generated 

841 using ``np.arange(num_rows)``. One way or another, we end up with a single 

842 index for all the data. If ``return_type="dataframe"``, then this index is 

843 used as the index of the returned DataFrame objects. Examining this index 

844 makes it possible to determine which rows were removed due to NAs. 

845 

846 Determining the number of rows in design matrices: This is not as obvious 

847 as it might seem, because it's possible to have a formula like "~ 1" that 

848 doesn't depend on the data (it has no factors). For this formula, it's 

849 obvious what every row in the design matrix should look like (just the 

850 value ``1``); but, how many rows like this should there be? To determine 

851 the number of rows in a design matrix, this function always checks in the 

852 following places: 

853 

854 * If ``data`` is a :class:`pandas.DataFrame`, then its number of rows. 

855 * The number of entries in any factors present in any of the design 

856 * matrices being built. 

857 

858 All these values much match. In particular, if this function is called to 

859 generate multiple design matrices at once, then they must all have the 

860 same number of rows. 

861 

862 .. versionadded:: 0.2.0 

863 The ``NA_action`` argument. 

864 

865 """ 

866 if isinstance(NA_action, str): 

867 NA_action = NAAction(NA_action) 

868 if return_type == "dataframe" and not have_pandas: 

869 raise PatsyError("pandas.DataFrame was requested, but pandas " 

870 "is not installed") 

871 if return_type not in ("matrix", "dataframe"): 

872 raise PatsyError("unrecognized output type %r, should be " 

873 "'matrix' or 'dataframe'" % (return_type,)) 

874 # Evaluate factors 

875 factor_info_to_values = {} 

876 factor_info_to_isNAs = {} 

877 rows_checker = _CheckMatch("Number of rows", lambda a, b: a == b) 

878 index_checker = _CheckMatch("Index", lambda a, b: a.equals(b)) 

879 if have_pandas and isinstance(data, pandas.DataFrame): 

880 index_checker.check(data.index, "data.index", None) 

881 rows_checker.check(data.shape[0], "data argument", None) 

882 for design_info in design_infos: 

883 # We look at evaluators rather than factors here, because it might 

884 # happen that we have the same factor twice, but with different 

885 # memorized state. 

886 for factor_info in six.itervalues(design_info.factor_infos): 

887 if factor_info not in factor_info_to_values: 

888 value, is_NA = _eval_factor(factor_info, data, NA_action) 

889 factor_info_to_isNAs[factor_info] = is_NA 

890 # value may now be a Series, DataFrame, or ndarray 

891 name = factor_info.factor.name() 

892 origin = factor_info.factor.origin 

893 rows_checker.check(value.shape[0], name, origin) 

894 if (have_pandas 

895 and isinstance(value, (pandas.Series, pandas.DataFrame))): 

896 index_checker.check(value.index, name, origin) 

897 # Strategy: we work with raw ndarrays for doing the actual 

898 # combining; DesignMatrixBuilder objects never sees pandas 

899 # objects. Then at the end, if a DataFrame was requested, we 

900 # convert. So every entry in this dict is either a 2-d array 

901 # of floats, or a 1-d array of integers (representing 

902 # categories). 

903 value = np.asarray(value) 

904 factor_info_to_values[factor_info] = value 

905 # Handle NAs 

906 values = list(factor_info_to_values.values()) 

907 is_NAs = list(factor_info_to_isNAs.values()) 

908 origins = [factor_info.factor.origin 

909 for factor_info in factor_info_to_values] 

910 pandas_index = index_checker.value 

911 num_rows = rows_checker.value 

912 # num_rows is None iff evaluator_to_values (and associated sets like 

913 # 'values') are empty, i.e., we have no actual evaluators involved 

914 # (formulas like "~ 1"). 

915 if return_type == "dataframe" and num_rows is not None: 

916 if pandas_index is None: 

917 pandas_index = np.arange(num_rows) 

918 values.append(pandas_index) 

919 is_NAs.append(np.zeros(len(pandas_index), dtype=bool)) 

920 origins.append(None) 

921 new_values = NA_action.handle_NA(values, is_NAs, origins) 

922 # NA_action may have changed the number of rows. 

923 if new_values: 

924 num_rows = new_values[0].shape[0] 

925 if return_type == "dataframe" and num_rows is not None: 

926 pandas_index = new_values.pop() 

927 factor_info_to_values = dict(zip(factor_info_to_values, new_values)) 

928 # Build factor values into matrices 

929 results = [] 

930 for design_info in design_infos: 

931 results.append(_build_design_matrix(design_info, 

932 factor_info_to_values, 

933 dtype)) 

934 matrices = [] 

935 for need_reshape, matrix in results: 

936 if need_reshape: 

937 # There is no data-dependence, at all -- a formula like "1 ~ 1". 

938 # In this case the builder just returns a single-row matrix, and 

939 # we have to broadcast it vertically to the appropriate size. If 

940 # we can figure out what that is... 

941 assert matrix.shape[0] == 1 

942 if num_rows is not None: 

943 matrix = DesignMatrix(np.repeat(matrix, num_rows, axis=0), 

944 matrix.design_info) 

945 else: 

946 raise PatsyError( 

947 "No design matrix has any non-trivial factors, " 

948 "the data object is not a DataFrame. " 

949 "I can't tell how many rows the design matrix should " 

950 "have!" 

951 ) 

952 matrices.append(matrix) 

953 if return_type == "dataframe": 

954 assert have_pandas 

955 for i, matrix in enumerate(matrices): 

956 di = matrix.design_info 

957 matrices[i] = pandas.DataFrame(matrix, 

958 columns=di.column_names, 

959 index=pandas_index) 

960 matrices[i].design_info = di 

961 return matrices 

962 

963# It should be possible to do just the factors -> factor_infos stuff 

964# alone, since that, well, makes logical sense to do.