# This file is part of Patsy
# Copyright (C) 2011-2015 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# This file defines the core design matrix building functions.

# These are made available in the patsy.* namespace
__all__ = ["design_matrix_builders", "build_design_matrices"]

import itertools
import six

import numpy as np
from patsy import PatsyError
from patsy.categorical import (guess_categorical,
                               CategoricalSniffer,
                               categorical_to_int)
from patsy.util import (atleast_2d_column_default,
                        have_pandas, asarray_or_pandas,
                        safe_issubdtype)
from patsy.design_info import (DesignMatrix, DesignInfo,
                               FactorInfo, SubtermInfo)
from patsy.redundancy import pick_contrasts_for_term
from patsy.eval import EvalEnvironment
from patsy.contrasts import code_contrast_matrix, Treatment
from patsy.compat import OrderedDict
from patsy.missing import NAAction

if have_pandas:
    import pandas

class _MockFactor(object):
    def __init__(self, name="MOCKMOCK"):
        self._name = name

    def eval(self, state, env):
        return env["mock"]

    def name(self):
        return self._name

def _max_allowed_dim(dim, arr, factor):
    if arr.ndim > dim:
        msg = ("factor '%s' evaluates to an %s-dimensional array; I only "
               "handle arrays with dimension <= %s"
               % (factor.name(), arr.ndim, dim))
        raise PatsyError(msg, factor)

def test__max_allowed_dim():
    from nose.tools import assert_raises
    f = _MockFactor()
    _max_allowed_dim(1, np.array(1), f)
    _max_allowed_dim(1, np.array([1]), f)
    assert_raises(PatsyError, _max_allowed_dim, 1, np.array([[1]]), f)
    assert_raises(PatsyError, _max_allowed_dim, 1, np.array([[[1]]]), f)
    _max_allowed_dim(2, np.array(1), f)
    _max_allowed_dim(2, np.array([1]), f)
    _max_allowed_dim(2, np.array([[1]]), f)
    assert_raises(PatsyError, _max_allowed_dim, 2, np.array([[[1]]]), f)

def _eval_factor(factor_info, data, NA_action):
    factor = factor_info.factor
    result = factor.eval(factor_info.state, data)
    # Returns either a 2d ndarray or a DataFrame, plus an is_NA mask
    if factor_info.type == "numerical":
        result = atleast_2d_column_default(result, preserve_pandas=True)
        _max_allowed_dim(2, result, factor)
        if result.shape[1] != factor_info.num_columns:
            raise PatsyError("when evaluating factor %s, I got %s columns "
                             "instead of the %s I was expecting"
                             % (factor.name(),
                                result.shape[1],
                                factor_info.num_columns),
                             factor)
        if not safe_issubdtype(np.asarray(result).dtype, np.number):
            raise PatsyError("when evaluating numeric factor %s, "
                             "I got non-numeric data of type '%s'"
                             % (factor.name(), result.dtype),
                             factor)
        return result, NA_action.is_numerical_NA(result)
    # Returns either a 1d ndarray or a pandas.Series, plus an is_NA mask
    else:
        assert factor_info.type == "categorical"
        result = categorical_to_int(result, factor_info.categories, NA_action,
                                    origin=factor_info.factor)
        assert result.ndim == 1
        return result, np.asarray(result == -1)

def test__eval_factor_numerical():
    from nose.tools import assert_raises
    naa = NAAction()
    f = _MockFactor()

    fi1 = FactorInfo(f, "numerical", {}, num_columns=1, categories=None)

    assert fi1.factor is f
    eval123, is_NA = _eval_factor(fi1, {"mock": [1, 2, 3]}, naa)
    assert eval123.shape == (3, 1)
    assert np.all(eval123 == [[1], [2], [3]])
    assert is_NA.shape == (3,)
    assert np.all(~is_NA)
    assert_raises(PatsyError, _eval_factor, fi1, {"mock": [[[1]]]}, naa)
    assert_raises(PatsyError, _eval_factor, fi1, {"mock": [[1, 2]]}, naa)
    assert_raises(PatsyError, _eval_factor, fi1, {"mock": ["a", "b"]}, naa)
    assert_raises(PatsyError, _eval_factor, fi1, {"mock": [True, False]}, naa)
    fi2 = FactorInfo(_MockFactor(), "numerical",
                     {}, num_columns=2, categories=None)
    eval123321, is_NA = _eval_factor(fi2,
                                     {"mock": [[1, 3], [2, 2], [3, 1]]},
                                     naa)
    assert eval123321.shape == (3, 2)
    assert np.all(eval123321 == [[1, 3], [2, 2], [3, 1]])
    assert is_NA.shape == (3,)
    assert np.all(~is_NA)
    assert_raises(PatsyError, _eval_factor, fi2, {"mock": [1, 2, 3]}, naa)
    assert_raises(PatsyError, _eval_factor, fi2, {"mock": [[1, 2, 3]]}, naa)

    ev_nan, is_NA = _eval_factor(fi1, {"mock": [1, 2, np.nan]},
                                 NAAction(NA_types=["NaN"]))
    assert np.array_equal(is_NA, [False, False, True])
    ev_nan, is_NA = _eval_factor(fi1, {"mock": [1, 2, np.nan]},
                                 NAAction(NA_types=[]))
    assert np.array_equal(is_NA, [False, False, False])

    if have_pandas:
        eval_ser, _ = _eval_factor(fi1,
                                   {"mock":
                                    pandas.Series([1, 2, 3],
                                                  index=[10, 20, 30])},
                                   naa)
        assert isinstance(eval_ser, pandas.DataFrame)
        assert np.array_equal(eval_ser, [[1], [2], [3]])
        assert np.array_equal(eval_ser.index, [10, 20, 30])
        eval_df1, _ = _eval_factor(fi1,
                                   {"mock":
                                    pandas.DataFrame([[2], [1], [3]],
                                                     index=[20, 10, 30])},
                                   naa)
        assert isinstance(eval_df1, pandas.DataFrame)
        assert np.array_equal(eval_df1, [[2], [1], [3]])
        assert np.array_equal(eval_df1.index, [20, 10, 30])
        eval_df2, _ = _eval_factor(fi2,
                                   {"mock":
                                    pandas.DataFrame([[2, 3], [1, 4], [3, -1]],
                                                     index=[20, 30, 10])},
                                   naa)
        assert isinstance(eval_df2, pandas.DataFrame)
        assert np.array_equal(eval_df2, [[2, 3], [1, 4], [3, -1]])
        assert np.array_equal(eval_df2.index, [20, 30, 10])

        assert_raises(PatsyError,
                      _eval_factor, fi2,
                      {"mock": pandas.Series([1, 2, 3], index=[10, 20, 30])},
                      naa)
        assert_raises(PatsyError,
                      _eval_factor, fi1,
                      {"mock":
                       pandas.DataFrame([[2, 3], [1, 4], [3, -1]],
                                        index=[20, 30, 10])},
                      naa)

def test__eval_factor_categorical():
    from nose.tools import assert_raises
    from patsy.categorical import C
    naa = NAAction()
    f = _MockFactor()
    fi1 = FactorInfo(f, "categorical",
                     {}, num_columns=None, categories=("a", "b"))
    assert fi1.factor is f
    cat1, _ = _eval_factor(fi1, {"mock": ["b", "a", "b"]}, naa)
    assert cat1.shape == (3,)
    assert np.all(cat1 == [1, 0, 1])
    assert_raises(PatsyError, _eval_factor, fi1, {"mock": ["c"]}, naa)
    assert_raises(PatsyError, _eval_factor, fi1, {"mock": C(["a", "c"])}, naa)
    assert_raises(PatsyError, _eval_factor, fi1,
                  {"mock": C(["a", "b"], levels=["b", "a"])}, naa)
    assert_raises(PatsyError, _eval_factor, fi1, {"mock": [1, 0, 1]}, naa)
    bad_cat = np.asarray(["b", "a", "a", "b"])
    bad_cat.resize((2, 2))
    assert_raises(PatsyError, _eval_factor, fi1, {"mock": bad_cat}, naa)

    cat1_NA, is_NA = _eval_factor(fi1, {"mock": ["a", None, "b"]},
                                  NAAction(NA_types=["None"]))
    assert np.array_equal(is_NA, [False, True, False])
    assert np.array_equal(cat1_NA, [0, -1, 1])
    assert_raises(PatsyError, _eval_factor, fi1,
                  {"mock": ["a", None, "b"]}, NAAction(NA_types=[]))

    fi2 = FactorInfo(_MockFactor(), "categorical", {},
                     num_columns=None, categories=[False, True])
    cat2, _ = _eval_factor(fi2, {"mock": [True, False, False, True]}, naa)
    assert cat2.shape == (4,)
    assert np.all(cat2 == [1, 0, 0, 1])

    if have_pandas:
        s = pandas.Series(["b", "a"], index=[10, 20])
        cat_s, _ = _eval_factor(fi1, {"mock": s}, naa)
        assert isinstance(cat_s, pandas.Series)
        assert np.array_equal(cat_s, [1, 0])
        assert np.array_equal(cat_s.index, [10, 20])
        sbool = pandas.Series([True, False], index=[11, 21])
        cat_sbool, _ = _eval_factor(fi2, {"mock": sbool}, naa)
        assert isinstance(cat_sbool, pandas.Series)
        assert np.array_equal(cat_sbool, [1, 0])
        assert np.array_equal(cat_sbool.index, [11, 21])

def _column_combinations(columns_per_factor):
    # For consistency with R, the left-most item iterates fastest:
    iterators = [range(n) for n in reversed(columns_per_factor)]
    for reversed_combo in itertools.product(*iterators):
        yield reversed_combo[::-1]

def test__column_combinations():
    assert list(_column_combinations([2, 3])) == [(0, 0),
                                                  (1, 0),
                                                  (0, 1),
                                                  (1, 1),
                                                  (0, 2),
                                                  (1, 2)]
    assert list(_column_combinations([3])) == [(0,), (1,), (2,)]
    assert list(_column_combinations([])) == [()]

def _subterm_column_combinations(factor_infos, subterm):
    columns_per_factor = []
    for factor in subterm.factors:
        if factor in subterm.contrast_matrices:
            columns = subterm.contrast_matrices[factor].matrix.shape[1]
        else:
            columns = factor_infos[factor].num_columns
        columns_per_factor.append(columns)
    return _column_combinations(columns_per_factor)

def _subterm_column_names_iter(factor_infos, subterm):
    total = 0
    for i, column_idxs in enumerate(
            _subterm_column_combinations(factor_infos, subterm)):
        name_pieces = []
        for factor, column_idx in zip(subterm.factors, column_idxs):
            fi = factor_infos[factor]
            if fi.type == "numerical":
                if fi.num_columns > 1:
                    name_pieces.append("%s[%s]"
                                       % (factor.name(), column_idx))
                else:
                    assert column_idx == 0
                    name_pieces.append(factor.name())
            else:
                assert fi.type == "categorical"
                contrast = subterm.contrast_matrices[factor]
                suffix = contrast.column_suffixes[column_idx]
                name_pieces.append("%s%s" % (factor.name(), suffix))
        if not name_pieces:
            yield "Intercept"
        else:
            yield ":".join(name_pieces)
        total += 1
    assert total == subterm.num_columns

def _build_subterm(subterm, factor_infos, factor_values, out):
    assert subterm.num_columns == out.shape[1]
    out[...] = 1
    for i, column_idxs in enumerate(
            _subterm_column_combinations(factor_infos, subterm)):
        for factor, column_idx in zip(subterm.factors, column_idxs):
            if factor_infos[factor].type == "categorical":
                contrast = subterm.contrast_matrices[factor]
                if np.any(factor_values[factor] < 0):
                    raise PatsyError("can't build a design matrix "
                                     "containing missing values", factor)
                out[:, i] *= contrast.matrix[factor_values[factor],
                                             column_idx]
            else:
                assert factor_infos[factor].type == "numerical"
                assert (factor_values[factor].shape[1]
                        == factor_infos[factor].num_columns)
                out[:, i] *= factor_values[factor][:, column_idx]

def test__subterm_column_names_iter_and__build_subterm():
    from nose.tools import assert_raises
    from patsy.contrasts import ContrastMatrix
    from patsy.categorical import C
    f1 = _MockFactor("f1")
    f2 = _MockFactor("f2")
    f3 = _MockFactor("f3")
    contrast = ContrastMatrix(np.array([[0, 0.5],
                                        [3, 0]]),
                              ["[c1]", "[c2]"])

    factor_infos1 = {f1: FactorInfo(f1, "numerical", {},
                                    num_columns=1, categories=None),
                     f2: FactorInfo(f2, "categorical", {},
                                    num_columns=None, categories=["a", "b"]),
                     f3: FactorInfo(f3, "numerical", {},
                                    num_columns=1, categories=None),
                     }
    contrast_matrices = {f2: contrast}
    subterm1 = SubtermInfo([f1, f2, f3], contrast_matrices, 2)
    assert (list(_subterm_column_names_iter(factor_infos1, subterm1))
            == ["f1:f2[c1]:f3", "f1:f2[c2]:f3"])

    mat = np.empty((3, 2))
    _build_subterm(subterm1, factor_infos1,
                   {f1: atleast_2d_column_default([1, 2, 3]),
                    f2: np.asarray([0, 0, 1]),
                    f3: atleast_2d_column_default([7.5, 2, -12])},
                   mat)
    assert np.allclose(mat, [[0, 0.5 * 1 * 7.5],
                             [0, 0.5 * 2 * 2],
                             [3 * 3 * -12, 0]])
    # Check that missing categorical values blow up
    assert_raises(PatsyError, _build_subterm, subterm1, factor_infos1,
                  {f1: atleast_2d_column_default([1, 2, 3]),
                   f2: np.asarray([0, -1, 1]),
                   f3: atleast_2d_column_default([7.5, 2, -12])},
                  mat)

    factor_infos2 = dict(factor_infos1)
    factor_infos2[f1] = FactorInfo(f1, "numerical", {},
                                   num_columns=2, categories=None)
    subterm2 = SubtermInfo([f1, f2, f3], contrast_matrices, 4)
    assert (list(_subterm_column_names_iter(factor_infos2, subterm2))
            == ["f1[0]:f2[c1]:f3",
                "f1[1]:f2[c1]:f3",
                "f1[0]:f2[c2]:f3",
                "f1[1]:f2[c2]:f3"])

    mat2 = np.empty((3, 4))
    _build_subterm(subterm2, factor_infos2,
                   {f1: atleast_2d_column_default([[1, 2], [3, 4], [5, 6]]),
                    f2: np.asarray([0, 0, 1]),
                    f3: atleast_2d_column_default([7.5, 2, -12])},
                   mat2)
    assert np.allclose(mat2, [[0, 0, 0.5 * 1 * 7.5, 0.5 * 2 * 7.5],
                              [0, 0, 0.5 * 3 * 2, 0.5 * 4 * 2],
                              [3 * 5 * -12, 3 * 6 * -12, 0, 0]])

    subterm_int = SubtermInfo([], {}, 1)
    assert list(_subterm_column_names_iter({}, subterm_int)) == ["Intercept"]

    mat3 = np.empty((3, 1))
    _build_subterm(subterm_int, {},
                   {f1: [1, 2, 3], f2: [1, 2, 3], f3: [1, 2, 3]},
                   mat3)
    assert np.allclose(mat3, 1)

def _factors_memorize(factors, data_iter_maker, eval_env):
    # First, start off the memorization process by setting up each factor's
    # state and finding out how many passes it will need:
    factor_states = {}
    passes_needed = {}
    for factor in factors:
        state = {}
        which_pass = factor.memorize_passes_needed(state, eval_env)
        factor_states[factor] = state
        passes_needed[factor] = which_pass
    # Now, cycle through the data until all the factors have finished
    # memorizing everything:
    memorize_needed = set()
    for factor, passes in six.iteritems(passes_needed):
        if passes > 0:
            memorize_needed.add(factor)
    which_pass = 0
    while memorize_needed:
        for data in data_iter_maker():
            for factor in memorize_needed:
                state = factor_states[factor]
                factor.memorize_chunk(state, which_pass, data)
        for factor in list(memorize_needed):
            factor.memorize_finish(factor_states[factor], which_pass)
            if which_pass == passes_needed[factor] - 1:
                memorize_needed.remove(factor)
        which_pass += 1
    return factor_states

def test__factors_memorize():
    class MockFactor(object):
        def __init__(self, requested_passes, token):
            self._requested_passes = requested_passes
            self._token = token
            self._chunk_in_pass = 0
            self._seen_passes = 0

        def memorize_passes_needed(self, state, eval_env):
            state["calls"] = []
            state["token"] = self._token
            return self._requested_passes

        def memorize_chunk(self, state, which_pass, data):
            state["calls"].append(("memorize_chunk", which_pass))
            assert data["chunk"] == self._chunk_in_pass
            self._chunk_in_pass += 1

        def memorize_finish(self, state, which_pass):
            state["calls"].append(("memorize_finish", which_pass))
            self._chunk_in_pass = 0

    class Data(object):
        CHUNKS = 3
        def __init__(self):
            self.calls = 0
            self.data = [{"chunk": i} for i in range(self.CHUNKS)]
        def __call__(self):
            self.calls += 1
            return iter(self.data)
    data = Data()
    f0 = MockFactor(0, "f0")
    f1 = MockFactor(1, "f1")
    f2a = MockFactor(2, "f2a")
    f2b = MockFactor(2, "f2b")
    factor_states = _factors_memorize(set([f0, f1, f2a, f2b]), data, {})
    assert data.calls == 2
    mem_chunks0 = [("memorize_chunk", 0)] * data.CHUNKS
    mem_chunks1 = [("memorize_chunk", 1)] * data.CHUNKS
    expected = {
        f0: {
            "calls": [],
            "token": "f0",
        },
        f1: {
            "calls": mem_chunks0 + [("memorize_finish", 0)],
            "token": "f1",
        },
        f2a: {
            "calls": mem_chunks0 + [("memorize_finish", 0)]
                     + mem_chunks1 + [("memorize_finish", 1)],
            "token": "f2a",
        },
        f2b: {
            "calls": mem_chunks0 + [("memorize_finish", 0)]
                     + mem_chunks1 + [("memorize_finish", 1)],
            "token": "f2b",
        },
    }
    assert factor_states == expected

def _examine_factor_types(factors, factor_states, data_iter_maker, NA_action):
    num_column_counts = {}
    cat_sniffers = {}
    examine_needed = set(factors)
    for data in data_iter_maker():
        for factor in list(examine_needed):
            value = factor.eval(factor_states[factor], data)
            if factor in cat_sniffers or guess_categorical(value):
                if factor not in cat_sniffers:
                    cat_sniffers[factor] = CategoricalSniffer(NA_action,
                                                              factor.origin)
                done = cat_sniffers[factor].sniff(value)
                if done:
                    examine_needed.remove(factor)
            else:
                # Numeric
                value = atleast_2d_column_default(value)
                _max_allowed_dim(2, value, factor)
                column_count = value.shape[1]
                num_column_counts[factor] = column_count
                examine_needed.remove(factor)
        if not examine_needed:
            break
    # Pull out the levels
    cat_levels_contrasts = {}
    for factor, sniffer in six.iteritems(cat_sniffers):
        cat_levels_contrasts[factor] = sniffer.levels_contrast()
    return (num_column_counts, cat_levels_contrasts)

def test__examine_factor_types():
    from patsy.categorical import C
    class MockFactor(object):
        def __init__(self):
            # You should check this using 'is', not '=='
            from patsy.origin import Origin
            self.origin = Origin("MOCK", 1, 2)

        def eval(self, state, data):
            return state[data]

        def name(self):
            return "MOCK MOCK"

    # This hacky class can only be iterated over once, but it keeps track of
    # how far it got.
    class DataIterMaker(object):
        def __init__(self):
            self.i = -1

        def __call__(self):
            return self

        def __iter__(self):
            return self

        def next(self):
            self.i += 1
            if self.i > 1:
                raise StopIteration
            return self.i
        __next__ = next

    num_1dim = MockFactor()
    num_1col = MockFactor()
    num_4col = MockFactor()
    categ_1col = MockFactor()
    bool_1col = MockFactor()
    string_1col = MockFactor()
    object_1col = MockFactor()
    object_levels = (object(), object(), object())
    factor_states = {
        num_1dim: ([1, 2, 3], [4, 5, 6]),
        num_1col: ([[1], [2], [3]], [[4], [5], [6]]),
        num_4col: (np.zeros((3, 4)), np.ones((3, 4))),
        categ_1col: (C(["a", "b", "c"], levels=("a", "b", "c"),
                       contrast="MOCK CONTRAST"),
                     C(["c", "b", "a"], levels=("a", "b", "c"),
                       contrast="MOCK CONTRAST")),
        bool_1col: ([True, True, False], [False, True, True]),
        # It has to read through all the data to see all the possible levels:
        string_1col: (["a", "a", "a"], ["c", "b", "a"]),
        object_1col: ([object_levels[0]] * 3, object_levels),
    }

    it = DataIterMaker()
    (num_column_counts, cat_levels_contrasts,
     ) = _examine_factor_types(factor_states.keys(), factor_states, it,
                               NAAction())
    assert it.i == 2
    iterations = 0
    assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4}
    assert cat_levels_contrasts == {
        categ_1col: (("a", "b", "c"), "MOCK CONTRAST"),
        bool_1col: ((False, True), None),
        string_1col: (("a", "b", "c"), None),
        object_1col: (tuple(sorted(object_levels, key=id)), None),
    }

    # Check that it doesn't read through all the data if that's not necessary:
    it = DataIterMaker()
    no_read_necessary = [num_1dim, num_1col, num_4col, categ_1col, bool_1col]
    (num_column_counts, cat_levels_contrasts,
     ) = _examine_factor_types(no_read_necessary, factor_states, it,
                               NAAction())
    assert it.i == 0
    assert num_column_counts == {num_1dim: 1, num_1col: 1, num_4col: 4}
    assert cat_levels_contrasts == {
        categ_1col: (("a", "b", "c"), "MOCK CONTRAST"),
        bool_1col: ((False, True), None),
    }

    # Illegal inputs:
    bool_3col = MockFactor()
    num_3dim = MockFactor()
    # no such thing as a multi-dimensional Categorical
    # categ_3dim = MockFactor()
    string_3col = MockFactor()
    object_3col = MockFactor()
    illegal_factor_states = {
        num_3dim: (np.zeros((3, 3, 3)), np.ones((3, 3, 3))),
        string_3col: ([["a", "b", "c"]], [["b", "c", "a"]]),
        object_3col: ([[[object()]]], [[[object()]]]),
    }
    from nose.tools import assert_raises
    for illegal_factor in illegal_factor_states:
        it = DataIterMaker()
        try:
            _examine_factor_types([illegal_factor], illegal_factor_states, it,
                                  NAAction())
        except PatsyError as e:
            assert e.origin is illegal_factor.origin
        else:
            assert False

def _make_subterm_infos(terms,
                        num_column_counts,
                        cat_levels_contrasts):
    # Sort each term into a bucket based on the set of numeric factors it
    # contains:
    term_buckets = OrderedDict()
    bucket_ordering = []
    for term in terms:
        num_factors = []
        for factor in term.factors:
            if factor in num_column_counts:
                num_factors.append(factor)
        bucket = frozenset(num_factors)
        if bucket not in term_buckets:
            bucket_ordering.append(bucket)
        term_buckets.setdefault(bucket, []).append(term)
    # Special rule: if there is a no-numerics bucket, then it always comes
    # first:
    if frozenset() in term_buckets:
        bucket_ordering.remove(frozenset())
        bucket_ordering.insert(0, frozenset())
    term_to_subterm_infos = OrderedDict()
    new_term_order = []
    # Then within each bucket, work out which sort of contrasts we want to use
    # for each term to avoid redundancy
    for bucket in bucket_ordering:
        bucket_terms = term_buckets[bucket]
        # Sort by degree of interaction
        bucket_terms.sort(key=lambda t: len(t.factors))
        new_term_order += bucket_terms
        used_subterms = set()
        for term in bucket_terms:
            subterm_infos = []
            factor_codings = pick_contrasts_for_term(term,
                                                     num_column_counts,
                                                     used_subterms)
            # Construct one SubtermInfo for each subterm
            for factor_coding in factor_codings:
                subterm_factors = []
                contrast_matrices = {}
                subterm_columns = 1
                # In order to preserve factor ordering information, the
                # coding_for_term just returns dicts, and we refer to
                # the original factors to figure out which are included in
                # each subterm, and in what order
                for factor in term.factors:
                    # Numeric factors are included in every subterm
                    if factor in num_column_counts:
                        subterm_factors.append(factor)
                        subterm_columns *= num_column_counts[factor]
                    elif factor in factor_coding:
                        subterm_factors.append(factor)
                        levels, contrast = cat_levels_contrasts[factor]
                        # This is where the default coding is set to
                        # Treatment:
                        coded = code_contrast_matrix(factor_coding[factor],
                                                     levels, contrast,
                                                     default=Treatment)
                        contrast_matrices[factor] = coded
                        subterm_columns *= coded.matrix.shape[1]
                subterm_infos.append(SubtermInfo(subterm_factors,
                                                 contrast_matrices,
                                                 subterm_columns))
            term_to_subterm_infos[term] = subterm_infos
    assert new_term_order == list(term_to_subterm_infos)
    return term_to_subterm_infos

def design_matrix_builders(termlists, data_iter_maker, eval_env,
                           NA_action="drop"):
    """Construct several :class:`DesignInfo` objects from termlists.

    This is one of Patsy's fundamental functions. This function and
    :func:`build_design_matrices` together form the API to the core formula
    interpretation machinery.

    :arg termlists: A list of termlists, where each termlist is a list of
      :class:`Term` objects which together specify a design matrix.
    :arg data_iter_maker: A zero-argument callable which returns an iterator
      over dict-like data objects. This must be a callable rather than a
      simple iterator because sufficiently complex formulas may require
      multiple passes over the data (e.g. if there are nested stateful
      transforms).
    :arg eval_env: Either an :class:`EvalEnvironment` which will be used to
      look up any variables referenced in `termlists` that cannot be
      found in `data_iter_maker`, or else a depth represented as an
      integer which will be passed to :meth:`EvalEnvironment.capture`.
      ``eval_env=0`` means to use the context of the function calling
      :func:`design_matrix_builders` for lookups. If calling this function
      from a library, you probably want ``eval_env=1``, which means that
      variables should be resolved in *your* caller's namespace.
    :arg NA_action: An :class:`NAAction` object or string, used to determine
      what values count as 'missing' for purposes of determining the levels
      of categorical factors.
    :returns: A list of :class:`DesignInfo` objects, one for each
      termlist passed in.

    This function performs zero or more iterations over the data in order to
    sniff out any necessary information about factor types, set up stateful
    transforms, pick column names, etc.

    See :ref:`formulas` for details.

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    .. versionadded:: 0.4.0
       The ``eval_env`` argument.
    """
    # People upgrading from versions prior to 0.4.0 could potentially have
    # passed NA_action as the 3rd positional argument. Fortunately
    # EvalEnvironment.capture only accepts int and EvalEnvironment objects,
    # and we improved its error messages to make this clear.
    eval_env = EvalEnvironment.capture(eval_env, reference=1)
    if isinstance(NA_action, str):
        NA_action = NAAction(NA_action)
    all_factors = set()
    for termlist in termlists:
        for term in termlist:
            all_factors.update(term.factors)
    factor_states = _factors_memorize(all_factors, data_iter_maker, eval_env)
    # Now all the factors have working eval methods, so we can evaluate them
    # on some data to find out what type of data they return.
    (num_column_counts,
     cat_levels_contrasts) = _examine_factor_types(all_factors,
                                                   factor_states,
                                                   data_iter_maker,
                                                   NA_action)
    # Now we need the factor infos, which encapsulate the knowledge of
    # how to turn any given factor into a chunk of data:
    factor_infos = {}
    for factor in all_factors:
        if factor in num_column_counts:
            fi = FactorInfo(factor,
                            "numerical",
                            factor_states[factor],
                            num_columns=num_column_counts[factor],
                            categories=None)
        else:
            assert factor in cat_levels_contrasts
            categories = cat_levels_contrasts[factor][0]
            fi = FactorInfo(factor,
                            "categorical",
                            factor_states[factor],
                            num_columns=None,
                            categories=categories)
        factor_infos[factor] = fi
    # And now we can construct the DesignInfo for each termlist:
    design_infos = []
    for termlist in termlists:
        term_to_subterm_infos = _make_subterm_infos(termlist,
                                                    num_column_counts,
                                                    cat_levels_contrasts)
        assert isinstance(term_to_subterm_infos, OrderedDict)
        assert frozenset(term_to_subterm_infos) == frozenset(termlist)
        this_design_factor_infos = {}
        for term in termlist:
            for factor in term.factors:
                this_design_factor_infos[factor] = factor_infos[factor]
        column_names = []
        for subterms in six.itervalues(term_to_subterm_infos):
            for subterm in subterms:
                for column_name in _subterm_column_names_iter(
                        factor_infos, subterm):
                    column_names.append(column_name)
        design_infos.append(DesignInfo(column_names,
                                       factor_infos=this_design_factor_infos,
                                       term_codings=term_to_subterm_infos))
    return design_infos
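
# Editor's note: the usage sketch below is not part of the original module.
# It is styled after this file's own test functions and shows, assuming the
# public patsy API (ModelDesc.from_formula) is available, how
# design_matrix_builders() and build_design_matrices() fit together. The
# names `data` and `desc` are illustrative only, and the function is never
# called at import time.
def _example_design_matrix_builders():
    from patsy import ModelDesc
    data = {"y": [1, 2, 3], "x": [4, 5, 6]}
    desc = ModelDesc.from_formula("y ~ x")
    # data_iter_maker must be a zero-argument callable that returns an
    # iterator over dict-like data chunks; a one-element list suffices here.
    design_infos = design_matrix_builders([desc.lhs_termlist,
                                           desc.rhs_termlist],
                                          lambda: iter([data]),
                                          eval_env=0)
    y, X = build_design_matrices(design_infos, data)
    assert X.design_info.column_names == ["Intercept", "x"]
    assert y.design_info.column_names == ["y"]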

def _build_design_matrix(design_info, factor_info_to_values, dtype):
    factor_to_values = {}
    need_reshape = False
    num_rows = None
    for factor_info, value in six.iteritems(factor_info_to_values):
        # It's possible that the same factor appears in multiple different
        # FactorInfo objects (e.g. if someone is simultaneously building two
        # DesignInfo objects that started out as part of different
        # formulas). Skip any factor_info that is not our expected
        # factor_info.
        if design_info.factor_infos.get(factor_info.factor) is not factor_info:
            continue
        factor_to_values[factor_info.factor] = value
        if num_rows is not None:
            assert num_rows == value.shape[0]
        else:
            num_rows = value.shape[0]
    if num_rows is None:
        # We have no dependence on the data -- e.g. an empty termlist, or
        # only an intercept term.
        num_rows = 1
        need_reshape = True
    shape = (num_rows, len(design_info.column_names))
    m = DesignMatrix(np.empty(shape, dtype=dtype), design_info)
    start_column = 0
    for term, subterms in six.iteritems(design_info.term_codings):
        for subterm in subterms:
            end_column = start_column + subterm.num_columns
            m_slice = m[:, start_column:end_column]
            _build_subterm(subterm, design_info.factor_infos,
                           factor_to_values, m_slice)
            start_column = end_column
    assert start_column == m.shape[1]
    return need_reshape, m

class _CheckMatch(object):
    def __init__(self, name, eq_fn):
        self._name = name
        self._eq_fn = eq_fn
        self.value = None
        self._value_desc = None
        self._value_origin = None

    def check(self, seen_value, desc, origin):
        if self.value is None:
            self.value = seen_value
            self._value_desc = desc
            self._value_origin = origin
        else:
            if not self._eq_fn(self.value, seen_value):
                msg = ("%s mismatch between %s and %s"
                       % (self._name, self._value_desc, desc))
                if isinstance(self.value, int):
                    msg += " (%r versus %r)" % (self.value, seen_value)
                # XX FIXME: this is a case where having discontiguous Origins
                # would be useful...
                raise PatsyError(msg, origin)

def build_design_matrices(design_infos, data,
                          NA_action="drop",
                          return_type="matrix",
                          dtype=np.dtype(float)):
    """Construct several design matrices from :class:`DesignInfo` objects.

    This is one of Patsy's fundamental functions. This function and
    :func:`design_matrix_builders` together form the API to the core formula
    interpretation machinery.

    :arg design_infos: A list of :class:`DesignInfo` objects describing the
      design matrices to be built.
    :arg data: A dict-like object which will be used to look up data.
    :arg NA_action: What to do with rows that contain missing values. You can
      ``"drop"`` them, ``"raise"`` an error, or for customization, pass an
      :class:`NAAction` object. See :class:`NAAction` for details on what
      values count as 'missing' (and how to alter this).
    :arg return_type: Either ``"matrix"`` or ``"dataframe"``. See below.
    :arg dtype: The dtype of the returned matrix. Useful if you want to use
      single-precision or extended-precision.

    This function returns either a list of :class:`DesignMatrix` objects (for
    ``return_type="matrix"``) or a list of :class:`pandas.DataFrame` objects
    (for ``return_type="dataframe"``). In both cases, all returned design
    matrices will have ``.design_info`` attributes containing the appropriate
    :class:`DesignInfo` objects.

    Note that unlike :func:`design_matrix_builders`, this function takes only
    a simple data argument, not any kind of iterator. That's because this
    function doesn't need a global view of the data -- everything that
    depends on the whole data set is already encapsulated in the
    ``design_infos``. If you are incrementally processing a large data set,
    simply call this function for each chunk.

    Index handling: This function always checks for indexes in the following
    places:

    * If ``data`` is a :class:`pandas.DataFrame`, its ``.index`` attribute.
    * If any factors evaluate to a :class:`pandas.Series` or
      :class:`pandas.DataFrame`, then their ``.index`` attributes.

    If multiple indexes are found, they must be identical (same values in the
    same order). If no indexes are found, then a default index is generated
    using ``np.arange(num_rows)``. One way or another, we end up with a
    single index for all the data. If ``return_type="dataframe"``, then this
    index is used as the index of the returned DataFrame objects. Examining
    this index makes it possible to determine which rows were removed due to
    NAs.

    Determining the number of rows in design matrices: This is not as obvious
    as it might seem, because it's possible to have a formula like "~ 1" that
    doesn't depend on the data (it has no factors). For this formula, it's
    obvious what every row in the design matrix should look like (just the
    value ``1``); but, how many rows like this should there be? To determine
    the number of rows in a design matrix, this function always checks in the
    following places:

    * If ``data`` is a :class:`pandas.DataFrame`, then its number of rows.
    * The number of entries in any factors present in any of the design
      matrices being built.

    All these values must match. In particular, if this function is called to
    generate multiple design matrices at once, then they must all have the
    same number of rows.

    .. versionadded:: 0.2.0
       The ``NA_action`` argument.
    """
    if isinstance(NA_action, str):
        NA_action = NAAction(NA_action)
    if return_type == "dataframe" and not have_pandas:
        raise PatsyError("pandas.DataFrame was requested, but pandas "
                         "is not installed")
    if return_type not in ("matrix", "dataframe"):
        raise PatsyError("unrecognized output type %r, should be "
                         "'matrix' or 'dataframe'" % (return_type,))
    # Evaluate factors
    factor_info_to_values = {}
    factor_info_to_isNAs = {}
    rows_checker = _CheckMatch("Number of rows", lambda a, b: a == b)
    index_checker = _CheckMatch("Index", lambda a, b: a.equals(b))
    if have_pandas and isinstance(data, pandas.DataFrame):
        index_checker.check(data.index, "data.index", None)
        rows_checker.check(data.shape[0], "data argument", None)
    for design_info in design_infos:
        # We look at FactorInfo objects rather than factors here, because it
        # might happen that we have the same factor twice, but with different
        # memorized state.
        for factor_info in six.itervalues(design_info.factor_infos):
            if factor_info not in factor_info_to_values:
                value, is_NA = _eval_factor(factor_info, data, NA_action)
                factor_info_to_isNAs[factor_info] = is_NA
                # value may now be a Series, DataFrame, or ndarray
                name = factor_info.factor.name()
                origin = factor_info.factor.origin
                rows_checker.check(value.shape[0], name, origin)
                if (have_pandas
                        and isinstance(value, (pandas.Series,
                                               pandas.DataFrame))):
                    index_checker.check(value.index, name, origin)
                # Strategy: we work with raw ndarrays for doing the actual
                # combining; the design-matrix building code never sees
                # pandas objects. Then at the end, if a DataFrame was
                # requested, we convert. So every entry in this dict is
                # either a 2-d array of floats, or a 1-d array of integers
                # (representing categories).
                value = np.asarray(value)
                factor_info_to_values[factor_info] = value
    # Handle NAs
    values = list(factor_info_to_values.values())
    is_NAs = list(factor_info_to_isNAs.values())
    origins = [factor_info.factor.origin
               for factor_info in factor_info_to_values]
    pandas_index = index_checker.value
    num_rows = rows_checker.value
    # num_rows is None iff factor_info_to_values (and associated lists like
    # 'values') are empty, i.e., we have no actual factors involved
    # (formulas like "~ 1").
    if return_type == "dataframe" and num_rows is not None:
        if pandas_index is None:
            pandas_index = np.arange(num_rows)
        values.append(pandas_index)
        is_NAs.append(np.zeros(len(pandas_index), dtype=bool))
        origins.append(None)
    new_values = NA_action.handle_NA(values, is_NAs, origins)
    # NA_action may have changed the number of rows.
    if new_values:
        num_rows = new_values[0].shape[0]
    if return_type == "dataframe" and num_rows is not None:
        pandas_index = new_values.pop()
    factor_info_to_values = dict(zip(factor_info_to_values, new_values))
    # Build factor values into matrices
    results = []
    for design_info in design_infos:
        results.append(_build_design_matrix(design_info,
                                            factor_info_to_values,
                                            dtype))
    matrices = []
    for need_reshape, matrix in results:
        if need_reshape:
            # There is no data-dependence, at all -- a formula like "1 ~ 1".
            # In this case the builder just returns a single-row matrix, and
            # we have to broadcast it vertically to the appropriate size. If
            # we can figure out what that is...
            assert matrix.shape[0] == 1
            if num_rows is not None:
                matrix = DesignMatrix(np.repeat(matrix, num_rows, axis=0),
                                      matrix.design_info)
            else:
                raise PatsyError(
                    "No design matrix has any non-trivial factors, and "
                    "the data object is not a DataFrame. "
                    "I can't tell how many rows the design matrix should "
                    "have!"
                    )
        matrices.append(matrix)
    if return_type == "dataframe":
        assert have_pandas
        for i, matrix in enumerate(matrices):
            di = matrix.design_info
            matrices[i] = pandas.DataFrame(matrix,
                                           columns=di.column_names,
                                           index=pandas_index)
            matrices[i].design_info = di
    return matrices
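
# Editor's note: the sketch below is not part of the original module. It
# illustrates the chunked usage described in the docstring above: once the
# DesignInfo objects exist, build_design_matrices() only needs a plain data
# object, so a large data set can be processed one chunk at a time. The
# chunk contents and the ModelDesc-based setup are illustrative assumptions,
# and the function is never called at import time.
def _example_build_design_matrices_chunked():
    from patsy import ModelDesc
    chunks = [{"x": [1.0, 2.0]}, {"x": [3.0, 4.0, 5.0]}]
    desc = ModelDesc.from_formula("~ x")
    # The data_iter_maker may be called several times, so it must return a
    # fresh iterator over the chunks on each call.
    design_infos = design_matrix_builders([desc.rhs_termlist],
                                          lambda: iter(chunks),
                                          eval_env=0)
    total_rows = 0
    for chunk in chunks:
        (X,) = build_design_matrices(design_infos, chunk)
        total_rows += X.shape[0]
    assert total_rows == 5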

# It should be possible to do just the factors -> factor_infos stuff
# alone, since that, well, makes logical sense to do.