Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/patsy/desc.py : 26%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# This file is part of Patsy
2# Copyright (C) 2011-2012 Nathaniel Smith <njs@pobox.com>
3# See file LICENSE.txt for license information.
5# This file defines the ModelDesc class, which describes a model at a high
6# level, as a list of interactions of factors. It also has the code to convert
7# a formula parse tree (from patsy.parse_formula) into a ModelDesc.
9from __future__ import print_function
11import six
12from patsy import PatsyError
13from patsy.parse_formula import ParseNode, Token, parse_formula
14from patsy.eval import EvalEnvironment, EvalFactor
15from patsy.util import uniqueify_list
16from patsy.util import repr_pretty_delegate, repr_pretty_impl
17from patsy.util import no_pickling, assert_no_pickling
# Public names exported by this module. These are made available in the
# patsy.* namespace.
__all__ = ["Term", "ModelDesc", "INTERCEPT"]
22# One might think it would make more sense for 'factors' to be a set, rather
23# than a tuple-with-guaranteed-unique-entries-that-compares-like-a-set. The
24# reason we do it this way is that it preserves the order that the user typed
25# and is expecting, which then ends up producing nicer names in our final
26# output, nicer column ordering, etc. (A similar comment applies to the
27# ordering of terms in ModelDesc objects as a whole.)
class Term(object):
    """An interaction between a collection of factor objects.

    A term is one of the basic building blocks used to represent formulas;
    it corresponds to an expression such as ``"a:b:c"`` in a formula string.
    For details, see :ref:`formulas` and :ref:`expert-model-specification`.

    Terms are hashable and compare by value; the order of the factors does
    not matter for equality or hashing.

    Attributes:

    .. attribute:: factors

       A tuple of factor objects, with duplicate entries removed.
    """
    def __init__(self, factors):
        unique_factors = uniqueify_list(factors)
        self.factors = tuple(unique_factors)

    def __eq__(self, other):
        if not isinstance(other, Term):
            return False
        # Compare as sets so that "a:b" and "b:a" are the same term.
        return frozenset(other.factors) == frozenset(self.factors)

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Must agree with __eq__, hence the order-insensitive frozenset.
        key = (Term, frozenset(self.factors))
        return hash(key)

    __repr__ = repr_pretty_delegate

    def _repr_pretty_(self, p, cycle):
        assert not cycle
        repr_pretty_impl(p, self, [list(self.factors)])

    def name(self):
        """Return a human-readable name for this term."""
        if not self.factors:
            return "Intercept"
        return ":".join([factor.name() for factor in self.factors])

    __getstate__ = no_pickling
# The term with no factors at all; Term.name() reports it as "Intercept".
INTERCEPT = Term([])
72class _MockFactor(object):
73 def __init__(self, name):
74 self._name = name
76 def name(self):
77 return self._name
def test_Term():
    """Check Term de-duplication, equality, hashing, naming and pickling."""
    # Duplicate factors are dropped, keeping first-seen order.
    assert Term([1, 2, 1]).factors == (1, 2)
    # Equality and hashing ignore the order of the factors.
    assert Term([1, 2]) == Term([2, 1])
    assert hash(Term([1, 2])) == hash(Term([2, 1]))
    factor_a = _MockFactor("a")
    factor_b = _MockFactor("b")
    # ...but the name follows the order in which factors were given.
    assert Term([factor_a, factor_b]).name() == "a:b"
    assert Term([factor_b, factor_a]).name() == "b:a"
    assert Term([]).name() == "Intercept"

    assert_no_pickling(Term([]))
class ModelDesc(object):
    """A simple container for the termlists parsed from a formula.

    A ModelDesc has exactly the same representational power as a formula
    string, but it is a Python object instead. One can be constructed by
    hand and passed to functions like :func:`dmatrix` or
    :func:`incr_dbuilder` that are expecting a formula string, without any
    messy string manipulation. For details see
    :ref:`expert-model-specification`.

    Attributes:

    .. attribute:: lhs_termlist
                   rhs_termlist

       Two termlists representing the left- and right-hand sides of a
       formula, suitable for passing to :func:`design_matrix_builders`.
    """
    def __init__(self, lhs_termlist, rhs_termlist):
        self.lhs_termlist = uniqueify_list(lhs_termlist)
        self.rhs_termlist = uniqueify_list(rhs_termlist)

    __repr__ = repr_pretty_delegate

    def _repr_pretty_(self, p, cycle):
        assert not cycle
        keyword_fields = [("lhs_termlist", self.lhs_termlist),
                          ("rhs_termlist", self.rhs_termlist)]
        return repr_pretty_impl(p, self, [], keyword_fields)

    def describe(self):
        """Returns a human-readable representation of this :class:`ModelDesc`
        in pseudo-formula notation.

        .. warning:: There is no guarantee that the strings returned by this
           function can be parsed as formulas. They are best-effort
           descriptions intended for human users. However, if this ModelDesc
           was created by parsing a formula, then it should work in
           practice. If you *really* have to.
        """
        def term_code(term):
            # The intercept is spelled "1" in formula notation.
            return "1" if term == INTERCEPT else term.name()

        lhs = " + ".join([term_code(term) for term in self.lhs_termlist])
        prefix = lhs + " ~ " if lhs else "~ "

        # A right-hand side that is only the intercept is written "1".
        if self.rhs_termlist == [INTERCEPT]:
            return prefix + term_code(INTERCEPT)

        rhs_names = []
        # A missing intercept has to be spelled out explicitly as "0".
        if INTERCEPT not in self.rhs_termlist:
            rhs_names.append("0")
        rhs_names += [term_code(term) for term in self.rhs_termlist
                      if term != INTERCEPT]
        return prefix + " + ".join(rhs_names)

    @classmethod
    def from_formula(cls, tree_or_string):
        """Construct a :class:`ModelDesc` from a formula string.

        :arg tree_or_string: A formula string. (Or an unevaluated formula
          parse tree, but the API for generating those isn't public yet. Shh,
          it can be our secret.)
        :returns: A new :class:`ModelDesc`.
        """
        tree = tree_or_string
        if not isinstance(tree, ParseNode):
            tree = parse_formula(tree_or_string)
        # The top-level node is allowed to evaluate to a ModelDesc rather
        # than an IntermediateExpr, hence require_evalexpr=False.
        value = Evaluator().eval(tree, require_evalexpr=False)
        assert isinstance(value, cls)
        return value

    __getstate__ = no_pickling
def test_ModelDesc():
    """Check ModelDesc construction, describe() output and pickling."""
    factor_a = _MockFactor("a")
    factor_b = _MockFactor("b")
    term_a = Term([factor_a])
    term_ab = Term([factor_a, factor_b])
    m = ModelDesc([INTERCEPT, Term([factor_a])], [term_a, term_ab])
    assert m.lhs_termlist == [INTERCEPT, Term([factor_a])]
    assert m.rhs_termlist == [Term([factor_a]), Term([factor_a, factor_b])]
    print(m.describe())
    assert m.describe() == "1 + a ~ 0 + a + a:b"

    assert_no_pickling(m)

    # Intercept handling on otherwise empty sides.
    assert ModelDesc([], []).describe() == "~ 0"
    assert ModelDesc([INTERCEPT], []).describe() == "1 ~ 0"
    assert ModelDesc([INTERCEPT], [INTERCEPT]).describe() == "1 ~ 1"
    with_intercept = ModelDesc([INTERCEPT], [INTERCEPT, Term([factor_b])])
    assert with_intercept.describe() == "1 ~ b"
def test_ModelDesc_from_formula():
    """from_formula accepts both a formula string and a parse tree."""
    formulas = ("y ~ x", parse_formula("y ~ x"))
    for formula in formulas:
        md = ModelDesc.from_formula(formula)
        assert md.lhs_termlist == [Term([EvalFactor("y")]),]
        assert md.rhs_termlist == [INTERCEPT, Term([EvalFactor("x")])]
class IntermediateExpr(object):
    """This class holds an intermediate result while we're evaluating a tree.

    Attributes (all set by the constructor):

    * ``intercept`` -- true if this sub-expression carries an intercept.
    * ``intercept_origin`` -- where that intercept came from; it is passed to
      :class:`PatsyError` as the error origin when the intercept is used
      somewhere it is not allowed. Must be truthy whenever ``intercept`` is.
    * ``intercept_removed`` -- true if this sub-expression removes the
      intercept. Never true together with ``intercept``.
    * ``terms`` -- a tuple of the terms collected so far, with duplicates
      removed.
    """
    def __init__(self, intercept, intercept_origin, intercept_removed, terms):
        self.intercept = intercept
        self.intercept_origin = intercept_origin
        self.intercept_removed = intercept_removed
        self.terms = tuple(uniqueify_list(terms))
        # Internal invariants: an intercept always has an origin to report
        # errors against, and cannot be both present and removed.
        if self.intercept:
            assert self.intercept_origin
        assert not (self.intercept and self.intercept_removed)

    __repr__ = repr_pretty_delegate

    # The hook must be called ``_repr_pretty_`` (as on Term and ModelDesc);
    # it was previously misspelled ``_pretty_repr_`` and so was never found.
    def _repr_pretty_(self, p, cycle): # pragma: no cover
        assert not cycle
        return repr_pretty_impl(p, self,
                                [self.intercept, self.intercept_origin,
                                 self.intercept_removed, self.terms])

    # Backward-compatible alias for the old, misspelled method name.
    _pretty_repr_ = _repr_pretty_

    __getstate__ = no_pickling
214def _maybe_add_intercept(doit, terms):
215 if doit:
216 return (INTERCEPT,) + terms
217 else:
218 return terms
def _eval_any_tilde(evaluator, tree):
    """Evaluate a unary or binary ``~`` node into a :class:`ModelDesc`."""
    sides = [evaluator.eval(arg) for arg in tree.args]
    if len(sides) == 1:
        # Formula was like: "~ foo"
        # We pretend that instead it was like: "0 ~ foo"
        sides.insert(0, IntermediateExpr(False, None, True, []))
    assert len(sides) == 2
    lhs, rhs = sides
    # Note that only the RHS gets an implicit intercept: the LHS has one
    # only when it was asked for, the RHS unless it was removed.
    lhs_terms = _maybe_add_intercept(lhs.intercept, lhs.terms)
    rhs_terms = _maybe_add_intercept(not rhs.intercept_removed, rhs.terms)
    return ModelDesc(lhs_terms, rhs_terms)
def _eval_binary_plus(evaluator, tree):
    """Evaluate ``left + right``.

    ``+ 0`` removes the intercept; otherwise the terms of both sides are
    concatenated, and an intercept on the right-hand side wins over
    whatever intercept state the left-hand side had.
    """
    left = evaluator.eval(tree.args[0])
    rhs_node = tree.args[1]
    if rhs_node.type == "ZERO":
        return IntermediateExpr(False, None, True, left.terms)

    right = evaluator.eval(rhs_node)
    combined = left.terms + right.terms
    if right.intercept:
        return IntermediateExpr(True, right.intercept_origin, False,
                                combined)
    # Otherwise inherit the intercept state of the left-hand side.
    return IntermediateExpr(left.intercept,
                            left.intercept_origin,
                            left.intercept_removed,
                            combined)
def _eval_binary_minus(evaluator, tree):
    """Evaluate ``left - right``.

    ``- 0`` adds an intercept, ``- 1`` removes it; otherwise every term of
    the right-hand side is dropped from the left-hand side.
    """
    left = evaluator.eval(tree.args[0])
    rhs_node = tree.args[1]
    if rhs_node.type == "ZERO":
        # Subtracting "no intercept" means asking for an intercept.
        return IntermediateExpr(True, rhs_node, False, left.terms)
    if rhs_node.type == "ONE":
        return IntermediateExpr(False, None, True, left.terms)

    right = evaluator.eval(rhs_node)
    remaining = [term for term in left.terms
                 if term not in right.terms]
    if right.intercept:
        # The subtracted expression contained an intercept: remove it.
        return IntermediateExpr(False, None, True, remaining)
    return IntermediateExpr(left.intercept,
                            left.intercept_origin,
                            left.intercept_removed,
                            remaining)
267def _check_interactable(expr):
268 if expr.intercept:
269 raise PatsyError("intercept term cannot interact with "
270 "anything else", expr.intercept_origin)
def _interaction(left_expr, right_expr):
    """Return the expression holding every pairwise left:right interaction.

    Neither operand may carry an intercept. The result has no intercept and
    contains one term per (left term, right term) pair, built from the
    concatenated factors of the pair.
    """
    _check_interactable(left_expr)
    _check_interactable(right_expr)
    crossed = [Term(l_term.factors + r_term.factors)
               for l_term in left_expr.terms
               for r_term in right_expr.terms]
    return IntermediateExpr(False, None, False, crossed)
def _eval_binary_prod(evaluator, tree):
    """Evaluate ``a * b``: the terms of both operands plus their interaction."""
    exprs = [evaluator.eval(arg) for arg in tree.args]
    main_effects = exprs[0].terms + exprs[1].terms
    crossed = _interaction(*exprs).terms
    return IntermediateExpr(False, None, False, main_effects + crossed)
288# Division (nesting) is right-ward distributive:
289# a / (b + c) -> a/b + a/c -> a + a:b + a:c
290# But left-ward, in S/R it has a quirky behavior:
291# (a + b)/c -> a + b + a:b:c
292# This is because it's meaningless for a factor to be "nested" under two
293# different factors. (This is documented in Chambers and Hastie (page 30) as a
294# "Slightly more subtle..." rule, with no further elaboration. Hopefully we
295# will do better.)
def _eval_binary_div(evaluator, tree):
    """Evaluate ``left / right`` (nesting).

    The result keeps every left-hand term and adds the interaction of one
    combined left-hand term with each right-hand term.
    """
    left_expr = evaluator.eval(tree.args[0])
    right_expr = evaluator.eval(tree.args[1])
    _check_interactable(left_expr)
    # Build a single giant combined term for everything on the left:
    merged_factors = [factor
                      for term in left_expr.terms
                      for factor in term.factors]
    merged_left = IntermediateExpr(False, None, False,
                                   [Term(merged_factors)])
    # Then interact it with everything on the right:
    nested = _interaction(merged_left, right_expr)
    all_terms = list(left_expr.terms) + list(nested.terms)
    return IntermediateExpr(False, None, False, all_terms)
def _eval_binary_interact(evaluator, tree):
    """Evaluate ``a:b``: the pairwise interaction of the two operands."""
    operands = [evaluator.eval(arg) for arg in tree.args]
    return _interaction(*operands)
def _eval_binary_power(evaluator, tree):
    """Evaluate ``expr ** n`` for a positive integer literal ``n``.

    The result contains the terms of ``expr`` together with the terms of
    its repeated interactions with itself, up to order ``n``. Raises
    :class:`PatsyError` if the exponent is not a positive integer literal.
    """
    base = evaluator.eval(tree.args[0])
    _check_interactable(base)

    exponent_node = tree.args[1]
    exponent = -1  # stays invalid unless we manage to parse an integer
    if exponent_node.type in ("ONE", "NUMBER"):
        literal = exponent_node.token.extra
        try:
            exponent = int(literal)
        except ValueError:
            # Not an integer literal (e.g. "1.5"); rejected just below.
            pass
    if exponent < 1:
        raise PatsyError("'**' requires a positive integer", exponent_node)

    # Small optimization: (a + b)**100 is just the same as (a + b)**2.
    exponent = min(len(base.terms), exponent)
    accumulated = base.terms
    current = base
    for _ in range(1, exponent):
        current = _interaction(base, current)
        accumulated = accumulated + current.terms
    return IntermediateExpr(False, None, False, accumulated)
336def _eval_unary_plus(evaluator, tree):
337 return evaluator.eval(tree.args[0])
def _eval_unary_minus(evaluator, tree):
    """Evaluate ``-0`` (adds the intercept) or ``-1`` (removes it).

    Any other operand raises :class:`PatsyError`.
    """
    operand_type = tree.args[0].type
    if operand_type == "ZERO":
        return IntermediateExpr(True, tree.origin, False, [])
    if operand_type == "ONE":
        return IntermediateExpr(False, None, True, [])
    raise PatsyError("Unary minus can only be applied to 1 or 0", tree)
def _eval_zero(evaluator, tree):
    """Evaluate a literal ``0``: no terms, intercept removed."""
    return IntermediateExpr(intercept=False,
                            intercept_origin=None,
                            intercept_removed=True,
                            terms=[])
def _eval_one(evaluator, tree):
    """Evaluate a literal ``1``: no terms, intercept present."""
    return IntermediateExpr(intercept=True,
                            intercept_origin=tree.origin,
                            intercept_removed=False,
                            terms=[])
def _eval_number(evaluator, tree):
    """Reject a bare number: always raises :class:`PatsyError`."""
    message = ("numbers besides '0' and '1' are "
               "only allowed with **")
    raise PatsyError(message, tree)
def _eval_python_expr(evaluator, tree):
    """Wrap a Python-expression token as a single one-factor term."""
    code = tree.token.extra
    factor = EvalFactor(code, origin=tree.origin)
    single_term = Term([factor])
    return IntermediateExpr(False, None, False, [single_term])
class Evaluator(object):
    """Evaluates a formula parse tree by dispatching on node type and arity.

    Handlers are stored in a table keyed by ``(node type, number of args)``
    and are called as ``handler(evaluator, tree)``.
    """
    def __init__(self):
        self._evaluators = {}
        builtin_ops = [
            ("~", 2, _eval_any_tilde),
            ("~", 1, _eval_any_tilde),

            ("+", 2, _eval_binary_plus),
            ("-", 2, _eval_binary_minus),
            ("*", 2, _eval_binary_prod),
            ("/", 2, _eval_binary_div),
            (":", 2, _eval_binary_interact),
            ("**", 2, _eval_binary_power),

            ("+", 1, _eval_unary_plus),
            ("-", 1, _eval_unary_minus),

            ("ZERO", 0, _eval_zero),
            ("ONE", 0, _eval_one),
            ("NUMBER", 0, _eval_number),
            ("PYTHON_EXPR", 0, _eval_python_expr),
        ]
        for op, arity, handler in builtin_ops:
            self.add_op(op, arity, handler)

        # Not used by Patsy -- provided for the convenience of eventual
        # user-defined operators.
        self.stash = {}

    # This should not be considered a public API yet (to use for actually
    # adding new operator semantics) because I wrote in some of the relevant
    # code sort of speculatively, but it isn't actually tested.
    def add_op(self, op, arity, evaluator):
        """Register *evaluator* as the handler for *op* with *arity* args."""
        self._evaluators[op, arity] = evaluator

    def eval(self, tree, require_evalexpr=True):
        """Evaluate the parse node *tree* and return the handler's result.

        When *require_evalexpr* is true, the result must be an
        :class:`IntermediateExpr`; anything else raises
        :class:`PatsyError`. An unknown (type, arity) combination also
        raises :class:`PatsyError`.
        """
        assert isinstance(tree, ParseNode)
        key = (tree.type, len(tree.args))
        if key not in self._evaluators:
            raise PatsyError("I don't know how to evaluate this "
                             "'%s' operator" % (tree.type,),
                             tree.token)
        result = self._evaluators[key](self, tree)
        if not require_evalexpr or isinstance(result, IntermediateExpr):
            return result
        # From here on the result has an unacceptable type.
        if isinstance(result, ModelDesc):
            raise PatsyError("~ can only be used once, and "
                             "only at the top level",
                             tree)
        raise PatsyError("custom operator returned an "
                         "object that I don't know how to "
                         "handle", tree)
412#############
# Table of formula strings and the termlists they are expected to evaluate
# to; consumed by _do_eval_formula_tests.
#
# Each value is either
#   (rhs_intercept, rhs_terms)
# or
#   (lhs_intercept, lhs_terms, rhs_intercept, rhs_terms)
# where the two-element form is shorthand for a left-hand side of
# (False, []). The *_intercept entries say whether the intercept is expected
# at the front of that termlist. Each entry of a *_terms list is either a
# string (a term with that single factor) or a tuple of strings (an
# interaction between those factors).
_eval_tests = {
    "": (True, []),
    " ": (True, []),
    " \n ": (True, []),
    "a": (True, ["a"]),

    "1": (True, []),
    "0": (False, []),
    "- 1": (False, []),
    "- 0": (True, []),
    "+ 1": (True, []),
    "+ 0": (False, []),
    "0 + 1": (True, []),
    "1 + 0": (False, []),
    "1 - 0": (True, []),
    "0 - 1": (False, []),

    "1 + a": (True, ["a"]),
    "0 + a": (False, ["a"]),
    "a - 1": (False, ["a"]),
    "a - 0": (True, ["a"]),
    "1 - a": (True, []),

    "a + b": (True, ["a", "b"]),
    "(a + b)": (True, ["a", "b"]),
    "a + ((((b))))": (True, ["a", "b"]),
    "a + ((((+b))))": (True, ["a", "b"]),
    "a + ((((b - a))))": (True, ["a", "b"]),

    "a + a + a": (True, ["a"]),

    "a + (b - a)": (True, ["a", "b"]),

    "a + np.log(a, base=10)": (True, ["a", "np.log(a, base=10)"]),
    # Note different spacing:
    "a + np.log(a, base=10) - np . log(a , base = 10)": (True, ["a"]),

    "a + (I(b) + c)": (True, ["a", "I(b)", "c"]),
    "a + I(b + c)": (True, ["a", "I(b + c)"]),

    "a:b": (True, [("a", "b")]),
    "a:b:a": (True, [("a", "b")]),
    "a:(b + c)": (True, [("a", "b"), ("a", "c")]),
    "(a + b):c": (True, [("a", "c"), ("b", "c")]),
    "a:(b - c)": (True, [("a", "b")]),
    "c + a:c + a:(b - c)": (True, ["c", ("a", "c"), ("a", "b")]),
    "(a - b):c": (True, [("a", "c")]),
    "b + b:c + (a - b):c": (True, ["b", ("b", "c"), ("a", "c")]),

    "a:b - a:b": (True, []),
    "a:b - b:a": (True, []),

    "1 - (a + b)": (True, []),
    "a + b - (a + b)": (True, []),

    "a * b": (True, ["a", "b", ("a", "b")]),
    "a * b * a": (True, ["a", "b", ("a", "b")]),
    "a * (b + c)": (True, ["a", "b", "c", ("a", "b"), ("a", "c")]),
    "(a + b) * c": (True, ["a", "b", "c", ("a", "c"), ("b", "c")]),
    "a * (b - c)": (True, ["a", "b", ("a", "b")]),
    "c + a:c + a * (b - c)": (True, ["c", ("a", "c"), "a", "b", ("a", "b")]),
    "(a - b) * c": (True, ["a", "c", ("a", "c")]),
    "b + b:c + (a - b) * c": (True, ["b", ("b", "c"), "a", "c", ("a", "c")]),

    "a/b": (True, ["a", ("a", "b")]),
    "(a + b)/c": (True, ["a", "b", ("a", "b", "c")]),
    "b + b:c + (a - b)/c": (True, ["b", ("b", "c"), "a", ("a", "c")]),
    "a/(b + c)": (True, ["a", ("a", "b"), ("a", "c")]),

    "a ** 2": (True, ["a"]),
    "(a + b + c + d) ** 2": (True, ["a", "b", "c", "d",
                                    ("a", "b"), ("a", "c"), ("a", "d"),
                                    ("b", "c"), ("b", "d"), ("c", "d")]),
    "(a + b + c + d) ** 3": (True, ["a", "b", "c", "d",
                                    ("a", "b"), ("a", "c"), ("a", "d"),
                                    ("b", "c"), ("b", "d"), ("c", "d"),
                                    ("a", "b", "c"), ("a", "b", "d"),
                                    ("a", "c", "d"), ("b", "c", "d")]),

    "a + +a": (True, ["a"]),

    "~ a + b": (True, ["a", "b"]),
    "~ a*b": (True, ["a", "b", ("a", "b")]),
    "~ a*b + 0": (False, ["a", "b", ("a", "b")]),
    "~ -1": (False, []),

    "0 ~ a + b": (True, ["a", "b"]),
    "1 ~ a + b": (True, [], True, ["a", "b"]),
    "y ~ a + b": (False, ["y"], True, ["a", "b"]),
    "0 + y ~ a + b": (False, ["y"], True, ["a", "b"]),
    "0 + y * z ~ a + b": (False, ["y", "z", ("y", "z")], True, ["a", "b"]),
    "-1 ~ 1": (False, [], True, []),
    "1 + y ~ a + b": (True, ["y"], True, ["a", "b"]),

    # Check precedence:
    "a + b * c": (True, ["a", "b", "c", ("b", "c")]),
    "a * b + c": (True, ["a", "b", ("a", "b"), "c"]),
    "a * b - a": (True, ["b", ("a", "b")]),
    "a + b / c": (True, ["a", "b", ("b", "c")]),
    "a / b + c": (True, ["a", ("a", "b"), "c"]),
    "a*b:c": (True, ["a", ("b", "c"), ("a", "b", "c")]),
    "a:b*c": (True, [("a", "b"), "c", ("a", "b", "c")]),

    # Intercept handling:
    "~ 1 + 1 + 0 + 1": (True, []),
    "~ 0 + 1 + 0": (False, []),
    "~ 0 - 1 - 1 + 0 + 1": (True, []),
    "~ 1 - 1": (False, []),
    "~ 0 + a + 1": (True, ["a"]),
    "~ 1 + (a + 0)": (True, ["a"]), # This is correct, but perhaps surprising!
    "~ 0 + (a + 1)": (True, ["a"]), # Also correct!
    "~ 1 - (a + 1)": (False, []),
}
# Formulas that are expected to be rejected with an error; they are handed to
# _parsing_error_test by test_eval_formula_error_reporting.
# <> mark off where the error should be reported:
_eval_error_tests = [
    "a <+>",
    "a + <(>",

    "b + <(-a)>",

    "a:<1>",
    "(a + <1>)*b",

    "a + <2>",
    "a + <1.0>",
    # eh, catching this is a hassle, we'll just leave the user some rope if
    # they really want it:
    #"a + <0x1>",

    "a ** <b>",
    "a ** <(1 + 1)>",
    "a ** <1.5>",

    "a + b <# asdf>",

    "<)>",
    "a + <)>",
    "<*> a",
    "a + <*>",

    "a + <foo[bar>",
    "a + <foo{bar>",
    "a + <foo(bar>",

    "a + <[bar>",
    "a + <{bar>",

    "a + <{bar[]>",

    "a + foo<]>bar",
    "a + foo[]<]>bar",
    "a + foo{}<}>bar",
    "a + foo<)>bar",

    "a + b<)>",
    "(a) <.>",

    "<(>a + b",

    "<y ~ a> ~ b",
    "y ~ <(a ~ b)>",
    "<~ a> ~ b",
    "~ <(a ~ b)>",

    "1 + <-(a + b)>",

    "<- a>",
    "a + <-a**2>",
]
585def _assert_terms_match(terms, expected_intercept, expecteds): # pragma: no cover
586 if expected_intercept:
587 expecteds = [()] + expecteds
588 assert len(terms) == len(expecteds)
589 for term, expected in zip(terms, expecteds):
590 if isinstance(term, Term):
591 if isinstance(expected, str):
592 expected = (expected,)
593 assert term.factors == tuple([EvalFactor(s) for s in expected])
594 else:
595 assert term == expected
def _do_eval_formula_tests(tests): # pragma: no cover
    """Evaluate every formula in *tests* and check the resulting termlists.

    *tests* maps a formula string to either ``(rhs_intercept, rhs_terms)``
    or ``(lhs_intercept, lhs_terms, rhs_intercept, rhs_terms)``.
    """
    for code, expected in six.iteritems(tests):
        if len(expected) == 2:
            # Short form: the left-hand side is expected to be empty.
            expected = (False, []) + expected
        model_desc = ModelDesc.from_formula(code)
        # Printed so that a failing case can be identified from the output.
        print(repr(code))
        print(expected)
        print(model_desc)
        lhs_intercept, lhs_termlist, rhs_intercept, rhs_termlist = expected
        checks = ((model_desc.lhs_termlist, lhs_intercept, lhs_termlist),
                  (model_desc.rhs_termlist, rhs_intercept, rhs_termlist))
        for actual, want_intercept, want_terms in checks:
            _assert_terms_match(actual, want_intercept, want_terms)
def test_eval_formula():
    """Run every case in the _eval_tests table."""
    _do_eval_formula_tests(tests=_eval_tests)
def test_eval_formula_error_reporting():
    """Check the error location reported for each bad formula."""
    from patsy.parse_formula import _parsing_error_test

    def parse_fn(formula):
        return ModelDesc.from_formula(formula)

    _parsing_error_test(parse_fn, _eval_error_tests)
def test_formula_factor_origin():
    """Factors built from a formula remember where in the string they were."""
    from patsy.origin import Origin
    formula = "a + b"
    desc = ModelDesc.from_formula("a + b")
    # rhs_termlist[0] is the implicit intercept, so "a" and "b" follow it.
    a_origin = desc.rhs_termlist[1].factors[0].origin
    assert a_origin == Origin(formula, 0, 1)
    b_origin = desc.rhs_termlist[2].factors[0].origin
    assert b_origin == Origin(formula, 4, 5)
625 == Origin("a + b", 4, 5))