Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/patsy/parse_formula.py : 22%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1 # This file is part of Patsy
2# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
3# See file LICENSE.txt for license information.
5# This file defines a parser for a simple language based on S/R "formulas"
6# (which are described in sections 2.3 and 2.4 in Chambers & Hastie, 1992). It
7# uses the machinery in patsy.parse_core to do the heavy-lifting -- its
8# biggest job is to handle tokenization.
10from __future__ import print_function
12__all__ = ["parse_formula"]
14# The Python tokenizer
15import tokenize
17import six
18from six.moves import cStringIO as StringIO
20from patsy import PatsyError
21from patsy.origin import Origin
22from patsy.infix_parser import Token, Operator, infix_parse, ParseNode
23from patsy.tokens import python_tokenize, pretty_untokenize
24from patsy.util import PushbackAdapter
# Token types that act as operands (leaves of the parse tree), as opposed
# to operators and parentheses; used by infix_parse and _compare_trees.
_atomic_token_types = ["PYTHON_EXPR", "ZERO", "ONE", "NUMBER"]
28def _is_a(f, v):
29 try:
30 f(v)
31 except ValueError:
32 return False
33 else:
34 return True
# Helper function for _tokenize_formula:
def _read_python_expr(it, end_tokens):
    """Consume one complete embedded Python expression from the pushback
    iterator ``it``, stopping at any unnested token in ``end_tokens``.

    Returns a single Token whose type classifies the expression text as
    "ZERO", "ONE", "NUMBER", or a generic "PYTHON_EXPR".
    """
    pytypes = []
    token_strings = []
    origins = []
    depth = 0
    for pytype, token_string, origin in it:
        assert depth >= 0
        if depth == 0 and token_string in end_tokens:
            # Leave the terminating token for our caller to handle.
            it.push_back((pytype, token_string, origin))
            break
        if token_string in ("(", "[", "{"):
            depth += 1
        elif token_string in (")", "]", "}"):
            depth -= 1
            if depth < 0:
                raise PatsyError("unmatched close bracket", origin)
        pytypes.append(pytype)
        token_strings.append(token_string)
        origins.append(origin)
    # We either found an end token or ran off the end of the string; in
    # the latter case any open bracket is unclosed.
    if depth != 0:
        raise PatsyError("unclosed bracket in embedded Python "
                         "expression",
                         Origin.combine(origins))
    expr_text = pretty_untokenize(zip(pytypes, token_strings))
    if expr_text == "0":
        token_type = "ZERO"
    elif expr_text == "1":
        token_type = "ONE"
    elif _is_a(int, expr_text) or _is_a(float, expr_text):
        token_type = "NUMBER"
    else:
        token_type = "PYTHON_EXPR"
    return Token(token_type, Origin.combine(origins), extra=expr_text)
def _tokenize_formula(code, operator_strings):
    """Yield formula Tokens for ``code``: parens, the given operator
    strings, and embedded Python expressions (via _read_python_expr)."""
    assert "(" not in operator_strings
    assert ")" not in operator_strings
    magic_token_types = {"(": Token.LPAREN, ")": Token.RPAREN}
    for op_string in operator_strings:
        magic_token_types[op_string] = op_string
    # Once we enter a Python expression, a ( opens nesting rather than
    # ending it, but any other "magic" token terminates the expression:
    end_tokens = set(magic_token_types) - set("(")
    it = PushbackAdapter(python_tokenize(code))
    for pytype, token_string, origin in it:
        if token_string not in magic_token_types:
            it.push_back((pytype, token_string, origin))
            yield _read_python_expr(it, end_tokens)
        else:
            yield Token(magic_token_types[token_string], origin)
def test__tokenize_formula():
    # Tokenize a representative formula and check type/origin/extra of
    # every token produced.
    code = "y ~ a + (foo(b,c + 2)) + -1 + 0 + 10"
    tokens = list(_tokenize_formula(code, ["+", "-", "~"]))
    expecteds = [("PYTHON_EXPR", Origin(code, 0, 1), "y"),
                 ("~", Origin(code, 2, 3), None),
                 ("PYTHON_EXPR", Origin(code, 4, 5), "a"),
                 ("+", Origin(code, 6, 7), None),
                 (Token.LPAREN, Origin(code, 8, 9), None),
                 ("PYTHON_EXPR", Origin(code, 9, 23), "foo(b, c + 2)"),
                 (Token.RPAREN, Origin(code, 23, 24), None),
                 ("+", Origin(code, 25, 26), None),
                 ("-", Origin(code, 27, 28), None),
                 ("ONE", Origin(code, 28, 29), "1"),
                 ("+", Origin(code, 30, 31), None),
                 ("ZERO", Origin(code, 32, 33), "0"),
                 ("+", Origin(code, 34, 35), None),
                 ("NUMBER", Origin(code, 36, 38), "10"),
                 ]
    for got, (exp_type, exp_origin, exp_extra) in zip(tokens, expecteds):
        assert isinstance(got, Token)
        assert got.type == exp_type
        assert got.origin == exp_origin
        assert got.extra == exp_extra
# The unary ~ is given its own name (it also appears in _default_ops
# below); presumably so other modules can refer to it directly — nothing
# else in this file uses it.
_unary_tilde = Operator("~", 1, -100)
# Default formula operators as Operator(token_type, arity, precedence).
# Lower precedence binds more loosely: ~ binds loosest of all, then
# + and -, then * and /, then :, then **.
_default_ops = [
    _unary_tilde,
    Operator("~", 2, -100),
    Operator("+", 2, 100),
    Operator("-", 2, 100),
    Operator("*", 2, 200),
    Operator("/", 2, 200),
    Operator(":", 2, 300),
    Operator("**", 2, 500),
    # Unary + and - (e.g. the "-1" in "y ~ x - 1"):
    Operator("+", 1, 100),
    Operator("-", 1, 100),
]
def parse_formula(code, extra_operators=()):
    """Parse a formula string into a ParseNode tree.

    :arg code: the formula, e.g. ``"y ~ a + b"``. An empty or
        whitespace-only string is treated as ``"~ 1"``.
    :arg extra_operators: optional iterable of extra Operator objects to
        recognize in addition to the defaults; each must have
        precedence >= 0.
    :returns: a ParseNode tree whose root is always a "~" node.
    :raises ValueError: if any extra operator has negative precedence.
    :raises PatsyError: on a malformed formula.
    """
    # NOTE: the default was a mutable list ([]); replaced with an
    # immutable tuple to follow the no-mutable-default-argument idiom.
    if not code.strip():
        code = "~ 1"
    for op in extra_operators:
        if op.precedence < 0:
            raise ValueError("all operators must have precedence >= 0")
    # list(...) so callers may pass any iterable, not just a list.
    operators = _default_ops + list(extra_operators)
    operator_strings = [op.token_type for op in operators]
    tree = infix_parse(_tokenize_formula(code, operator_strings),
                       operators,
                       _atomic_token_types)
    if not isinstance(tree, ParseNode) or tree.type != "~":
        # Guarantee callers always see a tree rooted at "~", even for
        # formulas like "a + b" that have no explicit tilde.
        tree = ParseNode("~", None, [tree], tree.origin)
    return tree
153#############
# Maps formula string -> expected parse tree, written as nested lists:
# [op_name, arg, arg, ...], with atomic tokens as plain strings.
# Consumed by _do_parse_test / _compare_trees.
_parser_tests = {
    "": ["~", "1"],
    " ": ["~", "1"],
    " \n ": ["~", "1"],
    "1": ["~", "1"],
    "a": ["~", "a"],
    "a ~ b": ["~", "a", "b"],
    "(a ~ b)": ["~", "a", "b"],
    "a ~ ((((b))))": ["~", "a", "b"],
    "a ~ ((((+b))))": ["~", "a", ["+", "b"]],
    "a + b + c": ["~", ["+", ["+", "a", "b"], "c"]],
    "a + (b ~ c) + d": ["~", ["+", ["+", "a", ["~", "b", "c"]], "d"]],
    "a + np.log(a, base=10)": ["~", ["+", "a", "np.log(a, base=10)"]],
    # Note different spacing:
    "a + np . log(a , base = 10)": ["~", ["+", "a", "np.log(a, base=10)"]],
    # Check precedence
    "a + b ~ c * d": ["~", ["+", "a", "b"], ["*", "c", "d"]],
    "a + b * c": ["~", ["+", "a", ["*", "b", "c"]]],
    "-a**2": ["~", ["-", ["**", "a", "2"]]],
    "-a:b": ["~", ["-", [":", "a", "b"]]],
    "a + b:c": ["~", ["+", "a", [":", "b", "c"]]],
    "(a + b):c": ["~", [":", ["+", "a", "b"], "c"]],
    "a*b:c": ["~", ["*", "a", [":", "b", "c"]]],
    "a+b / c": ["~", ["+", "a", ["/", "b", "c"]]],
    "~ a": ["~", "a"],
    "-1": ["~", ["-", "1"]],
    }
def _compare_trees(got, expected):
    """Recursively assert that ParseNode ``got`` matches ``expected``, a
    nested-list description ([op, arg, ...]) with atoms as strings."""
    assert isinstance(got, ParseNode)
    if not got.args:
        # Leaf: an atomic token whose text must match the expected string.
        assert got.type in _atomic_token_types
        assert got.token.extra == expected
        return
    assert got.type == expected[0]
    for child, expected_child in zip(got.args, expected[1:]):
        _compare_trees(child, expected_child)
def _do_parse_test(test_cases, extra_operators):
    # Parse every formula in test_cases and compare the resulting tree
    # against its expected nested-list description.
    for code, expected in six.iteritems(test_cases):
        tree = parse_formula(code, extra_operators=extra_operators)
        print(repr(code), repr(expected))
        print(tree)
        _compare_trees(tree, expected)
def test_parse_formula():
    # Run the full table of default-operator parser test cases.
    _do_parse_test(_parser_tests, [])
def test_parse_origin():
    # Every node in the tree should carry an Origin covering exactly the
    # characters it was parsed from.
    code = "a ~ b + c"
    tree = parse_formula(code)
    rhs = tree.args[1]
    assert tree.origin == Origin(code, 0, 9)
    assert tree.token.origin == Origin(code, 2, 3)
    assert tree.args[0].origin == Origin(code, 0, 1)
    assert rhs.origin == Origin(code, 4, 9)
    assert rhs.token.origin == Origin(code, 6, 7)
    assert rhs.args[0].origin == Origin(code, 4, 5)
    assert rhs.args[1].origin == Origin(code, 8, 9)
# <> mark off where the error should be reported:
# (the markers are stripped by _parsing_error_test before parsing, and
# their positions become the expected Origin start/end)
_parser_error_tests = [
    "a <+>",
    "a + <(>",
    "a + b <# asdf>",
    "<)>",
    "a + <)>",
    "<*> a",
    "a + <*>",
    "a + <foo[bar>",
    "a + <foo{bar>",
    "a + <foo(bar>",
    "a + <[bar>",
    "a + <{bar>",
    "a + <{bar[]>",
    "a + foo<]>bar",
    "a + foo[]<]>bar",
    "a + foo{}<}>bar",
    "a + foo<)>bar",
    "a + b<)>",
    "(a) <.>",
    "<(>a + b",
    "a +< >'foo", # Not the best placement for the error
]
254# Split out so it can also be used by tests of the evaluator (which also
255# raises PatsyError's)
256def _parsing_error_test(parse_fn, error_descs): # pragma: no cover
257 for error_desc in error_descs:
258 letters = []
259 start = None
260 end = None
261 for letter in error_desc:
262 if letter == "<":
263 start = len(letters)
264 elif letter == ">":
265 end = len(letters)
266 else:
267 letters.append(letter)
268 bad_code = "".join(letters)
269 assert start is not None and end is not None
270 print(error_desc)
271 print(repr(bad_code), start, end)
272 try:
273 parse_fn(bad_code)
274 except PatsyError as e:
275 print(e)
276 assert e.origin.code == bad_code
277 assert e.origin.start == start
278 assert e.origin.end == end
279 else:
280 assert False, "parser failed to report an error!"
def test_parse_errors(extra_operators=[]):
    # parse_formula should raise PatsyError at the <>-marked span of each
    # entry in _parser_error_tests.
    _parsing_error_test(
        lambda code: parse_formula(code, extra_operators=extra_operators),
        _parser_error_tests)
# Extra cases (same format as _parser_tests) exercising a user-supplied
# "|" operator; used by test_parse_extra_op.
_extra_op_parser_tests = {
    "a | b": ["~", ["|", "a", "b"]],
    "a * b|c": ["~", ["*", "a", ["|", "b", "c"]]],
    }
def test_parse_extra_op():
    # With an extra "|" operator registered, the default test suite must
    # still pass, the "|" cases must parse with the given precedence, and
    # error reporting must be unaffected.
    ops = [Operator("|", 2, 250)]
    _do_parse_test(_parser_tests,
                   extra_operators=ops)
    _do_parse_test(_extra_op_parser_tests,
                   extra_operators=ops)
    test_parse_errors(extra_operators=ops)