Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/pandas/core/computation/expr.py : 40%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1""":func:`~pandas.eval` parsers
2"""
4import ast
5from functools import partial, reduce
6from keyword import iskeyword
7import tokenize
8from typing import Optional, Type
10import numpy as np
12import pandas.core.common as com
13from pandas.core.computation.ops import (
14 _LOCAL_TAG,
15 BinOp,
16 Constant,
17 Div,
18 FuncNode,
19 Op,
20 Term,
21 UnaryOp,
22 UndefinedVariableError,
23 _arith_ops_syms,
24 _bool_ops_syms,
25 _cmp_ops_syms,
26 _mathops,
27 _reductions,
28 _unary_ops_syms,
29 is_term,
30)
31from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string
32from pandas.core.computation.scope import Scope
34import pandas.io.formats.printing as printing
37def _rewrite_assign(tok):
38 """Rewrite the assignment operator for PyTables expressions that use ``=``
39 as a substitute for ``==``.
41 Parameters
42 ----------
43 tok : tuple of int, str
44 ints correspond to the all caps constants in the tokenize module
46 Returns
47 -------
48 t : tuple of int, str
49 Either the input or token or the replacement values
50 """
51 toknum, tokval = tok
52 return toknum, "==" if tokval == "=" else tokval
55def _replace_booleans(tok):
56 """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
57 precedence is changed to boolean precedence.
59 Parameters
60 ----------
61 tok : tuple of int, str
62 ints correspond to the all caps constants in the tokenize module
64 Returns
65 -------
66 t : tuple of int, str
67 Either the input or token or the replacement values
68 """
69 toknum, tokval = tok
70 if toknum == tokenize.OP:
71 if tokval == "&":
72 return tokenize.NAME, "and"
73 elif tokval == "|":
74 return tokenize.NAME, "or"
75 return toknum, tokval
76 return toknum, tokval
79def _replace_locals(tok):
80 """Replace local variables with a syntactically valid name.
82 Parameters
83 ----------
84 tok : tuple of int, str
85 ints correspond to the all caps constants in the tokenize module
87 Returns
88 -------
89 t : tuple of int, str
90 Either the input or token or the replacement values
92 Notes
93 -----
94 This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
95 ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
96 is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
97 """
98 toknum, tokval = tok
99 if toknum == tokenize.OP and tokval == "@":
100 return tokenize.OP, _LOCAL_TAG
101 return toknum, tokval
104def _compose2(f, g):
105 """Compose 2 callables"""
106 return lambda *args, **kwargs: f(g(*args, **kwargs))
109def _compose(*funcs):
110 """Compose 2 or more callables"""
111 assert len(funcs) > 1, "At least 2 callables must be passed to compose"
112 return reduce(_compose2, funcs)
def _preparse(
    source: str,
    f=_compose(
        _replace_locals, _replace_booleans, _rewrite_assign, clean_backtick_quoted_toks
    ),
):
    """
    Tokenize ``source``, rewrite every token with ``f`` and rebuild a string.

    Parameters
    ----------
    source : str
        A Python source code string
    f : callable
        This takes a tuple of (toknum, tokval) as its argument and returns a
        tuple with the same structure but possibly different elements. Defaults
        to the composition of ``_replace_locals``, ``_replace_booleans``,
        ``_rewrite_assign`` and ``clean_backtick_quoted_toks``; because of how
        ``_compose`` nests its arguments, ``clean_backtick_quoted_toks`` sees
        each token first and ``_replace_locals`` sees it last.

    Returns
    -------
    s : str
        The source string rebuilt by ``tokenize.untokenize`` from the
        rewritten tokens

    Notes
    -----
    The `f` parameter can be any callable that takes *and* returns input of the
    form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
    the ``tokenize`` module and ``tokval`` is a string.
    """
    assert callable(f), "f must be callable"
    rewritten_tokens = map(f, tokenize_string(source))
    return tokenize.untokenize(rewritten_tokens)
148def _is_type(t):
149 """Factory for a type checking function of type ``t`` or tuple of types."""
150 return lambda x: isinstance(x.value, t)
# predicates that test whether an object's ``value`` attribute is a list / a str
_is_list = _is_type(list)
_is_str = _is_type(str)
157# partition all AST nodes
158_all_nodes = frozenset(
159 filter(
160 lambda x: isinstance(x, type) and issubclass(x, ast.AST),
161 (getattr(ast, node) for node in dir(ast)),
162 )
163)
def _filter_nodes(superclass, all_nodes=_all_nodes):
    """
    Return the names of the AST node classes that subclass ``superclass``.

    Parameters
    ----------
    superclass : type
        Class (or tuple of classes) passed to ``issubclass``.
    all_nodes : iterable of type
        Candidate node classes; defaults to ``_all_nodes``.

    Returns
    -------
    frozenset of str
        ``__name__`` of every candidate for which ``issubclass`` is true.
    """
    matching = {cls.__name__ for cls in all_nodes if issubclass(cls, superclass)}
    return frozenset(matching)
# names of every AST node class, then one name set per abstract grammar
# category (each set holds the names of the classes deriving from that base)
_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes))
_mod_nodes = _filter_nodes(ast.mod)
_stmt_nodes = _filter_nodes(ast.stmt)
_expr_nodes = _filter_nodes(ast.expr)
_expr_context_nodes = _filter_nodes(ast.expr_context)
_slice_nodes = _filter_nodes(ast.slice)
_boolop_nodes = _filter_nodes(ast.boolop)
_operator_nodes = _filter_nodes(ast.operator)
_unary_op_nodes = _filter_nodes(ast.unaryop)
_cmp_op_nodes = _filter_nodes(ast.cmpop)
_comprehension_nodes = _filter_nodes(ast.comprehension)
_handler_nodes = _filter_nodes(ast.excepthandler)
_arguments_nodes = _filter_nodes(ast.arguments)
_keyword_nodes = _filter_nodes(ast.keyword)
_alias_nodes = _filter_nodes(ast.alias)


# nodes that we don't support directly but are needed for parsing
_hacked_nodes = frozenset(["Assign", "Module", "Expr"])


# expression-level node names that are explicitly rejected
_unsupported_expr_nodes = frozenset(
    [
        "Yield",
        "GeneratorExp",
        "IfExp",
        "DictComp",
        "SetComp",
        "Repr",
        "Lambda",
        "Set",
        "AST",
        "Is",
        "IsNot",
    ]
)

# these nodes are low priority or won't ever be supported (e.g., AST)
_unsupported_nodes = (
    _stmt_nodes
    | _mod_nodes
    | _handler_nodes
    | _arguments_nodes
    | _keyword_nodes
    | _alias_nodes
    | _expr_context_nodes
    | _unsupported_expr_nodes
) - _hacked_nodes

# we're adding a different assignment in some cases to be equality comparison
# and we don't want `stmt` and friends in there so get only the classes whose
# names are capitalized
_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
# import-time sanity check: a node name must never be both supported and
# unsupported
intersection = _unsupported_nodes & _base_supported_nodes
_msg = f"cannot both support and not support {intersection}"
assert not intersection, _msg
230def _node_not_implemented(node_name, cls):
231 """Return a function that raises a NotImplementedError with a passed node
232 name.
233 """
235 def f(self, *args, **kwargs):
236 raise NotImplementedError(f"{repr(node_name)} nodes are not implemented")
238 return f
def disallow(nodes):
    """
    Decorator to disallow certain nodes from parsing. Raises a
    NotImplementedError instead.

    For every name in ``nodes`` the decorated class receives a ``visit_<name>``
    method that raises, and the method names are recorded, in iteration order,
    in the class attribute ``unsupported_nodes`` (reset to an empty tuple
    first).

    Returns
    -------
    disallowed : callable
    """

    def disallowed(cls):
        cls.unsupported_nodes = ()
        for node_name in nodes:
            method_name = f"visit_{node_name}"
            raiser = _node_not_implemented(node_name, cls)
            cls.unsupported_nodes = cls.unsupported_nodes + (method_name,)
            setattr(cls, method_name, raiser)
        return cls

    return disallowed
262def _op_maker(op_class, op_symbol):
263 """Return a function to create an op class with its symbol already passed.
265 Returns
266 -------
267 f : callable
268 """
270 def f(self, node, *args, **kwargs):
271 """Return a partial function with an Op subclass with an operator
272 already passed.
274 Returns
275 -------
276 f : callable
277 """
278 return partial(op_class, op_symbol, *args, **kwargs)
280 return f
# maps the attribute-name prefix used by ``add_ops`` ("binary" / "unary") to
# the Op class its generated visitors construct
_op_classes = {"binary": BinOp, "unary": UnaryOp}
def add_ops(op_classes):
    """
    Decorator to add default implementation of ops.

    For each ``prefix -> op_class`` entry, the decorated class must define
    ``<prefix>_ops`` (operator symbols) and ``<prefix>_op_nodes_map`` (symbol
    to AST node name). Every symbol mapped to a node name gets a
    ``visit_<node name>`` method built by ``_op_maker``; symbols mapped to
    ``None`` are skipped.
    """

    def decorate(cls):
        for prefix, op_class in op_classes.items():
            symbols = getattr(cls, f"{prefix}_ops")
            node_for_symbol = getattr(cls, f"{prefix}_op_nodes_map")
            for symbol in symbols:
                node_name = node_for_symbol[symbol]
                if node_name is None:
                    continue
                setattr(cls, f"visit_{node_name}", _op_maker(op_class, symbol))
        return cls

    return decorate
@disallow(_unsupported_nodes)
@add_ops(_op_classes)
class BaseExprVisitor(ast.NodeVisitor):
    """
    Custom ast walker. Parsers of other engines should subclass this class
    if necessary.

    The ``add_ops`` decorator generates the ``visit_<operator node>`` methods
    from the ``*_ops`` / ``*_op_nodes_map`` attributes below, and ``disallow``
    then installs raising ``visit_*`` methods for every unsupported node.

    Parameters
    ----------
    env : Scope
    engine : str
    parser : str
    preparser : callable
    """

    const_type: Type[Term] = Constant
    term_type = Term

    binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms
    # positionally paired with ``binary_ops``; a ``None`` entry means that
    # ``add_ops`` generates no visitor for that symbol
    # NOTE(review): the ``None`` slot is presumably "/" (see ``visit_Div``
    # below) -- confirm against ``_arith_ops_syms``
    binary_op_nodes = (
        "Gt",
        "Lt",
        "GtE",
        "LtE",
        "Eq",
        "NotEq",
        "In",
        "NotIn",
        "BitAnd",
        "BitOr",
        "And",
        "Or",
        "Add",
        "Sub",
        "Mult",
        None,
        "Pow",
        "FloorDiv",
        "Mod",
    )
    binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))

    unary_ops = _unary_ops_syms
    unary_op_nodes = "UAdd", "USub", "Invert", "Not"
    unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))

    # comparison operators that are rewritten to membership tests when one
    # side is a string or a list (see ``_rewrite_membership_op``)
    rewrite_map = {
        ast.Eq: ast.In,
        ast.NotEq: ast.NotIn,
        ast.In: ast.In,
        ast.NotIn: ast.NotIn,
    }

    def __init__(self, env, engine, parser, preparser=_preparse):
        self.env = env
        self.engine = engine
        self.parser = parser
        self.preparser = preparser
        # name of the assignment target, set by ``visit_Assign``
        self.assigner = None

    def visit(self, node, **kwargs):
        """
        Dispatch ``node`` to the matching ``visit_<NodeType>`` method.

        A ``str`` is first run through the preparser and parsed into an AST.

        Raises
        ------
        SyntaxError
            If the preparsed string does not parse; when any
            whitespace-separated piece of it is a Python keyword the message
            is replaced by a more specific one.
        """
        if isinstance(node, str):
            clean = self.preparser(node)
            try:
                node = ast.fix_missing_locations(ast.parse(clean))
            except SyntaxError as e:
                if any(iskeyword(x) for x in clean.split()):
                    e.msg = "Python keyword not valid identifier in numexpr query"
                raise e

        method = "visit_" + type(node).__name__
        visitor = getattr(self, method)
        return visitor(node, **kwargs)

    def visit_Module(self, node, **kwargs):
        """Visit the single statement of a module; more or fewer is an error."""
        if len(node.body) != 1:
            raise SyntaxError("only a single expression is allowed")
        expr = node.body[0]
        return self.visit(expr, **kwargs)

    def visit_Expr(self, node, **kwargs):
        """Unwrap an expression statement and visit its value."""
        return self.visit(node.value, **kwargs)

    def _rewrite_membership_op(self, node, left, right):
        """
        Rewrite ``==``/``!=`` into ``in``/``not in`` when a side is a str or list.

        Returns
        -------
        tuple
            ``(op, op_instance, left, right)`` where ``op`` is the visited
            operator and ``left``/``right`` may have been replaced by
            temporaries holding one-element lists.
        """
        # the kind of the operator (is actually an instance)
        op_instance = node.op
        op_type = type(op_instance)

        # must be two terms and the comparison operator must be ==/!=/in/not in
        if is_term(left) and is_term(right) and op_type in self.rewrite_map:

            left_list, right_list = map(_is_list, (left, right))
            left_str, right_str = map(_is_str, (left, right))

            # if there are any strings or lists in the expression
            if left_list or right_list or left_str or right_str:
                op_instance = self.rewrite_map[op_type]()

            # pop the string variable out of locals and replace it with a list
            # of one string, kind of a hack
            if right_str:
                name = self.env.add_tmp([right.value])
                right = self.term_type(name, self.env)

            if left_str:
                name = self.env.add_tmp([left.value])
                left = self.term_type(name, self.env)

        op = self.visit(op_instance)
        return op, op_instance, left, right

    def _maybe_transform_eq_ne(self, node, left=None, right=None):
        """Visit any operand not supplied, then apply the membership rewrite."""
        if left is None:
            left = self.visit(node.left, side="left")
        if right is None:
            right = self.visit(node.right, side="right")
        op, op_class, left, right = self._rewrite_membership_op(node, left, right)
        return op, op_class, left, right

    def _maybe_downcast_constants(self, left, right):
        """
        Re-register a scalar operand as ``np.float32`` when the other operand
        is a non-scalar whose ``return_type`` is float32.
        """
        f32 = np.dtype(np.float32)
        if (
            left.is_scalar
            and hasattr(left, "value")
            and not right.is_scalar
            and right.return_type == f32
        ):
            # right is a float32 array, left is a scalar
            name = self.env.add_tmp(np.float32(left.value))
            left = self.term_type(name, self.env)
        if (
            right.is_scalar
            and hasattr(right, "value")
            and not left.is_scalar
            and left.return_type == f32
        ):
            # left is a float32 array, right is a scalar
            name = self.env.add_tmp(np.float32(right.value))
            right = self.term_type(name, self.env)

        return left, right

    def _maybe_eval(self, binop, eval_in_python):
        """Delegate to ``binop.evaluate`` with this visitor's settings."""
        # eval `in` and `not in` (for now) in "partial" python space
        # things that can be evaluated in "eval" space will be turned into
        # temporary variables. for example,
        # [1,2] in a + 2 * b
        # in that case a + 2 * b will be evaluated using numexpr, and the "in"
        # call will be evaluated using isin (in python space)
        return binop.evaluate(
            self.env, self.engine, self.parser, self.term_type, eval_in_python
        )

    def _maybe_evaluate_binop(
        self,
        op,
        op_class,
        lhs,
        rhs,
        eval_in_python=("in", "not in"),
        maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
    ):
        """
        Build ``op(lhs, rhs)`` and decide whether it must be evaluated eagerly.

        Raises
        ------
        TypeError
            If the resulting op reports ``has_invalid_return_type``.
        """
        res = op(lhs, rhs)

        if res.has_invalid_return_type:
            raise TypeError(
                f"unsupported operand type(s) for {res.op}:"
                f" '{lhs.type}' and '{rhs.type}'"
            )

        if self.engine != "pytables":
            # NOTE(review): ``and`` binds tighter than ``or``, so a datetime
            # ``rhs`` takes this branch for *any* operator while a datetime
            # ``lhs`` only does for comparison operators -- confirm that this
            # asymmetry is intended before regrouping
            if (
                res.op in _cmp_ops_syms
                and getattr(lhs, "is_datetime", False)
                or getattr(rhs, "is_datetime", False)
            ):
                # all date ops must be done in python bc numexpr doesn't work
                # well with NaT
                return self._maybe_eval(res, self.binary_ops)

        if res.op in eval_in_python:
            # "in"/"not in" ops are always evaluated in python
            return self._maybe_eval(res, eval_in_python)
        elif self.engine != "pytables":
            if (
                getattr(lhs, "return_type", None) == object
                or getattr(rhs, "return_type", None) == object
            ):
                # evaluate "==" and "!=" in python if either of our operands
                # has an object return type
                return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
        return res

    def visit_BinOp(self, node, **kwargs):
        """Visit both operands, apply rewrites/downcasts and build the op."""
        op, op_class, left, right = self._maybe_transform_eq_ne(node)
        left, right = self._maybe_downcast_constants(left, right)
        return self._maybe_evaluate_binop(op, op_class, left, right)

    def visit_Div(self, node, **kwargs):
        """Return a factory building a ``Div`` from two operands."""
        return lambda lhs, rhs: Div(lhs, rhs)

    def visit_UnaryOp(self, node, **kwargs):
        """Visit the operator and the operand, then apply one to the other."""
        op = self.visit(node.op)
        operand = self.visit(node.operand)
        return op(operand)

    def visit_Name(self, node, **kwargs):
        """Wrap an identifier in ``term_type``, forwarding extra keywords."""
        return self.term_type(node.id, self.env, **kwargs)

    def visit_NameConstant(self, node, **kwargs):
        """Wrap a name constant node's value in ``const_type``."""
        return self.const_type(node.value, self.env)

    def visit_Num(self, node, **kwargs):
        """Wrap a numeric literal in ``const_type``."""
        return self.const_type(node.n, self.env)

    def visit_Constant(self, node, **kwargs):
        """Wrap a generic constant literal in ``const_type``."""
        # ``value`` is the declared field of ``ast.Constant``; ``n`` is only
        # a deprecated alias for it
        return self.const_type(node.value, self.env)

    def visit_Str(self, node, **kwargs):
        """Store a string literal as a temporary and return a term for it."""
        name = self.env.add_tmp(node.s)
        return self.term_type(name, self.env)

    def visit_List(self, node, **kwargs):
        """Evaluate every element, store the list as a temporary, return a term."""
        name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
        return self.term_type(name, self.env)

    # tuples are handled exactly like lists
    visit_Tuple = visit_List

    def visit_Index(self, node, **kwargs):
        """ df.index[4] """
        return self.visit(node.value)

    def visit_Subscript(self, node, **kwargs):
        """Evaluate ``value[slice]`` eagerly and return a term for the result."""
        # local import of the top-level package
        # NOTE(review): presumably deferred to avoid a circular import at
        # module load -- confirm
        import pandas as pd

        value = self.visit(node.value)
        slobj = self.visit(node.slice)
        result = pd.eval(
            slobj, local_dict=self.env, engine=self.engine, parser=self.parser
        )
        try:
            # a Term instance
            v = value.value[result]
        except AttributeError:
            # an Op instance
            lhs = pd.eval(
                value, local_dict=self.env, engine=self.engine, parser=self.parser
            )
            v = lhs[result]
        name = self.env.add_tmp(v)
        return self.term_type(name, env=self.env)

    def visit_Slice(self, node, **kwargs):
        """ df.index[slice(4,6)] """
        lower = node.lower
        if lower is not None:
            lower = self.visit(lower).value
        upper = node.upper
        if upper is not None:
            upper = self.visit(upper).value
        step = node.step
        if step is not None:
            step = self.visit(step).value

        return slice(lower, upper, step)

    def visit_Assign(self, node, **kwargs):
        """
        support a single assignment node, like

        c = a + b

        set the assigner at the top level, must be a Name node which
        might or might not exist in the resolvers

        Raises
        ------
        SyntaxError
            For multiple targets, a non-name target, or a target whose name
            resolves to ``None``.
        ValueError
            If the environment has no target object to assign into.
        """

        if len(node.targets) != 1:
            raise SyntaxError("can only assign a single expression")
        if not isinstance(node.targets[0], ast.Name):
            raise SyntaxError("left hand side of an assignment must be a single name")
        if self.env.target is None:
            raise ValueError("cannot assign without a target object")

        try:
            assigner = self.visit(node.targets[0], **kwargs)
        except UndefinedVariableError:
            # the target does not exist yet: fall back to its bare identifier
            assigner = node.targets[0].id

        self.assigner = getattr(assigner, "name", assigner)
        if self.assigner is None:
            raise SyntaxError(
                "left hand side of an assignment must be a single resolvable name"
            )

        return self.visit(node.value, **kwargs)

    def visit_Attribute(self, node, **kwargs):
        """
        Resolve ``value.attr`` eagerly and return a term for the attribute.

        Raises
        ------
        ValueError
            If the node is not in a load context, or the attribute cannot be
            resolved.
        """
        attr = node.attr
        value = node.value

        ctx = node.ctx
        if isinstance(ctx, ast.Load):
            # resolve the value
            resolved = self.visit(value).value
            try:
                v = getattr(resolved, attr)
                name = self.env.add_tmp(v)
                return self.term_type(name, self.env)
            except AttributeError:
                # something like datetime.datetime where scope is overridden
                if isinstance(value, ast.Name) and value.id == attr:
                    return resolved

        # ``ctx`` is an instance (e.g. ``ast.Load()``), so the class name has
        # to be read from its type
        raise ValueError(f"Invalid Attribute context {type(ctx).__name__}")

    def visit_Call(self, node, side=None, **kwargs):
        """
        Evaluate a function or method call.

        A ``FuncNode`` is called with the visited argument nodes; any other
        callable is called eagerly with the arguments' values and its result
        is wrapped in ``const_type``.

        Raises
        ------
        TypeError
            If the callee is neither a name nor an attribute, or if keyword
            arguments are passed to a ``FuncNode``.
        ValueError
            For an unresolvable callee or a malformed keyword.
        """

        if isinstance(node.func, ast.Attribute):
            res = self.visit_Attribute(node.func)
        elif not isinstance(node.func, ast.Name):
            raise TypeError("Only named functions are supported")
        else:
            try:
                res = self.visit(node.func)
            except UndefinedVariableError:
                # Check if this is a supported function name
                try:
                    res = FuncNode(node.func.id)
                except ValueError:
                    # propagate unchanged: the bare ``raise`` re-raises this
                    # ValueError, with the UndefinedVariableError kept as its
                    # context
                    raise

        if res is None:
            raise ValueError(f"Invalid function call {node.func.id}")
        if hasattr(res, "value"):
            res = res.value

        if isinstance(res, FuncNode):

            new_args = [self.visit(arg) for arg in node.args]

            if node.keywords:
                raise TypeError(
                    f'Function "{res.name}" does not support keyword arguments'
                )

            return res(*new_args, **kwargs)

        else:

            new_args = [self.visit(arg).value for arg in node.args]

            for key in node.keywords:
                if not isinstance(key, ast.keyword):
                    raise ValueError(f"keyword error in function call '{node.func.id}'")

                # keywords without a name are skipped
                if key.arg:
                    kwargs[key.arg] = self.visit(key.value).value

            return self.const_type(res(*new_args, **kwargs), self.env)

    def translate_In(self, op):
        """Hook for subclasses; the base implementation returns ``op`` as is."""
        return op

    def visit_Compare(self, node, **kwargs):
        """
        Turn a comparison into binary ops.

        A single comparison becomes one ``BinOp``; a chained comparison is
        split into pairwise comparisons joined with ``and``.
        """
        ops = node.ops
        comps = node.comparators

        # base case: we have something like a CMP b
        if len(comps) == 1:
            op = self.translate_In(ops[0])
            binop = ast.BinOp(op=op, left=node.left, right=comps[0])
            return self.visit(binop)

        # recursive case: we have a chained comparison, a CMP b CMP c, etc.
        left = node.left
        values = []
        for op, comp in zip(ops, comps):
            new_node = self.visit(
                ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])
            )
            left = comp
            values.append(new_node)
        return self.visit(ast.BoolOp(op=ast.And(), values=values))

    def _try_visit_binop(self, bop):
        """Return ``bop`` unchanged if already an ``Op``/``Term``, else visit it."""
        if isinstance(bop, (Op, Term)):
            return bop
        return self.visit(bop)

    def visit_BoolOp(self, node, **kwargs):
        """Left-fold the operands of ``and``/``or`` into nested binary ops."""

        def visitor(x, y):
            lhs = self._try_visit_binop(x)
            rhs = self._try_visit_binop(y)

            op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs)
            return self._maybe_evaluate_binop(op, node.op, lhs, rhs)

        operands = node.values
        return reduce(visitor, operands)
# node names added to the disallowed set by both concrete visitors below
# (``PandasExprVisitor`` then subtracts some of them again)
_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"])
# union of the reduction and math function names imported from ``ops``
_numexpr_supported_calls = frozenset(_reductions + _mathops)
@disallow(
    (_unsupported_nodes | _python_not_supported)
    - (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"]))
)
class PandasExprVisitor(BaseExprVisitor):
    """
    Visitor registered for the ``"pandas"`` parser.

    Boolean operators, ``BoolOp``, ``Attribute``, ``In``, ``NotIn`` and
    ``Tuple`` nodes are removed from the disallowed set. The default
    preparser composes ``_replace_locals``, ``_replace_booleans`` and
    ``clean_backtick_quoted_toks``; unlike the ``_preparse`` default it does
    not include ``_rewrite_assign``.
    """

    def __init__(
        self,
        env,
        engine,
        parser,
        preparser=partial(
            _preparse,
            f=_compose(_replace_locals, _replace_booleans, clean_backtick_quoted_toks),
        ),
    ):
        super().__init__(env, engine, parser, preparser)
@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"]))
class PythonExprVisitor(BaseExprVisitor):
    """
    Visitor registered for the ``"python"`` parser.

    In addition to the common unsupported nodes it disallows ``Dict``,
    ``BoolOp``, ``In``, ``NotIn`` and ``Not``. The default preparser is the
    identity function, so the source string is parsed as written.
    """

    def __init__(self, env, engine, parser, preparser=lambda x: x):
        super().__init__(env, engine, parser, preparser=preparser)
class Expr:
    """
    Object encapsulating an expression.

    The expression is parsed on construction by the visitor registered for
    ``parser``; the parse result is kept in ``terms``.

    Parameters
    ----------
    expr : str
    engine : str, optional, default 'numexpr'
    parser : str, optional, default 'pandas'
    env : Scope, optional, default None
    level : int, optional, default 0
    """

    env: Scope
    engine: str
    parser: str

    def __init__(
        self,
        expr,
        engine: str = "numexpr",
        parser: str = "pandas",
        env: Optional[Scope] = None,
        level: int = 0,
    ):
        self.expr = expr
        # NOTE(review): the Scope is built inline with ``level + 1``;
        # presumably ``level`` is relative to the call stack, so keep this
        # construction in ``__init__`` itself -- confirm against Scope
        self.env = env if env else Scope(level=level + 1)
        self.engine = engine
        self.parser = parser
        visitor_cls = _parsers[parser]
        self._visitor = visitor_cls(self.env, self.engine, self.parser)
        self.terms = self.parse()

    @property
    def assigner(self):
        """Assignment target recorded by the visitor, or None if there is none."""
        visitor = self._visitor
        return getattr(visitor, "assigner", None)

    def __call__(self):
        """Evaluate the parsed terms against this expression's environment."""
        terms = self.terms
        return terms(self.env)

    def __repr__(self) -> str:
        return printing.pprint_thing(self.terms)

    def __len__(self) -> int:
        """Length of the original expression object."""
        return len(self.expr)

    def parse(self):
        """Parse an expression"""
        visitor = self._visitor
        return visitor.visit(self.expr)

    @property
    def names(self):
        """Get the names in an expression"""
        terms = self.terms
        if is_term(terms):
            return frozenset([terms.name])
        collected = (term.name for term in com.flatten(terms))
        return frozenset(collected)
# registry used by ``Expr`` to look up the visitor class for a parser name
_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor}