# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# Utilities for dealing with Python code at the token level.
#
# Includes:
#   a "pretty printer" that converts a sequence of tokens back into a
#       readable, white-space normalized string.
#   a utility function to replace calls to global functions with calls to
#       other functions
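#
# For a concrete flavor of the whitespace normalization (example taken from
# the tests below): normalize_token_spacing("1*(2+3**2)") returns
# "1 * (2 + 3 ** 2)".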

import tokenize
from six.moves import cStringIO as StringIO

from patsy import PatsyError
from patsy.origin import Origin

__all__ = ["python_tokenize", "pretty_untokenize",
           "normalize_token_spacing"]

# A convenience wrapper around tokenize.generate_tokens; yields tuples of
# (token type, token string, origin object).
def python_tokenize(code):
    # Since formulas can only contain Python expressions, and Python
    # expressions cannot meaningfully contain newlines, we'll just remove all
    # the newlines up front to avoid any complications:
    code = code.replace("\n", " ").strip()
    it = tokenize.generate_tokens(StringIO(code).readline)
    try:
        # Note: this rebinds 'code' to the source line the tokenizer reports
        # for each token; since all newlines were stripped above, that line is
        # the whole expression, so Origin offsets index into the full string.
        for (pytype, string, (_, start), (_, end), code) in it:
            if pytype == tokenize.ENDMARKER:
                break
            origin = Origin(code, start, end)
            assert pytype != tokenize.NL
            if pytype == tokenize.NEWLINE:
                assert string == ""
                continue
            if pytype == tokenize.ERRORTOKEN:
                raise PatsyError("error tokenizing input "
                                 "(maybe an unclosed string?)",
                                 origin)
            if pytype == tokenize.COMMENT:
                raise PatsyError("comments are not allowed", origin)
            yield (pytype, string, origin)
        else:  # pragma: no cover
            raise ValueError("stream ended without ENDMARKER?!?")
    except tokenize.TokenError as e:
        # TokenError is raised iff the tokenizer thinks that there is
        # some sort of multi-line construct in progress (e.g., an
        # unclosed parenthesis, which in Python lets a virtual line
        # continue past the end of the physical line), and it hits the
        # end of the source text. We have our own error handling for
        # such cases, so just treat this as an end-of-stream.
        #
        # Just in case someone adds some other error case:
        assert e.args[0].startswith("EOF in multi-line")
        return
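
# For illustration (a sketch, not part of the original comments): on the input
# "x + 1", python_tokenize yields roughly
#     (tokenize.NAME,   "x", Origin("x + 1", 0, 1))
#     (tokenize.OP,     "+", Origin("x + 1", 2, 3))
#     (tokenize.NUMBER, "1", Origin("x + 1", 4, 5))
# i.e. one (type, string, origin) triple per token, with each origin recording
# the token's character span within the original code string.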

def test_python_tokenize():
    code = "a + (foo * -1)"
    tokens = list(python_tokenize(code))
    expected = [(tokenize.NAME, "a", Origin(code, 0, 1)),
                (tokenize.OP, "+", Origin(code, 2, 3)),
                (tokenize.OP, "(", Origin(code, 4, 5)),
                (tokenize.NAME, "foo", Origin(code, 5, 8)),
                (tokenize.OP, "*", Origin(code, 9, 10)),
                (tokenize.OP, "-", Origin(code, 11, 12)),
                (tokenize.NUMBER, "1", Origin(code, 12, 13)),
                (tokenize.OP, ")", Origin(code, 13, 14))]
    assert tokens == expected

    code2 = "a + (b"
    tokens2 = list(python_tokenize(code2))
    expected2 = [(tokenize.NAME, "a", Origin(code2, 0, 1)),
                 (tokenize.OP, "+", Origin(code2, 2, 3)),
                 (tokenize.OP, "(", Origin(code2, 4, 5)),
                 (tokenize.NAME, "b", Origin(code2, 5, 6))]
    assert tokens2 == expected2

    from nose.tools import assert_raises
    assert_raises(PatsyError, list, python_tokenize("a b # c"))
    assert_raises(PatsyError, list, python_tokenize("a b \"c"))

_python_space_both = (list("+-*/%&^|<>")
                      + ["==", "<>", "!=", "<=", ">=",
                         "<<", ">>", "**", "//"])
_python_space_before = (_python_space_both
                        + ["!", "~"])
_python_space_after = (_python_space_both
                       + [",", ":"])
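
# How pretty_untokenize (below) consumes these lists: a token found in
# _python_space_before gets a space printed before it, and a token found in
# _python_space_after gets a space printed after it. So the binary operators
# in _python_space_both are spaced on both sides, "!" and "~" only on the
# left, and "," and ":" only on the right, subject to the special cases
# handled inside the function.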

def pretty_untokenize(typed_tokens):
    text = []
    prev_was_space_delim = False
    prev_wants_space = False
    prev_was_open_paren_or_comma = False
    prev_was_object_like = False
    brackets = []
    for token_type, token in typed_tokens:
        assert token_type not in (tokenize.INDENT, tokenize.DEDENT,
                                  tokenize.NL)
        if token_type == tokenize.NEWLINE:
            continue
        if token_type == tokenize.ENDMARKER:
            continue
        if token_type in (tokenize.NAME, tokenize.NUMBER, tokenize.STRING):
            if prev_wants_space or prev_was_space_delim:
                text.append(" ")
            text.append(token)
            prev_wants_space = False
            prev_was_space_delim = True
        else:
            if token in ("(", "[", "{"):
                brackets.append(token)
            elif brackets and token in (")", "]", "}"):
                brackets.pop()
            this_wants_space_before = (token in _python_space_before)
            this_wants_space_after = (token in _python_space_after)
            # Special case for slice syntax: foo[:10]
            # Otherwise ":" is spaced after, like: "{1: ...}", "if a: ..."
            if token == ":" and brackets and brackets[-1] == "[":
                this_wants_space_after = False
            # Special case for foo(*args), foo(a, *args):
            if token in ("*", "**") and prev_was_open_paren_or_comma:
                this_wants_space_before = False
                this_wants_space_after = False
            # Special case for "a = foo(b=1)":
            if token == "=" and not brackets:
                this_wants_space_before = True
                this_wants_space_after = True
            # Special case for unary -, +. Our heuristic is that if we see the
            # + or - after something that looks like an object (a NAME,
            # NUMBER, STRING, or close paren) then it is probably binary,
            # otherwise it is probably unary.
            if token in ("+", "-") and not prev_was_object_like:
                this_wants_space_before = False
                this_wants_space_after = False
            if prev_wants_space or this_wants_space_before:
                text.append(" ")
            text.append(token)
            prev_wants_space = this_wants_space_after
            prev_was_space_delim = False
        if (token_type in (tokenize.NAME, tokenize.NUMBER, tokenize.STRING)
            or token == ")"):
            prev_was_object_like = True
        else:
            prev_was_object_like = False
        prev_was_open_paren_or_comma = token in ("(", ",")
    return "".join(text)
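
# A minimal usage sketch (illustrative, not a documented pairing): the
# (type, string, origin) triples from python_tokenize can be fed back through
# pretty_untokenize by dropping the origins first, e.g.
#
#     pairs = [(pytype, string)
#              for (pytype, string, origin) in python_tokenize("1+x [ 2 ]")]
#     pretty_untokenize(pairs)   # -> "1 + x[2]"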

def normalize_token_spacing(code):
    tokens = [(t[0], t[1])
              for t in tokenize.generate_tokens(StringIO(code).readline)]
    return pretty_untokenize(tokens)

def test_pretty_untokenize_and_normalize_token_spacing():
    assert normalize_token_spacing("1 + 1") == "1 + 1"
    assert normalize_token_spacing("1+1") == "1 + 1"
    assert normalize_token_spacing("1*(2+3**2)") == "1 * (2 + 3 ** 2)"
    assert normalize_token_spacing("a and b") == "a and b"
    assert normalize_token_spacing("foo(a=bar.baz[1:])") == "foo(a=bar.baz[1:])"
    assert normalize_token_spacing("""{"hi":foo[:]}""") == """{"hi": foo[:]}"""
    assert normalize_token_spacing("""'a' "b" 'c'""") == """'a' "b" 'c'"""
    assert normalize_token_spacing('"""a""" is 1 or 2==3') == '"""a""" is 1 or 2 == 3'
    assert normalize_token_spacing("foo ( * args )") == "foo(*args)"
    assert normalize_token_spacing("foo ( a * args )") == "foo(a * args)"
    assert normalize_token_spacing("foo ( ** args )") == "foo(**args)"
    assert normalize_token_spacing("foo ( a ** args )") == "foo(a ** args)"
    assert normalize_token_spacing("foo (1, * args )") == "foo(1, *args)"
    assert normalize_token_spacing("foo (1, a * args )") == "foo(1, a * args)"
    assert normalize_token_spacing("foo (1, ** args )") == "foo(1, **args)"
    assert normalize_token_spacing("foo (1, a ** args )") == "foo(1, a ** args)"

    assert normalize_token_spacing("a=foo(b = 1)") == "a = foo(b=1)"

    assert normalize_token_spacing("foo(+ 10, bar = - 1)") == "foo(+10, bar=-1)"
    assert normalize_token_spacing("1 + +10 + -1 - 5") == "1 + +10 + -1 - 5"