Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import re 

2import logging 

3 

4try: 

5 from collections import OrderedDict 

6except ImportError: 

7 from ordereddict import OrderedDict 

8 

9from .exc import ParseError 

10from .namespaces import XML_NS 

11from .tokenize import Token 

12 

# A '--' sequence that is not immediately followed by optional hyphens and
# '>' (i.e. one that is not part of the closing '-->').
match_double_hyphen = re.compile(r'--(?!(-)*>)')

# Leading '<' or '</', the (optionally prefixed) tag name and, when nothing
# else sits in between, the closing '>' or '/>' together with the whitespace
# in front of it.
match_tag_prefix_and_name = re.compile(
    r'^(?P<prefix></?)(?P<name>([^:\n\r ]+:)?[^ \n\t\r>/]+)'
    r'(?P<suffix>(?P<space>\s*)/?>)?',
    re.UNICODE | re.DOTALL)

# One attribute: leading whitespace, a name that does not start with a digit,
# then one of
#   * '=' followed by a quoted value ('value' / 'quote'),
#   * '=' followed by an unquoted value ('alt_value'),
#   * nothing at all ('simple_value', always empty) provided no '=' follows
#     after optional whitespace.
# The whitespace class in the final lookahead is written with single
# backslashes: in a raw string, '\\n' denotes a backslash followed by the
# letter 'n', which made the lookahead skip the letters n/t/r and backslashes
# instead of newlines, tabs and carriage returns.
match_single_attribute = re.compile(
    r'(?P<space>\s+)(?!\d)'
    r'(?P<name>[^ =/>\n\t\r]+)'
    r'((?P<eq>\s*=\s*)'
    r'((?P<quote>[\'"])(?P<value>.*?)(?P=quote)|'
    r'(?P<alt_value>[^\s\'">/]+))|'
    r'(?P<simple_value>(?![ \n\t\r]*=)))',
    re.UNICODE | re.DOTALL)

# A complete '<!-- ... -->' token; 'text' is everything in between.
match_comment = re.compile(
    r'^<!--(?P<text>.*)-->$', re.DOTALL)

# A complete '<![CDATA[ ... ]]>' token; 'text' is everything in between.
# The terminator is ']]>' (two closing brackets), so 'text' no longer ends
# with a stray ']' and a token closed by a single ']>' is not accepted.
match_cdata = re.compile(
    r'^<!\[CDATA\[(?P<text>.*)\]\]>$', re.DOTALL)

# A complete '<! ... >' token whose body contains no '>'.
match_declaration = re.compile(
    r'^<!(?P<text>[^>]+)>$', re.DOTALL)

# '<?name ... ?>'; 'name' is the target, 'text' the remainder up to the first
# '?>'.
match_processing_instruction = re.compile(
    r'^<\?(?P<name>\w+)(?P<text>.*?)\?>', re.DOTALL)

# '<?xml' followed by a space or a slash.
match_xml_declaration = re.compile(r'^<\?xml(?=[ /])', re.DOTALL)

log = logging.getLogger('chameleon.parser')

37 

38 

def substitute(regex, repl, token):
    """Run ``regex.sub(repl, ...)`` over ``token`` and return a new ``Token``.

    A ``token`` that is not already a ``Token`` instance is wrapped in one
    first.  The returned token is built from the substituted text and the
    ``pos``, ``source`` and ``filename`` attributes of the (possibly wrapped)
    input.
    """
    original = token if isinstance(token, Token) else Token(token)
    replaced = regex.sub(repl, original)
    return Token(replaced, original.pos, original.source, original.filename)

49 

50 

def groups(m, token):
    """Return the groups of match ``m`` as a tuple of slices of ``token``.

    Each group that took part in the match is replaced by ``token`` sliced
    at that group's span; groups that did not participate stay ``None``.
    """
    def cut(number):
        start, stop = m.span(number)
        return token[start:stop]

    # Group numbers are 1-based; group 0 (the whole match) is not included.
    return tuple(
        None if text is None else cut(number)
        for number, text in enumerate(m.groups(), 1)
    )

61 

62 

def groupdict(m, token):
    """Return the named groups of match ``m`` as slices of ``token``.

    The result maps every named group of the pattern to ``token`` sliced at
    the group's span, or to ``None`` when the group did not participate in
    the match.
    """
    sliced = {}
    for key, text in m.groupdict().items():
        if text is None:
            sliced[key] = None
            continue
        start, stop = m.span(key)
        sliced[key] = token[start:stop]
    return sliced

71 

72 

def match_tag(token, regex=match_tag_prefix_and_name):
    """Split a tag token into its prefix, name, attributes and suffix.

    ``regex`` is matched against the start of ``token``; its named groups
    (sliced out of ``token``) form the returned dictionary.  The text after
    that match is scanned with ``match_single_attribute`` and every hit is
    appended, as a dictionary, to the list stored under ``'attrs'``.  Each
    attribute dictionary has its ``alt_value`` and ``simple_value`` entries
    folded into ``value``, ``quote`` and ``eq``.

    When at least one attribute was found, ``'suffix'`` is replaced by
    whatever follows the last attribute.
    """
    head = regex.match(token)
    tag = groupdict(head, token)

    # Attributes are looked for only in the text after the tag name.
    remainder = token[head.end():]

    tag['attrs'] = parsed = []
    last = None
    for found in match_single_attribute.finditer(remainder):
        entry = groupdict(found, remainder)

        # Unquoted value: report it as a value with an empty quote.
        bare = entry.pop('alt_value', None)
        if bare is not None:
            entry['value'], entry['quote'] = bare, ''

        # Attribute without '=': everything but the name is empty.
        flag = entry.pop('simple_value', None)
        if flag is not None:
            entry.update(quote='', value='', eq='')

        parsed.append(entry)
        last = found

    if last is not None:
        tag['suffix'] = remainder[last.end():]

    return tag

96 

97 

def parse_tag(token, namespace, restricted_namespace):
    """Parse a tag token and resolve its namespaces.

    The dictionary produced by ``match_tag`` is extended with:

    ``namespace``
        The entry of ``namespace`` for the tag name's prefix (``None`` when
        the name has no prefix), falling back to ``XML_NS``.
    ``ns_attrs``
        The result of ``unpack_attributes`` for the tag's attributes.
    ``ns_map``
        The ``namespace`` mapping itself.

    ``namespace`` is updated in place with any ``xmlns`` declarations found
    on the tag before the lookups above take place.
    """
    node = match_tag(token)
    attributes = node['attrs']

    update_namespace(attributes, namespace)

    tag_name = node['name']
    prefix = tag_name.split(':')[0] if ':' in tag_name else None

    element_ns = namespace.get(prefix, XML_NS)
    node['namespace'] = element_ns
    node['ns_attrs'] = unpack_attributes(
        attributes, namespace, element_ns, restricted_namespace
    )
    node['ns_map'] = namespace

    return node

117 

118 

def update_namespace(attributes, namespace):
    """Record the ``xmlns`` declarations of ``attributes`` in ``namespace``.

    An attribute named ``xmlns`` sets the ``None`` entry; an attribute named
    ``xmlns:<prefix>`` sets the entry for ``<prefix>``.  Other attributes are
    ignored.  ``namespace`` is modified in place and nothing is returned.
    """
    # Done as a separate pass because declarations apply regardless of
    # where they appear among the attributes.
    declaration = 'xmlns:'
    for attribute in attributes:
        name = attribute['name']
        uri = attribute['value']
        if name.startswith(declaration):
            namespace[name[len(declaration):]] = uri
        elif name == 'xmlns':
            namespace[None] = uri

129 

130 

def unpack_attributes(attributes, namespace, default, restricted_namespace):
    """Map ``(namespace, local name)`` pairs to attribute values.

    For an attribute whose name contains a colon, the part before the first
    colon is looked up in ``namespace`` and the rest becomes the local name.
    Attributes without a colon use ``default`` as their namespace.

    An unknown prefix raises ``KeyError`` when ``restricted_namespace`` is
    true; otherwise ``default`` is used for it.

    Returns an ``OrderedDict`` in attribute order.
    """
    result = OrderedDict()

    for attribute in attributes:
        local = attribute['name']
        value = attribute['value']
        uri = default

        if ':' in local:
            prefix = local.split(':')[0]
            local = local[len(prefix) + 1:]
            try:
                uri = namespace[prefix]
            except KeyError:
                if restricted_namespace:
                    raise KeyError(
                        "Undefined namespace prefix: %s." % prefix)
                # Unrestricted: keep the default namespace set above.

        result[uri, local] = value

    return result

154 

155 

def identify(string):
    """Classify a token by its leading and trailing characters.

    Returns one of ``"text"``, ``"comment"``, ``"cdata"``,
    ``"declaration"``, ``"xml_declaration"``, ``"processing_instruction"``,
    ``"end_tag"``, ``"empty_tag"``, ``"start_tag"`` or ``"error"``.

    Raises ``ParseError`` for a comment in which ``match_double_hyphen``
    finds a match after the opening ``<!--``.
    """
    if not string.startswith("<"):
        return "text"

    if string.startswith("<!--"):
        hit = match_double_hyphen.search(string[4:])
        if hit is None:
            return "comment"
        # Offsets are relative to the text after '<!--', hence the shift.
        raise ParseError(
            "The string '--' is not allowed in a comment.",
            string[4 + hit.start():4 + hit.end()]
        )

    # Order matters: longer prefixes must be tried before the shorter ones
    # they start with.
    by_prefix = (
        ("<![CDATA[", "cdata"),
        ("<!", "declaration"),
        ("<?xml", "xml_declaration"),
        ("<?", "processing_instruction"),
        ("</", "end_tag"),
    )
    for prefix, kind in by_prefix:
        if string.startswith(prefix):
            return kind

    if string.endswith("/>"):
        return "empty_tag"
    if string.endswith(">"):
        return "start_tag"
    return "error"

182 

183 

class ElementParser(object):
    """Turn a stream of tokens into ``(kind, args)`` items.

    Every token is classified with ``identify`` and handed to the matching
    ``visit_<kind>`` method (``visit_default`` when there is none).  Start
    tags are remembered so that a later end tag with the same name can
    collapse everything in between into a single ``"element"`` item.
    """

    def __init__(self, stream, default_namespaces, restricted_namespace=True):
        self.restricted_namespace = restricted_namespace
        # Stack of namespace mappings; the bottom entry is a private copy of
        # the defaults.
        self.namespaces = [default_namespaces.copy()]
        # Open start tags as (name, position in ``queue``) pairs.
        self.index = []
        # Items produced so far.
        self.queue = []
        self.stream = stream

    def __iter__(self):
        # Items are appended one at a time because the visitors inspect and
        # rewrite ``self.queue`` while parsing.
        for token in self.stream:
            self.queue.append(self.parse(token))

        return iter(self.queue)

    def parse(self, token):
        """Classify ``token`` and return the item built by its visitor."""
        kind = identify(token)
        handler = getattr(self, "visit_" + kind, self.visit_default)
        return handler(kind, token)

    def visit_comment(self, kind, token):
        return "comment", (token, )

    def visit_cdata(self, kind, token):
        return "cdata", (token, )

    def visit_default(self, kind, token):
        return "default", (token, )

    def visit_processing_instruction(self, kind, token):
        """Return the instruction's named groups, or a default item when
        ``match_processing_instruction`` does not match."""
        found = match_processing_instruction.match(token)
        if found is not None:
            return "processing_instruction", (groupdict(found, token), )

        return self.visit_default(kind, token)

    def visit_text(self, kind, token):
        return kind, (token, )

    def visit_start_tag(self, kind, token):
        """Parse a start tag with a fresh copy of the current namespaces and
        remember where its item will sit in the queue."""
        scope = self.namespaces[-1].copy()
        self.namespaces.append(scope)
        node = parse_tag(token, scope, self.restricted_namespace)
        self.index.append((node['name'], len(self.queue)))
        return kind, (node, )

    def visit_end_tag(self, kind, token):
        """Close the most recent open start tag of the same name.

        The start tag's item and all items queued after it are removed from
        the queue and returned as ``("element", (start, end, children))``.
        Raises ``ParseError`` when the namespace stack is empty or no open
        start tag carries this name.
        """
        try:
            scope = self.namespaces.pop()
        except IndexError:
            raise ParseError("Unexpected end tag.", token)

        node = parse_tag(token, scope, self.restricted_namespace)

        queue = self.queue
        open_tags = self.index
        while open_tags:
            name, pos = open_tags.pop()
            if name != node['name']:
                # Not the tag being closed; its index entry is dropped.
                continue
            (start, ) = queue.pop(pos)[1]
            children = queue[pos:]
            del queue[pos:]
            return "element", (start, node, children)

        raise ParseError("Unexpected end tag.", token)

    def visit_empty_tag(self, kind, token):
        """Parse a self-closing tag into an element without end tag or
        children; the namespace stack is left untouched."""
        scope = self.namespaces[-1].copy()
        node = parse_tag(token, scope, self.restricted_namespace)
        return "element", (node, None, [])

    def visit_xml_declaration(self, kind, token):
        return self.visit_empty_tag(kind, token)