Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/chameleon/parser.py : 22%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import re
2import logging
4try:
5 from collections import OrderedDict
6except ImportError:
7 from ordereddict import OrderedDict
9from .exc import ParseError
10from .namespaces import XML_NS
11from .tokenize import Token
# Finds a "--" that is not followed by optional extra hyphens and ">",
# i.e. a double hyphen other than the one closing a comment.
match_double_hyphen = re.compile(r'--(?!(-)*>)')

# Start of a start/end tag: "<" or "</", the (optionally prefixed) tag name
# and, when nothing but whitespace separates the name from it, the closing
# ">" or "/>" as ``suffix``.
match_tag_prefix_and_name = re.compile(
    r'^(?P<prefix></?)(?P<name>([^:\n\r ]+:)?[^ \n\t\r>/]+)'
    r'(?P<suffix>(?P<space>\s*)/?>)?',
    flags=re.UNICODE | re.DOTALL,
)

# One attribute, including its leading whitespace.  Exactly one of three
# value forms matches: quoted (``quote``/``value``), unquoted
# (``alt_value``) or no value at all (``simple_value``, always empty).
# NOTE(review): inside a raw string ``\\n`` is a literal backslash followed
# by "n", so the character class in the last alternative contains the
# characters space, backslash, "n", "t" and "r" rather than whitespace
# escapes -- confirm whether that is intended before touching it.
match_single_attribute = re.compile(
    r'(?P<space>\s+)(?!\d)'
    r'(?P<name>[^ =/>\n\t\r]+)'
    r'((?P<eq>\s*=\s*)'
    r'((?P<quote>[\'"])(?P<value>.*?)(?P=quote)|'
    r'(?P<alt_value>[^\s\'">/]+))|'
    r'(?P<simple_value>(?![ \\n\\t\\r]*=)))',
    flags=re.UNICODE | re.DOTALL,
)

# A complete comment token; ``text`` is everything between the delimiters.
match_comment = re.compile(r'^<!--(?P<text>.*)-->$', flags=re.DOTALL)
# A complete CDATA section; ``text`` is the content between "<![CDATA["
# and the "]]>" terminator.  The terminator needs both closing brackets:
# matching only "]>" left a stray "]" at the end of ``text``.
match_cdata = re.compile(
    r'^<!\[CDATA\[(?P<text>.*)\]\]>$', re.DOTALL)
# A "<!...>" declaration with no ">" inside it; ``text`` is its body.
match_declaration = re.compile(r'^<!(?P<text>[^>]+)>$', flags=re.DOTALL)

# A "<?name ...?>" processing instruction; ``text`` is matched lazily, so
# it ends at the first "?>".
match_processing_instruction = re.compile(
    r'^<\?(?P<name>\w+)(?P<text>.*?)\?>',
    flags=re.DOTALL,
)

# "<?xml" immediately followed by a space or a slash.
match_xml_declaration = re.compile(r'^<\?xml(?=[ /])', flags=re.DOTALL)

# Module-level logger.
log = logging.getLogger('chameleon.parser')
def substitute(regex, repl, token):
    """Run ``regex.sub(repl, token)`` and return the result as a ``Token``.

    A ``token`` that is not already a ``Token`` is first wrapped with
    ``Token(token)``.  The returned token carries the ``pos``, ``source``
    and ``filename`` of the (possibly wrapped) input token.
    """
    original = token if isinstance(token, Token) else Token(token)
    replaced = regex.sub(repl, original)
    return Token(replaced, original.pos, original.source, original.filename)
def groups(m, token):
    """Return the positional groups of match ``m`` as slices of ``token``.

    Works like ``m.groups()``, except that every group that took part in
    the match is re-cut from ``token`` using the group's span instead of
    using the text stored on the match object (presumably so that a string
    subclass passed as ``token`` is what ends up in the result -- confirm
    against callers).  Groups that did not participate stay ``None``.
    """
    sliced = []
    # Match groups are numbered from 1; group 0 is the whole match.
    for number, text in enumerate(m.groups(), 1):
        if text is None:
            sliced.append(None)
            continue
        start, stop = m.span(number)
        sliced.append(token[start:stop])
    return tuple(sliced)
def groupdict(m, token):
    """Return the named groups of match ``m`` as slices of ``token``.

    Starts from ``m.groupdict()`` and replaces every value that is not
    ``None`` with ``token[start:stop]`` for that group's span.  Groups that
    did not participate in the match keep the value ``None``.
    """
    named = m.groupdict()
    # Only values of existing keys are reassigned, so iterating the dict
    # while updating it is safe.
    for key in named:
        if named[key] is None:
            continue
        start, stop = m.span(key)
        named[key] = token[start:stop]
    return named
def match_tag(token, regex=match_tag_prefix_and_name):
    """Split a tag token into its prefix, name, attributes and suffix.

    ``regex`` is matched at the start of ``token``; its named groups form
    the initial result dictionary.  The remainder of the token is then
    scanned with ``match_single_attribute`` and each attribute found is
    appended to ``result['attrs']`` as a dictionary with the keys
    ``space``, ``name``, ``eq``, ``quote`` and ``value``:

    * for an unquoted value, ``quote`` is the empty string;
    * for an attribute without a value, ``eq``, ``quote`` and ``value``
      are all the empty string.

    When at least one attribute is found, ``result['suffix']`` is replaced
    by the text that follows the last attribute; otherwise the value
    captured by ``regex`` is kept.

    Raises ``ParseError`` if ``regex`` does not match ``token`` at all
    (for example ``<>``), instead of failing later with an
    ``AttributeError`` on the missing match object.
    """
    m = regex.match(token)
    if m is None:
        raise ParseError("Invalid tag.", token)

    d = groupdict(m, token)

    # Attributes are searched for in whatever follows the prefix/name match;
    # all spans below are relative to this remainder.
    token = token[m.end():]

    attrs = d['attrs'] = []
    for m in match_single_attribute.finditer(token):
        attr = groupdict(m, token)

        # Unquoted value: expose it under 'value' with an empty quote.
        alt_value = attr.pop('alt_value', None)
        if alt_value is not None:
            attr['value'] = alt_value
            attr['quote'] = ''

        # Attribute without any value: normalise the missing parts to ''.
        simple_value = attr.pop('simple_value', None)
        if simple_value is not None:
            attr['quote'] = ''
            attr['value'] = ''
            attr['eq'] = ''

        attrs.append(attr)

        # Whatever trails the most recent attribute closes the tag.
        d['suffix'] = token[m.end():]

    return d
def parse_tag(token, namespace, restricted_namespace):
    """Parse a tag token into a node dictionary with namespace information.

    The node is the result of ``match_tag(token)`` extended with:

    * ``'namespace'`` -- ``namespace.get(prefix, XML_NS)``, where ``prefix``
      is the part of the tag name before the first colon, or ``None`` when
      the name has no colon;
    * ``'ns_attrs'`` -- the result of ``unpack_attributes`` for the tag's
      attributes, using the value above as the default;
    * ``'ns_map'`` -- the ``namespace`` mapping itself (not a copy).

    ``namespace`` is updated in place with any ``xmlns`` declarations found
    on the tag before the prefix is looked up.
    """
    node = match_tag(token)
    attributes = node['attrs']

    # Declarations on the tag itself apply to the tag's own prefix.
    update_namespace(attributes, namespace)

    tag_name = node['name']
    prefix = tag_name.split(':')[0] if ':' in tag_name else None

    default = namespace.get(prefix, XML_NS)
    node['namespace'] = default
    node['ns_attrs'] = unpack_attributes(
        attributes, namespace, default, restricted_namespace
    )
    node['ns_map'] = namespace

    return node
def update_namespace(attributes, namespace):
    """Record the ``xmlns`` declarations of ``attributes`` in ``namespace``.

    ``namespace`` is modified in place: an attribute named ``xmlns`` sets
    the entry for the key ``None``; an attribute named ``xmlns:<prefix>``
    sets the entry for ``<prefix>``.  Other attributes are ignored.
    Nothing is returned.
    """
    # This runs as a separate pass because the declarations apply
    # irrespective of where they appear among the attributes.
    for attribute in attributes:
        name, value = attribute['name'], attribute['value']
        if name == 'xmlns':
            key = None
        elif name.startswith('xmlns:'):
            # Drop the six characters of "xmlns:".
            key = name[6:]
        else:
            continue
        namespace[key] = value
def unpack_attributes(attributes, namespace, default, restricted_namespace):
    """Map attributes to ``(namespace, local name)`` keys.

    Returns an ``OrderedDict`` whose keys are ``(ns, name)`` tuples and
    whose values are the attribute values, in the order of ``attributes``.

    * An attribute name without a colon is filed under ``default``.
    * For ``prefix:local``, the prefix (text before the first colon) is
      looked up in ``namespace``.  An unknown prefix raises ``KeyError``
      when ``restricted_namespace`` is true and falls back to ``default``
      otherwise.
    """
    namespaced = OrderedDict()

    for attribute in attributes:
        name = attribute['name']
        value = attribute['value']

        if ':' not in name:
            namespaced[default, name] = value
            continue

        prefix = name.split(':', 1)[0]
        # Everything after the first colon is the local name.
        name = name[len(prefix) + 1:]
        try:
            ns = namespace[prefix]
        except KeyError:
            if restricted_namespace:
                raise KeyError(
                    "Undefined namespace prefix: %s." % prefix)
            ns = default
        namespaced[ns, name] = value

    return namespaced
def identify(string):
    """Classify a token by its leading and trailing characters.

    Returns one of ``"text"``, ``"comment"``, ``"cdata"``,
    ``"declaration"``, ``"xml_declaration"``, ``"processing_instruction"``,
    ``"end_tag"``, ``"empty_tag"``, ``"start_tag"`` or ``"error"``.

    Anything that does not start with ``<`` is text.  A token that starts
    with ``<`` but matches none of the known shapes is an ``"error"``.

    Raises ``ParseError`` for a comment whose text after the opening
    ``<!--`` contains a double hyphen matched by ``match_double_hyphen``.
    """
    if not string.startswith("<"):
        return "text"

    if string.startswith("<!--"):
        # Search only the part after the opening "<!--".
        m = match_double_hyphen.search(string[4:])
        if m is None:
            return "comment"
        raise ParseError(
            "The string '--' is not allowed in a comment.",
            string[4 + m.start():4 + m.end()]
        )

    # Order matters: the more specific prefix must be tested first.
    by_prefix = (
        ("<![CDATA[", "cdata"),
        ("<!", "declaration"),
        ("<?xml", "xml_declaration"),
        ("<?", "processing_instruction"),
        ("</", "end_tag"),
    )
    for prefix, kind in by_prefix:
        if string.startswith(prefix):
            return kind

    # Remaining tokens are tags, told apart by how they end.
    by_ending = (
        ("/>", "empty_tag"),
        (">", "start_tag"),
    )
    for ending, kind in by_ending:
        if string.endswith(ending):
            return kind

    return "error"
class ElementParser(object):
    """Turn a stream of markup tokens into ``(kind, args)`` items.

    Iterating the parser consumes the whole token stream.  Every token is
    dispatched on the kind reported by ``identify`` to a ``visit_<kind>``
    method (``visit_default`` when there is none) and the returned item is
    queued.  When an end tag arrives, the queued start tag with the same
    name and everything queued after it are folded into one ``"element"``
    item ``(start_node, end_node, children)``.
    """

    def __init__(self, stream, default_namespaces, restricted_namespace=True):
        self.stream = stream
        self.restricted_namespace = restricted_namespace

        # Stack of namespace mappings, innermost scope last.  The defaults
        # are copied so the caller's mapping is left untouched.
        self.namespaces = [default_namespaces.copy()]

        # Items parsed so far that are not yet part of an element.
        self.queue = []

        # Open start tags as (name, position in ``queue``) pairs.
        self.index = []

    def __iter__(self):
        # The stream is parsed completely before the first item is
        # yielded, because an end tag rewrites earlier queue entries.
        for token in self.stream:
            self.queue.append(self.parse(token))

        return iter(self.queue)

    def parse(self, token):
        """Return the ``(kind, args)`` item for a single token."""
        kind = identify(token)
        handler = getattr(self, "visit_%s" % kind, self.visit_default)
        return handler(kind, token)

    def visit_comment(self, kind, token):
        return "comment", (token, )

    def visit_cdata(self, kind, token):
        return "cdata", (token, )

    def visit_default(self, kind, token):
        # Fallback for kinds without a dedicated visitor.
        return "default", (token, )

    def visit_processing_instruction(self, kind, token):
        m = match_processing_instruction.match(token)
        if m is not None:
            return "processing_instruction", (groupdict(m, token), )

        # Not a well-formed instruction: treat it like any other token.
        return self.visit_default(kind, token)

    def visit_text(self, kind, token):
        return kind, (token, )

    def visit_start_tag(self, kind, token):
        # A start tag opens a new scope that inherits the enclosing one.
        scope = self.namespaces[-1].copy()
        self.namespaces.append(scope)
        node = parse_tag(token, scope, self.restricted_namespace)

        # ``len(self.queue)`` is the slot this item will occupy once
        # ``__iter__`` appends it.
        self.index.append((node['name'], len(self.queue)))
        return kind, (node, )

    def visit_end_tag(self, kind, token):
        """Fold the matching start tag and its children into an element.

        Raises ``ParseError`` when no open start tag has the same name.
        """
        # NOTE(review): exactly one namespace scope is popped here, even
        # when the search below discards several open start tags.
        try:
            namespace = self.namespaces.pop()
        except IndexError:
            raise ParseError("Unexpected end tag.", token)

        node = parse_tag(token, namespace, self.restricted_namespace)
        closing_name = node['name']

        # Walk outwards through the open start tags until the names agree;
        # tags passed over on the way stay in the queue as children.
        while True:
            if not self.index:
                raise ParseError("Unexpected end tag.", token)
            name, pos = self.index.pop()
            if name == closing_name:
                break

        start, = self.queue.pop(pos)[1]
        children = self.queue[pos:]
        del self.queue[pos:]

        return "element", (start, node, children)

    def visit_empty_tag(self, kind, token):
        # Declarations on a self-closing tag must not leak to siblings, so
        # it is parsed against a throw-away copy of the current scope.
        scope = self.namespaces[-1].copy()
        node = parse_tag(token, scope, self.restricted_namespace)
        return "element", (node, None, [])

    def visit_xml_declaration(self, kind, token):
        # Handled exactly like a self-closing tag.
        return self.visit_empty_tag(kind, token)