Package lxml :: Module doctestcompare
[hide private]
[frames | no frames]

Source Code for Module lxml.doctestcompare

  1  """ 
  2  lxml-based doctest output comparison. 
  3   
  4  To use this you must call ``lxmldoctest.install()``, which will cause 
  5  doctest to use this in all subsequent calls. 
  6   
  7  This changes the way output is checked and comparisons are made for 
  8  XML or HTML-like content. 
  9   
 10  XML or HTML content is noticed because the example starts with ``<`` 
 11  (it's HTML if it starts with ``<html``).  You can also use the 
 12  ``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing. 
 13   
 14  Some rough wildcard-like things are allowed.  Whitespace is generally 
 15  ignored (except in attributes).  In text (attributes and text in the 
 16  body) you can use ``...`` as a wildcard.  In an example it also 
 17  matches any trailing tags in the element, though it does not match 
 18  leading tags.  You may create a tag ``<any>`` or include an ``any`` 
 19  attribute in the tag.  An ``any`` tag matches any tag, while the 
 20  attribute matches any and all attributes. 
 21   
 22  When a match fails, the reformatted example and the actual output are 
 23  displayed (indented), and a rough diff-like output is given.  Anything 
 24  marked with ``-`` is in the output but wasn't supposed to be, and 
 25  similarly ``+`` means it's in the example but wasn't in the output. 
 26  """ 
 27   
 28  from lxml import etree 
 29  from lxml.html import document_fromstring 
 30  import re 
 31  import doctest 
 32  import cgi 
 33   
 34  __all__ = ['PARSE_HTML', 'PARSE_XML', 'LXMLOutputChecker', 
 35             'LHTMLOutputChecker', 'install', 'temp_install'] 
 36   
 37  PARSE_HTML = doctest.register_optionflag('PARSE_HTML') 
 38  PARSE_XML = doctest.register_optionflag('PARSE_XML') 
 39   
 40  OutputChecker = doctest.OutputChecker 
 41   
def strip(v):
    """Return ``v.strip()``, passing ``None`` through unchanged."""
    return None if v is None else v.strip()
47
def norm_whitespace(v):
    """Return *v* with every match of ``_norm_whitespace_re`` replaced by
    a single space."""
    collapsed = _norm_whitespace_re.sub(' ', v)
    return collapsed
# Matches a run of two or more spaces, tabs or newlines; a single
# whitespace character on its own is not matched.
_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')

# We use this to distinguish repr()s from elements: it matches text such
# as "<Foo object at 0x...>" or "<Element a at 0x...>".
_repr_re = re.compile(r'^<[^>]+ (at|object) ')
class LXMLOutputChecker(OutputChecker):
    """Doctest output checker that compares markup structurally.

    When both the expected and the actual output look like markup (or a
    ``PARSE_HTML`` / ``PARSE_XML`` option flag is set), both are parsed and
    the resulting element trees are compared; otherwise the stock doctest
    comparison is used.
    """

    # Tags that html_empty_tag() treats as having no content and no end tag
    # when formatting HTML.
    empty_tags = (
        'param', 'img', 'area', 'br', 'basefont', 'input',
        'base', 'meta', 'link', 'col')

    def get_default_parser(self):
        """Return the parser used for markup when no option flag forces one."""
        return etree.XML

    def check_output(self, want, got, optionflags):
        """Return True if *got* matches *want*.

        Falls back to the plain doctest comparison when no parser applies.
        Output that fails to parse never matches.
        """
        # NOTE: temp_install() copies this function's code object into
        # another function, so the only globals it may rely on are
        # ``etree`` and ``OutputChecker``.  Keep it that way.
        alt_self = getattr(self, '_temp_override_self', None)
        if alt_self is not None:
            super_method = self._temp_call_super_check_output
            self = alt_self
        else:
            super_method = OutputChecker.check_output
        parser = self.get_parser(want, got, optionflags)
        if not parser:
            return super_method(
                self, want, got, optionflags)
        try:
            want_doc = parser(want)
        except etree.XMLSyntaxError:
            return False
        try:
            got_doc = parser(got)
        except etree.XMLSyntaxError:
            return False
        return self.compare_docs(want_doc, got_doc)

    def get_parser(self, want, got, optionflags):
        """Pick the parser for *want*/*got*, or None for plain comparison.

        Option flags take precedence; otherwise the content is sniffed:
        HTML if both start with ``<html`` (case-insensitively), the default
        parser if both look like markup.
        """
        parser = None
        if PARSE_HTML & optionflags:
            parser = document_fromstring
        elif PARSE_XML & optionflags:
            parser = etree.XML
        elif (want.strip().lower().startswith('<html')
              and got.strip().lower().startswith('<html')):
            # Both sides are lower-cased so that e.g. ``<HTML>`` in the
            # actual output is sniffed the same way as in the example.
            parser = document_fromstring
        elif (self._looks_like_markup(want)
              and self._looks_like_markup(got)):
            parser = self.get_default_parser()
        return parser

    def _looks_like_markup(self, s):
        """Return True if *s* starts with ``<`` but is not an object repr."""
        s = s.strip()
        return (s.startswith('<')
                and not _repr_re.search(s))

    def compare_docs(self, want, got):
        """Recursively compare element *got* against the example *want*.

        A ``want`` tag named ``any`` matches any tag, and an ``any``
        attribute on ``want`` disables attribute comparison.  Text of
        ``...`` on a childless ``want`` element skips the child comparison.
        """
        if want.tag != got.tag and want.tag != 'any':
            return False
        if not self.text_compare(want.text, got.text, True):
            return False
        if not self.text_compare(want.tail, got.tail, True):
            return False
        if 'any' not in want.attrib:
            want_keys = sorted(want.attrib.keys())
            got_keys = sorted(got.attrib.keys())
            if want_keys != got_keys:
                return False
            for key in want_keys:
                if not self.text_compare(want.attrib[key], got.attrib[key], False):
                    return False
        if want.text != '...' or len(want):
            want_children = list(want)
            got_children = list(got)
            while want_children or got_children:
                # One side ran out of children before the other.
                if not want_children or not got_children:
                    return False
                want_first = want_children.pop(0)
                got_first = got_children.pop(0)
                if not self.compare_docs(want_first, got_first):
                    return False
                # A ``...`` tail ends the comparison once the actual
                # output has no further children.
                if not got_children and want_first.tail == '...':
                    break
        return True

    def text_compare(self, want, got, strip):
        """Return True if text *got* matches *want*; ``...`` is a wildcard.

        ``None`` is treated as the empty string.  With *strip* true, both
        texts are whitespace-normalised and stripped before comparing.
        """
        want = want or ''
        got = got or ''
        if strip:
            want = norm_whitespace(want).strip()
            got = norm_whitespace(got).strip()
        # Escape everything, then turn the escaped ``...`` back into a
        # wildcard.
        want = '^%s$' % re.escape(want)
        want = want.replace(r'\.\.\.', '.*')
        return bool(re.search(want, got))

    def output_difference(self, example, got, optionflags):
        """Return a description of how *got* differs from ``example.want``.

        For parsed markup this is the formatted expected and actual
        documents followed by a rough diff.  When no parser applies, or
        either side fails to parse, the stock doctest difference is used,
        preceded by any parse errors.
        """
        want = example.want
        parser = self.get_parser(want, got, optionflags)
        errors = []
        if parser is not None:
            try:
                want_doc = parser(want)
            except etree.XMLSyntaxError as e:
                errors.append('In example: %s' % e)
            try:
                got_doc = parser(got)
            except etree.XMLSyntaxError as e:
                errors.append('In actual output: %s' % e)
        if parser is None or errors:
            value = OutputChecker.output_difference(
                self, example, got, optionflags)
            if errors:
                errors.append(value)
                return '\n'.join(errors)
            else:
                return value
        html = parser is document_fromstring
        diff_parts = []
        diff_parts.append('Expected:')
        diff_parts.append(self.format_doc(want_doc, html, 2))
        diff_parts.append('Got:')
        diff_parts.append(self.format_doc(got_doc, html, 2))
        diff_parts.append('Diff:')
        diff_parts.append(self.collect_diff(want_doc, got_doc, html, 2))
        return '\n'.join(diff_parts)

    def html_empty_tag(self, el, html=True):
        """Return True if *el* is a content-less HTML tag from ``empty_tags``."""
        if not html:
            return False
        if el.tag not in self.empty_tags:
            return False
        if el.text or len(el):
            # This shouldn't happen (contents in an empty tag)
            return False
        return True

    def format_doc(self, doc, html, indent, prefix=''):
        """Return element *doc* pretty-printed, indented by *indent* spaces.

        *prefix* is inserted before the opening tag of *doc* itself (it is
        not propagated to children).
        """
        parts = []
        if not len(doc):
            # No children...
            parts.append(' '*indent)
            parts.append(prefix)
            parts.append(self.format_tag(doc))
            if not self.html_empty_tag(doc, html):
                if strip(doc.text):
                    parts.append(self.format_text(doc.text))
                parts.append(self.format_end_tag(doc))
            if strip(doc.tail):
                parts.append(self.format_text(doc.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(prefix)
        parts.append(self.format_tag(doc))
        if not self.html_empty_tag(doc, html):
            parts.append('\n')
            if strip(doc.text):
                parts.append(' '*indent)
                parts.append(self.format_text(doc.text))
                parts.append('\n')
            for el in doc:
                parts.append(self.format_doc(el, html, indent+2))
            parts.append(' '*indent)
            parts.append(self.format_end_tag(doc))
            parts.append('\n')
        if strip(doc.tail):
            parts.append(' '*indent)
            parts.append(self.format_text(doc.tail))
            parts.append('\n')
        return ''.join(parts)

    def format_text(self, text, strip=True):
        """Return *text* escaped for display, optionally stripped first."""
        if text is None:
            return ''
        if strip:
            text = text.strip()
        return cgi.escape(text, 1)

    def format_tag(self, el):
        """Return the opening tag of *el*, with attributes sorted by name."""
        attrs = []
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '<!--'
        for name, value in sorted(el.attrib.items()):
            attrs.append('%s="%s"' % (name, self.format_text(value, False)))
        if not attrs:
            return '<%s>' % el.tag
        return '<%s %s>' % (el.tag, ' '.join(attrs))

    def format_end_tag(self, el):
        """Return the closing tag of *el* (``-->`` for comments)."""
        if isinstance(el, etree.CommentBase):
            # FIXME: probably PIs should be handled specially too?
            return '-->'
        return '</%s>' % el.tag

    def collect_diff(self, want, got, html, indent):
        """Return a rough diff of element *got* against the example *want*.

        Children present only in the output are prefixed with ``-``, and
        children present only in the example with ``+``.
        """
        parts = []
        if not len(want) and not len(got):
            parts.append(' '*indent)
            parts.append(self.collect_diff_tag(want, got))
            if not self.html_empty_tag(got, html):
                parts.append(self.collect_diff_text(want.text, got.text))
                parts.append(self.collect_diff_end_tag(want, got))
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
            return ''.join(parts)
        parts.append(' '*indent)
        parts.append(self.collect_diff_tag(want, got))
        parts.append('\n')
        if strip(want.text) or strip(got.text):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.text, got.text))
            parts.append('\n')
        want_children = list(want)
        got_children = list(got)
        while want_children or got_children:
            if not want_children:
                parts.append(self.format_doc(got_children.pop(0), html, indent+2, '-'))
                continue
            if not got_children:
                parts.append(self.format_doc(want_children.pop(0), html, indent+2, '+'))
                continue
            parts.append(self.collect_diff(
                want_children.pop(0), got_children.pop(0), html, indent+2))
        parts.append(' '*indent)
        parts.append(self.collect_diff_end_tag(want, got))
        parts.append('\n')
        if strip(want.tail) or strip(got.tail):
            parts.append(' '*indent)
            parts.append(self.collect_diff_text(want.tail, got.tail))
            parts.append('\n')
        return ''.join(parts)

    def collect_diff_tag(self, want, got):
        """Return the opening tag for the diff of *got* against *want*.

        Attributes found only in the output are prefixed with ``-``, and
        attributes found only in the example with ``+``.
        """
        if want.tag != got.tag and want.tag != 'any':
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        attrs = []
        # An ``any`` tag or ``any`` attribute accepts whatever attributes
        # the output has.
        wildcard = want.tag == 'any' or 'any' in want.attrib
        for name, value in sorted(got.attrib.items()):
            if name not in want.attrib and not wildcard:
                attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
            else:
                if name in want.attrib:
                    # collect_diff_text() takes (want, got); ``value`` is
                    # the actual output's attribute value.
                    text = self.collect_diff_text(want.attrib[name], value, False)
                else:
                    text = self.format_text(value, False)
                attrs.append('%s="%s"' % (name, text))
        if not wildcard:
            for name, value in sorted(want.attrib.items()):
                if name in got.attrib:
                    continue
                attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
        if attrs:
            tag = '<%s %s>' % (tag, ' '.join(attrs))
        else:
            tag = '<%s>' % tag
        return tag

    def collect_diff_end_tag(self, want, got):
        """Return the closing tag for the diff of *got* against *want*."""
        # Treat the ``any`` wildcard tag the same way collect_diff_tag()
        # does, so opening and closing tags agree.
        if want.tag != got.tag and want.tag != 'any':
            tag = '%s (got: %s)' % (want.tag, got.tag)
        else:
            tag = got.tag
        return '</%s>' % tag

    def collect_diff_text(self, want, got, strip=True):
        """Return *got* if it matches *want*, else ``want (got: got)``.

        The result is escaped for display.
        """
        if self.text_compare(want, got, strip):
            if not got:
                return ''
            return self.format_text(got, strip)
        text = '%s (got: %s)' % (want, got)
        return self.format_text(text, strip)
325
class LHTMLOutputChecker(LXMLOutputChecker):
    """Output checker whose default markup parser is the HTML parser."""

    def get_default_parser(self):
        """Return the HTML document parser used when no flag forces one."""
        return document_fromstring
329
def install(html=False):
    """
    Install doctestcompare for all future doctests.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.
    """
    doctest.OutputChecker = LHTMLOutputChecker if html else LXMLOutputChecker
341
def temp_install(html=False, del_module=None):
    """
    Use this *inside* a doctest to enable this checker for this
    doctest only.

    If html is true, then by default the HTML parser will be used;
    otherwise the XML parser is used.

    ``del_module``, if given, is handed to ``_RestoreChecker``, which
    removes that module from ``sys.modules`` when the checker is restored.
    """
    Checker = LHTMLOutputChecker if html else LXMLOutputChecker
    frame = _find_doctest_frame()
    runner = frame.f_locals['self']
    replacement = Checker()
    previous = runner._checker
    runner._checker = replacement
    # The doctest-running function holds a local variable 'check' that is
    # a bound method of the output checker.  A frame's locals cannot be
    # modified, so the function object behind that method is altered in
    # place instead: its func_code is swapped for ours.  _RestoreChecker
    # then waits for __record_outcome to run (signalling the end of the
    # __run method) and puts the previous check_output implementation
    # back.
    check_func = frame.f_locals['check'].im_func
    # func_globals cannot be patched, so the one global that check_output
    # needs is made available in the doctest module instead:
    doctest.etree = etree
    _RestoreChecker(runner, previous, replacement,
                    check_func, replacement.check_output.im_func,
                    del_module)
375
class _RestoreChecker(object):
    """Temporarily swaps the code of a check function and hooks the doctest
    runner's ``__record_outcome`` so that everything is put back when that
    hook is next called."""

    def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
                 del_module):
        self.dt_self = dt_self
        self.checker = old_checker
        # Markers read by check_output() to redirect to the new checker.
        self.checker._temp_call_super_check_output = self.call_super
        self.checker._temp_override_self = new_checker
        self.check_func = check_func
        self.clone_func = clone_func
        self.del_module = del_module
        self.install_clone()
        self.install_dt_self()

    def install_clone(self):
        """Remember check_func's code and globals, then give it clone_func's code."""
        target = self.check_func
        self.func_code = target.func_code
        self.func_globals = target.func_globals
        target.func_code = self.clone_func.func_code

    def uninstall_clone(self):
        """Put the remembered code object back on check_func."""
        self.check_func.func_code = self.func_code

    def install_dt_self(self):
        """Replace the runner's ``__record_outcome`` with this object."""
        runner = self.dt_self
        self.prev_func = runner._DocTestRunner__record_outcome
        runner._DocTestRunner__record_outcome = self

    def uninstall_dt_self(self):
        """Restore the runner's previous ``__record_outcome``."""
        self.dt_self._DocTestRunner__record_outcome = self.prev_func

    def uninstall_module(self):
        """Remove ``del_module`` (if set) from ``sys.modules`` and from its
        parent package's attributes."""
        name = self.del_module
        if not name:
            return
        import sys
        del sys.modules[name]
        if '.' not in name:
            return
        package, _, module = name.rpartition('.')
        delattr(sys.modules[package], module)

    def __call__(self, *args, **kw):
        """Undo every temporary change, then delegate to the previous
        ``__record_outcome`` and return its result."""
        self.uninstall_clone()
        self.uninstall_dt_self()
        del self.checker._temp_override_self
        del self.checker._temp_call_super_check_output
        outcome = self.prev_func(*args, **kw)
        self.uninstall_module()
        return outcome

    def call_super(self, *args, **kw):
        """Call check_func with its original code, re-installing the clone
        afterwards even if the call raises."""
        self.uninstall_clone()
        try:
            return self.check_func(*args, **kw)
        finally:
            self.install_clone()
421
def _find_doctest_frame():
    """Return the nearest calling frame that has a local named ``BOOM``.

    Raises LookupError if no frame on the call stack has such a local.
    """
    import sys
    candidate = sys._getframe(1)
    while candidate:
        if 'BOOM' in candidate.f_locals:
            # Sign of doctest
            return candidate
        candidate = candidate.f_back
    raise LookupError(
        "Could not find doctest (only use this function *inside* a doctest)")
433