1 import difflib
2 from lxml import etree
3 from lxml.html import fragment_fromstring
4 import cgi
5 import re
6
7 __all__ = ['html_annotate', 'htmldiff']
8
9
10
11
12
13
def default_markup(text, version):
    return '<span title="%s">%s</span>' % (
        cgi.escape(unicode(version), 1), text)
17
def html_annotate(doclist, markup=default_markup):
    """
20 doclist should be ordered from oldest to newest, like::
21
22 >>> version1 = 'Hello World'
23 >>> version2 = 'Goodbye World'
24 >>> html_annotate([(version1, 'version 1'),
25 ... (version2, 'version 2')])
26 u'<span title="version 2">Goodbye</span> <span title="version 1">World</span>'
27
28 The documents must be *fragments* (str/UTF8 or unicode), not
29 complete documents
30
31 The markup argument is a function to markup the spans of words.
32 This function is called like markup('Hello', 'version 2'), and
33 returns HTML. The first argument is text and never includes any
34 markup. The default uses a span with a title:
35
36 >>> default_markup('Some Text', 'by Joe')
37 u'<span title="by Joe">Some Text</span>'
38 """
39
40
41
42
43
44 tokenlist = [tokenize_annotated(doc, version)
45 for doc, version in doclist]
46 cur_tokens = tokenlist[0]
47 for tokens in tokenlist[1:]:
48 html_annotate_merge_annotations(cur_tokens, tokens)
49 cur_tokens = tokens
50
51
52
53 cur_tokens = compress_tokens(cur_tokens)
54
55 result = markup_serialize_tokens(cur_tokens, markup)
56 return ''.join(result).strip()
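
# Illustrative sketch (not part of the original module): any callable with the
# same (text, version) signature as default_markup can be passed as ``markup``.
# The CSS class name "annot" below is made up for the example.
#
#   >>> def css_markup(text, version):
#   ...     return '<span class="annot" title="%s">%s</span>' % (
#   ...         cgi.escape(unicode(version), 1), text)
#   >>> html_annotate([('Hello World', 'v1'), ('Goodbye World', 'v2')],
#   ...               markup=css_markup)
#   u'<span class="annot" title="v2">Goodbye</span> <span class="annot" title="v1">World</span>'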
57
def tokenize_annotated(doc, annotation):
    """Tokenize a document and add an annotation attribute to each token
60 """
61 tokens = tokenize(doc, include_hrefs=False)
62 for tok in tokens:
63 tok.annotation = annotation
64 return tokens
65
def html_annotate_merge_annotations(tokens_old, tokens_new):
    """Merge the annotations from tokens_old into tokens_new, when the
68 tokens in the new document already existed in the old document.
69 """
70 s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
71 commands = s.get_opcodes()
72
73 for command, i1, i2, j1, j2 in commands:
74 if command == 'equal':
75 eq_old = tokens_old[i1:i2]
76 eq_new = tokens_new[j1:j2]
77 copy_annotations(eq_old, eq_new)
78
def copy_annotations(src, dest):
    """
81 Copy annotations from the tokens listed in src to the tokens in dest
82 """
83 assert len(src) == len(dest)
84 for src_tok, dest_tok in zip(src, dest):
85 dest_tok.annotation = src_tok.annotation
86
def compress_tokens(tokens):
    """
89 Combine adjacent tokens when there is no HTML between the tokens,
90 and they share an annotation
91 """
92 result = [tokens[0]]
93 for tok in tokens[1:]:
94 if (not result[-1].post_tags and
95 not tok.pre_tags and
96 result[-1].annotation == tok.annotation):
97 compress_merge_back(result, tok)
98 else:
99 result.append(tok)
100 return result
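
# Rough illustration (assumes the helpers defined below): two adjacent words
# carrying the same annotation, with no markup between them, collapse into one
# token.
#
#   >>> toks = tokenize_annotated('Hello World', 'v1')
#   >>> [unicode(t) for t in toks]
#   [u'Hello', u'World']
#   >>> [unicode(t) for t in compress_tokens(toks)]
#   [u'Hello World']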
101
def compress_merge_back(tokens, tok):
    """ Merge tok into the last element of tokens (modifying the list of
104 tokens in-place). """
105 last = tokens[-1]
106 if type(last) is not token or type(tok) is not token:
107 tokens.append(tok)
108 else:
109 text = unicode(last)
110 if last.trailing_whitespace:
111 text += ' '
112 text += tok
113 merged = token(text,
114 pre_tags=last.pre_tags,
115 post_tags=tok.post_tags,
116 trailing_whitespace=tok.trailing_whitespace)
117 merged.annotation = last.annotation
118 tokens[-1] = merged
119
def markup_serialize_tokens(tokens, markup_func):
    """
122 Serialize the list of tokens into a list of text chunks, calling
123 markup_func around text to add annotations.
124 """
125 for token in tokens:
126 for pre in token.pre_tags:
127 yield pre
128 html = token.html()
129 html = markup_func(html, token.annotation)
130 if token.trailing_whitespace:
131 html += ' '
132 yield html
133 for post in token.post_tags:
134 yield post
135
136
137
138
139
140
def htmldiff(old_html, new_html):
    """ Do a diff of the old and new document.  The documents are HTML
    *fragments* (str/UTF8 or unicode); they are not complete documents
    (i.e., no <html> tag).
145
146 Returns HTML with <ins> and <del> tags added around the
147 appropriate text.
148
149 Markup is generally ignored, with the markup from new_html
150 preserved, and possibly some markup from old_html (though it is
151 considered acceptable to lose some of the old markup). Only the
152 words in the HTML are diffed. The exception is <img> tags, which
153 are treated like words, and the href attribute of <a> tags, which
154 are noted inside the tag itself when there are changes.
155 """
156 old_html_tokens = tokenize(old_html)
157 new_html_tokens = tokenize(new_html)
158 result = htmldiff_tokens(old_html_tokens, new_html_tokens)
159 result = ''.join(result).strip()
160 return fixup_ins_del_tags(result)
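
# Example usage: diffing two one-paragraph fragments wraps the inserted word
# in <ins> and the removed word in <del> inside the surviving <p>, e.g.
# htmldiff('<p>Hello world</p>', '<p>Goodbye world</p>') returns roughly
# '<p><ins>Goodbye</ins> <del>Hello</del> world</p>' (exact whitespace may
# vary after fixup_ins_del_tags).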
161
def htmldiff_tokens(html1_tokens, html2_tokens):
    """ Does a diff on the tokens themselves, returning a list of text
164 chunks (not tokens).
165 """
166
167
168
169
170
171
172
173
174
175
176
177
178
179 s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
180 commands = s.get_opcodes()
181 result = []
182 for command, i1, i2, j1, j2 in commands:
183 if command == 'equal':
184 result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
185 continue
186 if command == 'insert' or command == 'replace':
187 ins_tokens = expand_tokens(html2_tokens[j1:j2])
188 merge_insert(ins_tokens, result)
189 if command == 'delete' or command == 'replace':
190 del_tokens = expand_tokens(html1_tokens[i1:i2])
191 merge_delete(del_tokens, result)
192
193
194
195
196 result = cleanup_delete(result)
197
198 return result
199
def expand_tokens(tokens, equal=False):
    """Given a list of tokens, return a generator of the chunks of
202 text for the data in the tokens.
203 """
204 for token in tokens:
205 for pre in token.pre_tags:
206 yield pre
207 if not equal or not token.hide_when_equal:
208 if token.trailing_whitespace:
209 yield token.html() + ' '
210 else:
211 yield token.html()
212 for post in token.post_tags:
213 yield post
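
# Sketch of the chunk stream for a simple fragment (assumes tokenize below):
#
#   >>> list(expand_tokens(tokenize('<b>Hi</b> there')))
#   ['<b>', u'Hi', '</b> ', u'there']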
214
def merge_insert(ins_chunks, doc):
    """ doc is the already-handled document (as a list of text chunks);
217 here we add <ins>ins_chunks</ins> to the end of that. """
218
219
220
221 unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
222 doc.extend(unbalanced_start)
223 if doc and not doc[-1].endswith(' '):
224
225
226 doc[-1] += ' '
227 doc.append('<ins>')
228 if balanced and balanced[-1].endswith(' '):
229
230 balanced[-1] = balanced[-1][:-1]
231 doc.extend(balanced)
232 doc.append('</ins> ')
233 doc.extend(unbalanced_end)
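
# Rough example of how merge_insert rewrites the document chunk list:
#
#   >>> doc = ['old text ']
#   >>> merge_insert(['<b>', 'new ', '</b>'], doc)
#   >>> doc
#   ['old text ', '<ins>', '<b>', 'new ', '</b>', '</ins> ']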
234
235
236
237
# Sentinel objects marking where a pending delete starts and ends;
# cleanup_delete() later resolves them into real <del> markup:
class DEL_START:
    pass
class DEL_END:
    pass


class NoDeletes(Exception):
    """ Raised when the document no longer contains any pending deletes
245 (DEL_START/DEL_END) """
246
def merge_delete(del_chunks, doc):
    """ Adds the text chunks in del_chunks to the document doc (another
249 list of text chunks) with marker to show it is a delete.
250 cleanup_delete later resolves these markers into <del> tags."""
251 doc.append(DEL_START)
252 doc.extend(del_chunks)
253 doc.append(DEL_END)
254
def cleanup_delete(chunks):
    """ Cleans up any DEL_START/DEL_END markers in the document, replacing
257 them with <del></del>. To do this while keeping the document
258 valid, it may need to drop some tags (either start or end tags).
259
260 It may also move the del into adjacent tags to try to move it to a
261 similar location where it was originally located (e.g., moving a
    delete into the preceding <div> tag, if the del looks like (DEL_START,
    'Text</div>', DEL_END)). """
264 while 1:
265
266
267
268 try:
269 pre_delete, delete, post_delete = split_delete(chunks)
270 except NoDeletes:
271
272 break
273
274
275 unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
276
277
278 locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
279 locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
280 doc = pre_delete
281 if doc and not doc[-1].endswith(' '):
282
283 doc[-1] += ' '
284 doc.append('<del>')
285 if balanced and balanced[-1].endswith(' '):
286
287 balanced[-1] = balanced[-1][:-1]
288 doc.extend(balanced)
289 doc.append('</del> ')
290 doc.extend(post_delete)
291 chunks = doc
292 return chunks
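
# Rough example: DEL_START/DEL_END markers produced by merge_delete are
# resolved into balanced <del> markup.
#
#   >>> cleanup_delete(['one ', DEL_START, 'two ', DEL_END, 'three'])
#   ['one ', '<del>', 'two', '</del> ', 'three']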
293
def split_unbalanced(chunks):
    """Return (unbalanced_start, balanced, unbalanced_end), where each is
296 a list of text and tag chunks.
297
298 unbalanced_start is a list of all the tags that are opened, but
299 not closed in this span. Similarly, unbalanced_end is a list of
300 tags that are closed but were not opened. Extracting these might
301 mean some reordering of the chunks."""
302 start = []
303 end = []
304 tag_stack = []
305 balanced = []
306 for chunk in chunks:
307 if not chunk.startswith('<'):
308 balanced.append(chunk)
309 continue
310 endtag = chunk[1] == '/'
311 name = chunk.split()[0].strip('<>/')
312 if name in empty_tags:
313 assert not endtag, (
314 "Empty tag %r should have no end tag" % chunk)
315 balanced.append(chunk)
316 continue
317 if endtag:
318 if tag_stack and tag_stack[-1][0] == name:
319 balanced.append(chunk)
320 name, pos, tag = tag_stack.pop()
321 balanced[pos] = tag
322 elif tag_stack:
323 start.extend(tag for name, pos, tag in tag_stack)
324 tag_stack = []
325 end.append(chunk)
326 else:
327 end.append(chunk)
328 else:
329 tag_stack.append((name, len(balanced), chunk))
330 balanced.append(None)
331 start.extend(
332 [chunk for name, pos, chunk in tag_stack])
333 balanced = [chunk for chunk in balanced if chunk is not None]
334 return start, balanced, end
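
# Illustration: the stray </div> has no matching open tag in this span, so it
# is reported separately from the balanced <p>...</p> chunks.
#
#   >>> split_unbalanced(['<p>', 'Hi', '</p>', '</div>'])
#   ([], ['<p>', 'Hi', '</p>'], ['</div>'])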
335
def split_delete(chunks):
    """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
338 stuff_after_DEL_END). Returns the first case found (there may be
339 more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
340 there's no DEL_START found. """
341 try:
342 pos = chunks.index(DEL_START)
343 except ValueError:
344 raise NoDeletes
345 pos2 = chunks.index(DEL_END)
346 return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
347
def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
    """ pre_delete and post_delete implicitly point to a place in the
350 document (where the two were split). This moves that point (by
351 popping items from one and pushing them onto the other). It moves
352 the point to try to find a place where unbalanced_start applies.
353
354 As an example::
355
356 >>> unbalanced_start = ['<div>']
357 >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
358 >>> pre, post = doc[:3], doc[3:]
359 >>> pre, post
360 (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
361 >>> locate_unbalanced_start(unbalanced_start, pre, post)
362 >>> pre, post
363 (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
364
365 As you can see, we moved the point so that the dangling <div> that
366 we found will be effectively replaced by the div in the original
367 document. If this doesn't work out, we just throw away
368 unbalanced_start without doing anything.
369 """
370 while 1:
371 if not unbalanced_start:
372
373 break
374 finding = unbalanced_start[0]
375 finding_name = finding.split()[0].strip('<>')
376 if not post_delete:
377 break
378 next = post_delete[0]
379 if next is DEL_START or not next.startswith('<'):
380
381 break
382 if next[1] == '/':
383
384 break
385 name = next.split()[0].strip('<>')
386 if name == 'ins':
387
388 break
389 assert name != 'del', (
390 "Unexpected delete tag: %r" % next)
391 if name == finding_name:
392 unbalanced_start.pop(0)
393 pre_delete.append(post_delete.pop(0))
394 else:
395
396 break
397
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
400 possibly moving the point earlier in the document. """
401 while 1:
402 if not unbalanced_end:
403
404 break
405 finding = unbalanced_end[-1]
406 finding_name = finding.split()[0].strip('<>/')
407 if not pre_delete:
408 break
409 next = pre_delete[-1]
410 if next is DEL_END or not next.startswith('</'):
411
412 break
413 name = next.split()[0].strip('<>/')
414 if name == 'ins' or name == 'del':
415
416 break
417 if name == finding_name:
418 unbalanced_end.pop()
419 post_delete.insert(0, pre_delete.pop())
420 else:
421
422 break
423
class token(unicode):
    """ Represents a diffable token, generally a word that is displayed to
426 the user. Opening tags are attached to this token when they are
427 adjacent (pre_tags) and closing tags that follow the word
428 (post_tags). Some exceptions occur when there are empty tags
429 adjacent to a word, so there may be close tags in pre_tags, or
430 open tags in post_tags.
431
    We also keep track of whether the word was originally followed by
    whitespace, though that trailing whitespace is ignored when the word
    is compared to an otherwise-identical word that lacks it."""
436
437
438
439 hide_when_equal = False
440
    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=False):
442 obj = unicode.__new__(cls, text)
443
444 if pre_tags is not None:
445 obj.pre_tags = pre_tags
446 else:
447 obj.pre_tags = []
448
449 if post_tags is not None:
450 obj.post_tags = post_tags
451 else:
452 obj.post_tags = []
453
454 obj.trailing_whitespace = trailing_whitespace
455
456 return obj
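
    # Note (illustrative): token subclasses unicode, so a token compares equal
    # to its plain text; the tag context and trailing whitespace ride along as
    # attributes.  A small sketch:
    #
    #   >>> t = token(u'Hello', pre_tags=['<b>'], trailing_whitespace=True)
    #   >>> t == u'Hello', t.pre_tags, t.trailing_whitespace
    #   (True, ['<b>'], True)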
457
    def __repr__(self):
        return 'token(%s, %r, %r)' % (unicode.__repr__(self), self.pre_tags, self.post_tags)
460
    def html(self):
        return unicode(self)


class tag_token(token):
466 """ Represents a token that is actually a tag. Currently this is just
467 the <img> tag, which takes up visible space just like a word but
468 is only represented in a document by a tag. """
469
    def __new__(cls, tag, data, html_repr, pre_tags=None,
                post_tags=None, trailing_whitespace=False):
        obj = token.__new__(cls, "%s: %s" % (tag, data),
473 pre_tags=pre_tags,
474 post_tags=post_tags,
475 trailing_whitespace=trailing_whitespace)
476 obj.tag = tag
477 obj.data = data
478 obj.html_repr = html_repr
479 return obj
480
    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%s)' % (
483 self.tag,
484 self.data,
485 self.html_repr,
486 self.pre_tags,
487 self.post_tags,
488 self.trailing_whitespace)
    def html(self):
        return self.html_repr
491
class href_token(token):
494 """ Represents the href in an anchor tag. Unlike other words, we only
495 show the href when it changes. """
496
497 hide_when_equal = True
498
    def html(self):
        return 'Link: %s' % self
501
def tokenize(html, include_hrefs=True):
    """
504 Parse the given HTML and returns token objects (words with attached tags).
505
506 This parses only the content of a page; anything in the head is
507 ignored, and the <head> and <body> elements are themselves
508 optional. The content is then parsed by lxml, which ensures the
509 validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particularly bad).
511
512 <ins> and <del> tags are also eliminated from the document, as
513 that gets confusing.
514
515 If include_hrefs is true, then the href attribute of <a> tags is
516 included as a special kind of diffable token."""
517 body_el = parse_html(html, cleanup=True)
518
519 chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
520
521 return fixup_chunks(chunks)
522
def parse_html(html, cleanup=True):
    """
525 Parses an HTML fragment, returning an lxml element. Note that the HTML will be
526 wrapped in a <div> tag that was not in the original document.
527
528 If cleanup is true, make sure there's no <head> or <body>, and get
529 rid of any <ins> and <del> tags.
530 """
531 if cleanup:
532
533 html = cleanup_html(html)
534 return fragment_fromstring(html, create_parent=True)
535
536 _body_re = re.compile(r'<body.*?>', re.I|re.S)
537 _end_body_re = re.compile(r'</body.*?>', re.I|re.S)
538 _ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
539
def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of <body> are used, if there is a <body>), and any
    <ins> and <del> tags are removed. """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
553
554 end_whitespace_re = re.compile(r'[ \t\n\r]$')
555
def fixup_chunks(chunks):
    """
558 This function takes a list of chunks and produces a list of tokens.
559 """
560 tag_accum = []
561 cur_word = None
562 result = []
563 for chunk in chunks:
564 if isinstance(chunk, tuple):
565 if chunk[0] == 'img':
566 src = chunk[1]
567 tag = chunk[2]
568 if tag.endswith(' '):
569 tag = tag[:-1]
570 trailing_whitespace = True
571 else:
572 trailing_whitespace = False
573 cur_word = tag_token('img', src, html_repr=tag,
574 pre_tags=tag_accum,
575 trailing_whitespace=trailing_whitespace)
576 tag_accum = []
577 result.append(cur_word)
578 elif chunk[0] == 'href':
579 href = chunk[1]
580 cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=True)
581 tag_accum = []
582 result.append(cur_word)
583 continue
584 if is_word(chunk):
585 if chunk.endswith(' '):
586 chunk = chunk[:-1]
587 trailing_whitespace = True
588 else:
589 trailing_whitespace = False
590 cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
591 tag_accum = []
592 result.append(cur_word)
593 elif is_start_tag(chunk):
594 tag_accum.append(chunk)
595 elif is_end_tag(chunk):
596 if tag_accum:
597 tag_accum.append(chunk)
598 else:
599 assert cur_word, (
600 "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
601 % (cur_word, result, chunk, chunks))
602 cur_word.post_tags.append(chunk)
603 else:
604 assert(0)
605
606 if not result:
607 return [token('', pre_tags=tag_accum)]
608 else:
609 result[-1].post_tags.extend(tag_accum)
610
611 return result
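
# Rough example: a start tag is attached to the following word as a pre_tag,
# and a trailing end tag becomes a post_tag of the last word.
#
#   >>> fixup_chunks(['<b>', 'Hello ', '</b>'])
#   [token(u'Hello', ['<b>'], ['</b>'])]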
612
613
614
615 empty_tags = (
616 'param', 'img', 'area', 'br', 'basefont', 'input',
617 'base', 'meta', 'link', 'col')
618
619 block_level_tags = (
620 'address',
621 'blockquote',
622 'center',
623 'dir',
624 'div',
625 'dl',
626 'fieldset',
627 'form',
628 'h1',
629 'h2',
630 'h3',
631 'h4',
632 'h5',
633 'h6',
634 'hr',
635 'isindex',
636 'menu',
637 'noframes',
638 'noscript',
639 'ol',
640 'p',
641 'pre',
642 'table',
643 'ul',
644 )
645
646 block_level_container_tags = (
647 'dd',
648 'dt',
649 'frameset',
650 'li',
651 'tbody',
652 'td',
653 'tfoot',
654 'th',
655 'thead',
656 'tr',
657 )
658
659
def flatten_el(el, include_hrefs, skip_tag=False):
661 """ Takes an lxml element el, and generates all the text chunks for
662 that tag. Each start tag is a chunk, each word is a chunk, and each
663 end tag is a chunk.
664
665 If skip_tag is true, then the outermost container tag is
666 not returned (just its contents)."""
667 if not skip_tag:
668 if el.tag == 'img':
669 yield ('img', el.attrib['src'], start_tag(el))
670 else:
671 yield start_tag(el)
672 if el.tag in empty_tags and not el.text and not len(el):
673 return
674 start_words = split_words(el.text)
675 for word in start_words:
676 yield cgi.escape(word)
677 for child in el:
678 for item in flatten_el(child, include_hrefs=include_hrefs):
679 yield item
680 if el.tag == 'a' and el.attrib.get('href') and include_hrefs:
681 yield ('href', el.attrib['href'])
682 if not skip_tag:
683 yield end_tag(el)
684 end_words = split_words(el.tail)
685 for word in end_words:
686 yield cgi.escape(word)
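
# Sketch of the chunk stream for a small fragment (assumes parse_html above):
#
#   >>> list(flatten_el(parse_html('<b>one</b> two'), include_hrefs=False,
#   ...                 skip_tag=True))
#   ['<b>', 'one', '</b> ', 'two']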
687
def split_words(text):
    """ Splits some text into words. Includes trailing whitespace (one
690 space) on each word when appropriate. """
691 if not text or not text.strip():
692 return []
693 words = [w + ' ' for w in text.strip().split()]
694 if not end_whitespace_re.search(text):
695 words[-1] = words[-1][:-1]
696 return words
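
# Examples: a single trailing space is kept on the last word only when the
# source text itself ended with whitespace.
#
#   >>> split_words('Hello  world')
#   ['Hello ', 'world']
#   >>> split_words('Hello world ')
#   ['Hello ', 'world ']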
697
698 start_whitespace_re = re.compile(r'^[ \t\n\r]')
699
def start_tag(el):
    """
702 The text representation of the start tag for a tag.
703 """
704 return '<%s%s>' % (
705 el.tag, ''.join(' %s="%s"' % (name, cgi.escape(value, True))
706 for name, value in el.attrib.items()))
707
def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
710 trailing whitespace when appropriate. """
711 if el.tail and start_whitespace_re.search(el.tail):
712 extra = ' '
713 else:
714 extra = ''
715 return '</%s>%s' % (el.tag, extra)
716
def is_word(tok):
    return not tok.startswith('<')
719
def is_end_tag(tok):
    return tok.startswith('</')
722
def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('</')
725
734
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the element's tail.
738
739 If skip_outer is true, then don't serialize the outermost tag
740 """
741
742 html_xsl = """\
743 <xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
744 <xsl:output method="html" encoding="UTF-8" />
745 <xsl:template match="/">
746 <xsl:copy-of select="."/>
747 </xsl:template>
748 </xsl:transform>
749 """
750 transform = etree.XSLT(etree.XML(html_xsl))
751 assert not isinstance(el, basestring), (
752 "You should pass in an element, not a string like %r" % el)
753 html = str(transform(el))
754 if skip_outer:
755
756 html = html[html.find('>')+1:]
757 if skip_outer:
758
759 html = html[:html.rfind('<')]
760 if skip_outer:
761 return html.strip()
762 else:
763 return html.lstrip()
764
774
775
785
818
def _merge_element_contents(el):
    """
821 Removes an element, but merges its contents into its place, e.g.,
822 given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
823 <p>Hi there!</p>
824 """
825 parent = el.getparent()
826 text = el.text or ''
827 if el.tail:
828 if not len(el):
829 text += el.tail
830 else:
831 if el[-1].tail:
832 el[-1].tail += el.tail
833 else:
834 el[-1].tail = el.tail
835 index = parent.index(el)
836 if text:
837 if index == 0:
838 previous = None
839 else:
840 previous = parent[index-1]
841 if previous is None:
842 if parent.text:
843 parent.text += text
844 else:
845 parent.text = text
846 else:
847 if previous.tail:
848 previous.tail += text
849 else:
850 previous.tail = text
851 parent[index:index+1] = el.getchildren()
852
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
855 Acts like SequenceMatcher, but tries not to find very small equal
856 blocks amidst large spans of changes
857 """
858
859 threshold = 2
860
    def get_matching_blocks(self):
        # scale the threshold by the length of the shorter sequence
        size = min(len(self.a), len(self.b))
863 threshold = min(self.threshold, size / 4)
864 actual = difflib.SequenceMatcher.get_matching_blocks(self)
865 return [item for item in actual
866 if item[2] > threshold
867 or not item[2]]
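
    # Rough illustration: a lone one-word match in the middle of an otherwise
    # changed region falls under the threshold and is dropped, so the region
    # is reported as a single replacement rather than many tiny edits.
    #
    #   >>> a = 'one two three four five six seven eight'.split()
    #   >>> b = 'uno dos three cuatro cinco seis siete ocho'.split()
    #   >>> s = InsensitiveSequenceMatcher(a=a, b=b)
    #   >>> [m for m in s.get_matching_blocks() if m[2]]
    #   []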
868
869 if __name__ == '__main__':
870 from lxml.html import _diffcommand
871 _diffcommand.main()
872