Package lxml :: Package html :: Module clean
[hide private]
[frames] | [no frames]

Source Code for Module lxml.html.clean

import copy
import re

from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring
  5   
# Python 2.3 compatibility: ``set`` became a builtin in 2.4; fall back
# to the (deprecated) ``sets`` module on older interpreters.
try:
    set
except NameError:
    from sets import Set as set

# Public API of this module.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']

# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
#   Particularly the CSS cleaning; most of the tag cleaning is integrated now
# I have multiple kinds of schemes searched; but should schemes be
#   whitelisted instead?
# max height?
# remove images?  Also in CSS?  background attribute?
# Some way to whitelist object, iframe, etc (e.g., if you want to
#   allow *just* embedded YouTube movies)
# Log what was deleted and why?
# style="behavior: ..." might be bad in IE?
# Should we have something for just <meta http-equiv>?  That's the worst of the
#   metas.
# UTF-7 detections?  Example:
#     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
#   you don't always have to have the charset set, if the page has no charset
#   and there's UTF7-like code in it.


# This is an IE-specific construct you can have in a stylesheet to
# run some Javascript:
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Do I have to worry about @\nimport?
_css_import_re = re.compile(
    r'@\s*import', re.I)

# All kinds of schemes besides just javascript: that can cause
# execution:
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)

# Collapses/strips any run of whitespace (used to normalize styles before
# scanning them for hostile tokens).
_whitespace_re = re.compile(r'\s+')
# FIXME: should data: be blocked?

# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
# Matches the opening of an IE conditional comment, e.g. "[if IE 6]>".
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# XPath: every element (including the context node) carrying a style attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# XPath: every <a> whose href is non-empty and is not a pure fragment link.
_find_external_links = etree.XPath(
    "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
 57   
def clean_html(html, **kw):
    """
    Text-in/text-out counterpart of clean(): parse ``html``, clean the
    resulting document in place, and return it re-serialized as text.

    Keyword arguments are passed through to the cleaner.
    """
    tree = fromstring(html)
    clean(tree, **kw)
    return tostring(tree)
66
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marque>``

    ``remove_tags``:
        A list of tags to remove.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    This modifies the document *in place*.
    """

    # Class-level defaults for every option documented above; each may be
    # overridden per subclass or per instance (see __init__).
    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    # remove_tags/allow_tags default to None, i.e. no extra removals and
    # no whitelist.
    remove_tags = None
    allow_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
147 - def __init__(self, **kw):
148 for name, value in kw.items(): 149 if not hasattr(self, name): 150 raise TypeError( 151 "Unknown parameter: %s=%r" % (name, value)) 152 setattr(self, name, value)
153
154 - def __call__(self, doc):
155 """ 156 Cleans the document. 157 """ 158 if hasattr(doc, 'getroot'): 159 # ElementTree 160 doc = doc.getroot() 161 # Normalize a case that IE treats <image> like <img>, and that 162 # can confuse either this step or later steps. 163 for el in doc.getiterator('image'): 164 el.tag = 'img' 165 if not self.comments: 166 # Of course, if we were going to kill comments anyway, we don't 167 # need to worry about this 168 self.kill_conditional_comments(doc) 169 kill_tags = set() 170 remove_tags = set(self.remove_tags or ()) 171 if self.allow_tags: 172 allow_tags = set(self.allow_tags) 173 else: 174 allow_tags = set() 175 if self.scripts: 176 kill_tags.add('script') 177 if self.safe_attrs_only: 178 safe_attrs = set(defs.safe_attrs) 179 for el in doc.getiterator(): 180 attrib = el.attrib 181 for aname in attrib.keys(): 182 if aname not in safe_attrs: 183 del attrib[aname] 184 if self.javascript: 185 if not self.safe_attrs_only: 186 # safe_attrs handles events attributes itself 187 for el in doc.getiterator(): 188 attrib = el.attrib 189 for aname in attrib.keys(): 190 if aname.startswith('on'): 191 del attrib[aname] 192 doc.rewrite_links(self._remove_javascript_link, 193 resolve_base_href=False) 194 if not self.style: 195 # If we're deleting style then we don't have to remove JS links 196 # from styles, otherwise... 197 for el in _find_styled_elements(doc): 198 old = el.get('style') 199 new = _css_javascript_re.sub('', old) 200 new = _css_import_re.sub('', old) 201 if self._has_sneaky_javascript(new): 202 # Something tricky is going on... 
203 del el.attrib['style'] 204 elif new != old: 205 el.set('style', new) 206 for el in list(doc.getiterator('style')): 207 if el.get('type', '').lower().strip() == 'text/javascript': 208 el.drop_tree() 209 continue 210 old = el.text or '' 211 new = _css_javascript_re.sub('', old) 212 # The imported CSS can do anything; we just can't allow: 213 new = _css_import_re.sub('', old) 214 if self._has_sneaky_javascript(new): 215 # Something tricky is going on... 216 el.text = '/* deleted */' 217 elif new != old: 218 el.text = new 219 if self.comments or self.processing_instructions: 220 # FIXME: why either? I feel like there's some obscure reason 221 # because you can put PIs in comments...? But I've already 222 # forgotten it 223 kill_tags.add(etree.Comment) 224 if self.processing_instructions: 225 kill_tags.add(etree.ProcessingInstruction) 226 if self.style: 227 kill_tags.add('style') 228 for el in _find_styled_elements(doc): 229 del el.attrib['style'] 230 if self.links: 231 kill_tags.add('link') 232 elif self.style or self.javascript: 233 # We must get rid of included stylesheets if Javascript is not 234 # allowed, as you can put Javascript in them 235 for el in list(doc.getiterator('link')): 236 if 'stylesheet' in el.get('rel', '').lower(): 237 # Note this kills alternate stylesheets as well 238 el.drop_tree() 239 if self.meta: 240 kill_tags.add('meta') 241 if self.page_structure: 242 remove_tags.update(('head', 'html', 'title')) 243 if self.embedded: 244 # FIXME: is <layer> really embedded? 
245 kill_tags.update(('applet', 'param')) 246 # The alternate contents that are in an iframe are a good fallback: 247 # FIXME: somehow embed seems to be getting data, but from what I 248 # can tell the embed tag is supposed to always be empty 249 remove_tags.update(('iframe', 'object', 'embed', 'layer')) 250 if self.frames: 251 kill_tags.update(defs.frame_tags) 252 if self.forms: 253 remove_tags.add('form') 254 kill_tags.update(('button', 'input', 'select', 'textarea')) 255 if self.annoying_tags: 256 remove_tags.update(('blink', 'marque')) 257 258 _remove = [] 259 _kill = [] 260 for el in doc.getiterator(): 261 if el.tag in kill_tags: 262 _kill.append(el) 263 elif el.tag in remove_tags: 264 _remove.append(el) 265 266 if _remove and _remove[0] == doc: 267 # We have to drop the parent-most tag, which we can't 268 # do. Instead we'll rewrite it: 269 el = _remove.pop(0) 270 el.tag = 'div' 271 el.attrib.clear() 272 elif _kill and _kill[0] == doc: 273 # We have to drop the parent-most element, which we can't 274 # do. Instead we'll clear it: 275 el = _kill.pop(0) 276 if el.tag != 'html': 277 el.tag = 'div' 278 el.clear() 279 280 for el in _kill: 281 el.drop_tree() 282 for el in _remove: 283 el.drop_tag() 284 285 allow_tags = self.allow_tags 286 if self.remove_unknown_tags: 287 if allow_tags: 288 raise ValueError( 289 "It does not make sense to pass in both allow_tags and remove_unknown_tags") 290 allow_tags = set(defs.tags) 291 if allow_tags: 292 bad = [] 293 for el in doc.getiterator(): 294 if el.tag not in allow_tags: 295 bad.append(el) 296 for el in bad: 297 el.drop_tag() 298 if self.add_nofollow: 299 for el in _find_external_links(doc): 300 el.set('rel', 'nofollow')
301
302 - def kill_conditional_comments(self, doc):
303 """ 304 IE conditional comments basically embed HTML that the parser 305 doesn't normally see. We can't allow anything like that, so 306 we'll kill any comments that could be conditional. 307 """ 308 bad = [] 309 self._kill_elements( 310 doc, lambda el: _conditional_comment_re.search(el.text), 311 etree.Comment)
312
313 - def _kill_elements(self, doc, condition, iterate=None):
314 bad = [] 315 for el in doc.getiterator(iterate): 316 if condition(el): 317 bad.append(el) 318 for el in bad: 319 el.drop_tree()
    # Strips CSS comments (/* ... */) so they can't be used to split a
    # forbidden token; used by _has_sneaky_javascript below.
    _decomment_re = re.compile(r'/\*.*?\*/', re.S)
331 - def _has_sneaky_javascript(self, style):
332 """ 333 Depending on the browser, stuff like ``e x p r e s s i o n(...)`` 334 can get interpreted, or ``expre/* stuff */ssion(...)``. This 335 checks for attempt to do stuff like this. 336 337 Typically the response will be to kill the entire style; if you 338 have just a bit of Javascript in the style another rule will catch 339 that and remove only the Javascript from the style; this catches 340 more sneaky attempts. 341 """ 342 style = self._decomment_re.sub('', style) 343 style = style.replace('\\', '') 344 style = _whitespace_re.sub('', style) 345 style = style.lower() 346 if 'javascript:' in style: 347 return True 348 if 'expression(' in style: 349 return True 350 return False
351
352 - def clean_html(self, html):
353 if isinstance(html, basestring): 354 return_string = True 355 doc = fromstring(html) 356 else: 357 return_string = False 358 doc = copy.deepcopy(doc) 359 self(doc) 360 if return_string: 361 return tostring(doc) 362 else: 363 return doc
# A module-level Cleaner with all-default options, and the text-in/text-out
# convenience function bound to it.
clean = Cleaner()
clean_html = clean.clean_html

############################################################
## Autolinking
############################################################

# Patterns recognizing link-like text; each defines a ``body`` group (the
# full link text) and a ``host`` group (checked against _avoid_hosts).
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    # This is conservative, but autolinking can be a bit conservative:
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
]

# Elements whose text is never autolinked (already links, code, etc.).
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Hosts that are never autolinked (local and example addresses).
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
]

# class="nolink" disables autolinking for an element.
_avoid_classes = ['nolink']

autolink_html.__doc__ = autolink.__doc__

############################################################
## Word wrapping
############################################################

# Elements whose text is never word-broken.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
# class="nobreak" disables word breaking for an element.
_avoid_word_break_classes = ['nobreak']
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=u'\u200b'):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in ``avoid_elements``, by default
    ``<textarea>`` and ``<pre>``.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Character suggestion of &#8203 comes from:
    # http://www.cs.tut.fi/~jkorpela/html/nobr.html

    # BUGFIX: honor the ``avoid_elements`` argument; previously the
    # module-level default list was consulted regardless of what the
    # caller passed in.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        dont_break = False
        class_name = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_name:
                dont_break = True
                break
        if dont_break:
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    # Recurse into children, and break their tail text too.
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
553
def word_break_html(html, *args, **kw):
    """
    Text-in/text-out wrapper around word_break(): parse ``html``, break
    long words throughout the document, and return the re-serialized
    markup.
    """
    tree = fromstring(html)
    word_break(tree, *args, **kw)
    return tostring(tree)
558
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with every whitespace-delimited word longer than
    ``max_width`` rewritten by _insert_break().
    """
    for token in text.split():
        if len(token) > max_width:
            text = text.replace(
                token, _insert_break(token, max_width, break_character))
    return text
566 567 _break_prefer_re = re.compile(r'[^a-z]', re.I) 568
569 -def _insert_break(word, width, break_character):
570 orig_word = word 571 result = '' 572 while len(word) > width: 573 start = word[:width] 574 breaks = list(_break_prefer_re.finditer(start)) 575 if breaks: 576 last_break = breaks[-1] 577 # Only walk back up to 10 characters to find a nice break: 578 if last_break.end() > width-10: 579 # FIXME: should the break character be at the end of the 580 # chunk, or the beginning of the next chunk? 581 start = word[:last_break.end()] 582 result += start + break_character 583 word = word[len(start):] 584 result += word 585 return result
586