1 import re
2 from lxml import etree
3 from lxml.html import defs
4 from lxml.html import fromstring, tostring
5
6 try:
7 set
8 except NameError:
9 from sets import Set as set
10
11 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
12 'word_break', 'word_break_html']
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Matches a CSS ``expression(...)`` call, from the word ``expression`` up to
# the first closing parenthesis (non-greedy), ignoring case and spanning
# newlines.
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches the start of a CSS ``@import`` rule; whitespace is tolerated
# between ``@`` and ``import``.
_css_import_re = re.compile(
    r'@\s*import', re.I)

# Matches one of the listed script-like URL schemes (javascript, jscript,
# livescript, vbscript, about, mocha) followed by a colon, with optional
# leading whitespace.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
# Matches any run of whitespace.
_whitespace_re = re.compile(r'\s+')

# Matches an ``[if ...]>`` marker as found in conditional comments.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# XPath: the context element and all descendants that have a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# XPath: ``<a>`` elements whose whitespace-normalized ``href`` is non-empty
# and does not start with ``#`` (i.e. not a same-page fragment link).
_find_external_links = etree.XPath(
    "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
57
59 """
60 Like clean(), but takes a text input document, and returns a text
61 document.
62 """
63 doc = fromstring(html)
64 clean(doc, **kw)
65 return tostring(doc)
66
68 """
69 Instances cleans the document of each of the possible offending
70 elements. The cleaning is controlled by attributes; you can
71 override attributes in a subclass, or set them in the constructor.
72
73 ``scripts``:
74 Removes any ``<script>`` tags.
75
76 ``javascript``:
77 Removes any Javascript, like an ``onclick`` attribute.
78
79 ``comments``:
80 Removes any comments.
81
82 ``style``:
83 Removes any style tags or attributes.
84
85 ``links``:
86 Removes any ``<link>`` tags
87
88 ``meta``:
89 Removes any ``<meta>`` tags
90
91 ``page_structure``:
92 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
93
94 ``processing_instructions``:
95 Removes any processing instructions.
96
97 ``embedded``:
98 Removes any embedded objects (flash, iframes)
99
100 ``frames``:
101 Removes any frame-related tags
102
103 ``forms``:
104 Removes any form tags
105
106 ``annoying_tags``:
Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``
108
109 ``remove_tags``:
110 A list of tags to remove.
111
112 ``allow_tags``:
113 A list of tags to include (default include all).
114
115 ``remove_unknown_tags``:
116 Remove any tags that aren't standard parts of HTML.
117
118 ``safe_attrs_only``:
119 If true, only include 'safe' attributes (specifically the list
120 from `feedparser
121 <http://feedparser.org/docs/html-sanitization.html>`_).
122
123 ``add_nofollow``:
124 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
125
126 This modifies the document *in place*.
127 """
128
129 scripts = True
130 javascript = True
131 comments = True
132 style = False
133 links = True
134 meta = True
135 page_structure = True
136 processing_instructions = True
137 embedded = True
138 frames = True
139 forms = True
140 annoying_tags = True
141 remove_tags = None
142 allow_tags = None
143 remove_unknown_tags = True
144 safe_attrs_only = True
145 add_nofollow = False
146
153
def __call__(self, doc):
    """
    Cleans the document in place.

    ``doc`` is an element, or an object with a ``getroot()`` method, in
    which case its root element is cleaned.  What is cleaned is
    controlled by the attributes set on this instance.

    Two kinds of removal are used below: elements collected in
    ``kill_tags`` are dropped together with their content
    (``drop_tree``), elements collected in ``remove_tags`` lose only
    their tag and keep their content (``drop_tag``).

    Raises ``ValueError`` if both ``allow_tags`` and
    ``remove_unknown_tags`` are set.  The check is done before the
    document is touched, so a misconfigured cleaner no longer leaves
    the document half cleaned.
    """
    if self.remove_unknown_tags and self.allow_tags:
        raise ValueError(
            "It does not make sense to pass in both allow_tags and remove_unknown_tags")
    if hasattr(doc, 'getroot'):
        # A tree-like object rather than an element: work on its root.
        doc = doc.getroot()
    # Rename <image> to <img> so the later steps see a single tag name
    # (presumably because browsers treat <image> as <img> -- confirm).
    for el in doc.getiterator('image'):
        el.tag = 'img'
    if not self.comments:
        # Comments are being kept, so conditional comments have to be
        # dealt with separately.
        self.kill_conditional_comments(doc)
    kill_tags = set()
    remove_tags = set(self.remove_tags or ())
    if self.scripts:
        kill_tags.add('script')
    if self.safe_attrs_only:
        safe_attrs = set(defs.safe_attrs)
        for el in doc.getiterator():
            attrib = el.attrib
            # Snapshot the names: the mapping is modified in the loop.
            for aname in list(attrib.keys()):
                if aname not in safe_attrs:
                    del attrib[aname]
    if self.javascript:
        if not self.safe_attrs_only:
            # With safe_attrs_only the loop above already stripped
            # every attribute outside the safe list, so the event
            # handler attributes only need handling here.
            for el in doc.getiterator():
                attrib = el.attrib
                for aname in list(attrib.keys()):
                    if aname.startswith('on'):
                        del attrib[aname]
        doc.rewrite_links(self._remove_javascript_link,
                          resolve_base_href=False)
        if not self.style:
            # Styles are being kept, so they have to be scrubbed
            # instead: first the style attributes...
            for el in _find_styled_elements(doc):
                old = el.get('style')
                new = _css_javascript_re.sub('', old)
                # Chain on ``new``: substituting on ``old`` again would
                # throw away the expression() removal above.
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    # Still suspicious after scrubbing: drop it all.
                    del el.attrib['style']
                elif new != old:
                    el.set('style', new)
            # ...then the <style> elements.
            for el in list(doc.getiterator('style')):
                if el.get('type', '').lower().strip() == 'text/javascript':
                    el.drop_tree()
                    continue
                old = el.text or ''
                new = _css_javascript_re.sub('', old)
                new = _css_import_re.sub('', new)
                if self._has_sneaky_javascript(new):
                    el.text = '/* deleted */'
                elif new != old:
                    el.text = new
    if self.comments or self.processing_instructions:
        # NOTE(review): comments are also killed when only
        # processing_instructions is enabled; the reason is not
        # evident from this code -- confirm before changing.
        kill_tags.add(etree.Comment)
    if self.processing_instructions:
        kill_tags.add(etree.ProcessingInstruction)
    if self.style:
        kill_tags.add('style')
        for el in _find_styled_elements(doc):
            del el.attrib['style']
    if self.links:
        kill_tags.add('link')
    elif self.style or self.javascript:
        # Links are kept in general, but stylesheet links are still
        # dropped when styles or Javascript are being cleaned.
        for el in list(doc.getiterator('link')):
            if 'stylesheet' in el.get('rel', '').lower():
                # Substring test: also matches "alternate stylesheet".
                el.drop_tree()
    if self.meta:
        kill_tags.add('meta')
    if self.page_structure:
        remove_tags.update(('head', 'html', 'title'))
    if self.embedded:
        kill_tags.update(('applet', 'param'))
        # For these only the tag goes; the nested (fallback) content
        # is kept.
        remove_tags.update(('iframe', 'object', 'embed', 'layer'))
    if self.frames:
        kill_tags.update(defs.frame_tags)
    if self.forms:
        remove_tags.add('form')
        kill_tags.update(('button', 'input', 'select', 'textarea'))
    if self.annoying_tags:
        # 'marquee' is the real tag name; the previous 'marque' never
        # matched anything.
        remove_tags.update(('blink', 'marquee'))

    _remove = []
    _kill = []
    for el in doc.getiterator():
        if el.tag in kill_tags:
            _kill.append(el)
        elif el.tag in remove_tags:
            _remove.append(el)

    # Iteration starts at ``doc`` itself, so if the root matched it is
    # the first entry of its list.  The root cannot be dropped from a
    # parent, so it is rewritten instead.
    if _remove and _remove[0] == doc:
        el = _remove.pop(0)
        el.tag = 'div'
        el.attrib.clear()
    elif _kill and _kill[0] == doc:
        el = _kill.pop(0)
        if el.tag != 'html':
            el.tag = 'div'
        el.clear()

    for el in _kill:
        el.drop_tree()
    for el in _remove:
        el.drop_tag()

    if self.remove_unknown_tags:
        allow_tags = set(defs.tags)
    else:
        allow_tags = set(self.allow_tags or ())
    if allow_tags:
        # Collect first, then drop, so the tree is not modified while
        # it is being iterated.
        bad = []
        for el in doc.getiterator():
            if el.tag not in allow_tags:
                bad.append(el)
        for el in bad:
            el.drop_tag()
    if self.add_nofollow:
        for el in _find_external_links(doc):
            rel = el.get('rel')
            if rel:
                if 'nofollow' in rel.lower().split():
                    continue
                # Keep whatever relations the link already declares.
                rel = '%s nofollow' % rel
            else:
                rel = 'nofollow'
            el.set('rel', rel)
301
312
314 bad = []
315 for el in doc.getiterator(iterate):
316 if condition(el):
317 bad.append(el)
318 for el in bad:
319 el.drop_tree()
320
328
329 _decomment_re = re.compile(r'/\*.*?\*/', re.S)
330
def _has_sneaky_javascript(self, style):
    """
    Return True if ``style`` appears to hide Javascript.

    Depending on the browser, stuff like ``e x p r e s s i o n(...)``
    can get interpreted, or ``expre/* stuff */ssion(...)``, or a
    hex-escaped spelling such as ``\\65 xpression(...)``.  This checks
    for attempts to do stuff like this.

    Typically the response will be to kill the entire style; if you
    have just a bit of Javascript in the style another rule will catch
    that and remove only the Javascript from the style; this catches
    more sneaky attempts.

    ``style`` is CSS text (a ``style`` attribute value or the content
    of a ``<style>`` element).  The text is checked in two normalized
    forms -- backslashes simply removed, and CSS hex escapes decoded
    first -- because either reading may be the one a browser applies.
    """
    def _decode_escape(match):
        # Undecodable code points are dropped; only plain letters
        # matter for the keywords checked below.
        try:
            return chr(int(match.group(1), 16))
        except (ValueError, OverflowError):
            return ''

    style = self._decomment_re.sub('', style)
    # A CSS escape is a backslash, 1-6 hex digits and one optional
    # whitespace character that terminates it.
    decoded = re.sub(r'\\([0-9a-fA-F]{1,6})[ \t\r\n\f]?',
                     _decode_escape, style)
    for candidate in (style, decoded):
        candidate = candidate.replace('\\', '')
        # split()/join() removes every run of whitespace.
        candidate = ''.join(candidate.split()).lower()
        if 'javascript:' in candidate:
            return True
        if 'expression(' in candidate:
            return True
    return False
351
def clean_html(self, html):
    """
    Clean ``html`` and return the result, leaving the input untouched.

    If ``html`` is a string it is parsed with ``fromstring``, cleaned,
    and returned serialized with ``tostring``.  Anything else is
    treated as a document or element: it is deep-copied, the copy is
    cleaned by calling this cleaner on it, and the copy is returned.
    """
    # ``copy`` is not among the imports at the top of this file.
    import copy
    try:
        string_types = basestring
    except NameError:
        # Interpreters without ``basestring``.
        string_types = (str, bytes)
    if isinstance(html, string_types):
        doc = fromstring(html)
        self(doc)
        return tostring(doc)
    # Copy the argument itself; the previous code copied ``doc``, which
    # is not defined on this path.
    doc = copy.deepcopy(html)
    self(doc)
    return doc
364
# Module-level cleaner built with the class defaults; ``clean(doc)`` cleans
# ``doc`` in place.
clean = Cleaner()
# Bound method of the default cleaner; accepts a string or a document and
# returns the cleaned result.
clean_html = clean.clean_html
367
368
369
370
371
# Patterns used to find linkable text.  Each one defines a ``host`` group
# (tested against the avoid-hosts patterns) and a ``body`` group (used as the
# text of the generated link); the whole match becomes the ``href``.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    # mailto: links; the ``body`` group leaves out the ``mailto:`` prefix.
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

# Tags whose content is never auto-linked.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Hosts that are never linked: anything starting with ``localhost``,
# example.com/.org/.net (including subdomains), and 127.0.0.1.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# Elements carrying one of these classes are never auto-linked.
_avoid_classes = ['nolink']
387
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    Text is searched with the regular expressions in ``link_regexes``
    (by default mailto and http(s) links), and every hit is wrapped in
    an ``<a>`` element.

    Nothing is linked inside an element whose tag is in
    ``avoid_elements`` or that carries a class from ``avoid_classes``,
    and no link is made to a host matching one of the regular
    expressions in ``avoid_hosts``.

    Only the contents of ``el`` are processed; the tail of ``el``
    itself is left alone.
    """
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        present = class_attr.split()
        for blocked in avoid_classes:
            if blocked in present:
                return
    # Work on a snapshot of the children: anchors get inserted into
    # ``el`` while looping.
    for node in list(el):
        autolink(node, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if not node.tail:
            continue
        remaining, anchors = _link_text(
            node.tail, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            node.tail = remaining
            # The anchors replace the linked part of the tail, so they
            # go right after the child.
            after = el.index(node) + 1
            el[after:after] = anchors
    if el.text:
        remaining, anchors = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if anchors:
            el.text = remaining
            el[:0] = anchors
432
def _link_text(text, link_regexes, avoid_hosts, factory):
    """
    Split ``text`` into leading plain text and a list of link elements.

    ``link_regexes`` are compiled patterns with ``host`` and ``body``
    groups.  ``avoid_hosts`` are compiled patterns; a match whose host
    is found by any of them is skipped.  ``factory`` is called as
    ``factory('a')`` to create each link element.

    Returns ``(leading_text, links)``.  ``leading_text`` is the text
    before the first link (all of ``text`` if nothing matched).  Each
    element in ``links`` has its ``href`` set to the matched text, its
    ``text`` set to the ``body`` group, and its ``tail`` set to the
    plain text that follows it.  A single trailing ``.`` or ``,`` is
    left out of the link.
    """
    leading_text = ''
    links = []
    while True:
        # Among all regexes, take the match that starts earliest in
        # what is left of the text.
        best_match = None
        for regex in link_regexes:
            search_from = 0
            while True:
                match = regex.search(text, search_from)
                if match is None:
                    break
                host = match.group('host')
                for host_regex in avoid_hosts:
                    if host_regex.search(host):
                        # Avoided host: try this regex's next match.
                        search_from = match.end()
                        break
                else:
                    break
            if match is None:
                continue
            if best_match is None or match.start() < best_match.start():
                best_match = match

        # The plain text in front of the match (or everything that is
        # left, when there is no match) follows the previous link, or
        # is the leading text if there is no link yet.
        if best_match is None:
            plain = text
        else:
            plain = text[:best_match.start()]
        if links:
            assert not links[-1].tail
            links[-1].tail = plain
        else:
            leading_text = plain
        if best_match is None:
            break

        link = best_match.group(0)
        end = best_match.end()
        if link.endswith('.') or link.endswith(','):
            # Sentence punctuation right after a URL is not part of it;
            # it stays in the following plain text.
            end -= 1
            link = link[:-1]
        anchor = factory('a')
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        if body.endswith('.') or body.endswith(','):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        # Continue with the text after this link.
        text = text[end:]
    return leading_text, links
490
def autolink_html(html, *args, **kw):
    """
    Run autolink() on ``html`` and return the result.

    A string is parsed with ``fromstring`` and the linked document is
    returned serialized with ``tostring``.  Anything else is treated
    as a document or element: it is deep-copied, the copy is linked,
    and the copy is returned, so the input is not modified.  Extra
    arguments are passed through to autolink().

    (The module replaces this docstring with autolink()'s right after
    the definition.)
    """
    # ``copy`` is not among the imports at the top of this file.
    import copy
    try:
        string_types = basestring
    except NameError:
        # Interpreters without ``basestring``.
        string_types = (str, bytes)
    if isinstance(html, string_types):
        doc = fromstring(html)
        autolink(doc, *args, **kw)
        return tostring(doc)
    doc = copy.deepcopy(html)
    autolink(doc, *args, **kw)
    return doc
503
504 autolink_html.__doc__ = autolink.__doc__
505
506
507
508
509
# Tags whose text word_break() leaves untouched.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
# Class names that opt an element out of word breaking (presumably the
# default for word_break()'s ``avoid_classes`` argument -- confirm).
_avoid_word_break_classes = ['nobreak']
512
# NOTE(review): the ``def`` line is reconstructed.  The parameter names come
# from the recursive call below; the defaults for ``max_width`` and
# ``break_character`` are assumptions -- confirm against callers.
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=u'\u200b'):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any element whose tag is in ``avoid_elements`` (by
    default ``<pre>``, ``<textarea>`` and ``<code>``) or that carries
    a class listed in ``avoid_classes``.

    Words longer than ``max_width`` are broken by inserting
    ``break_character``, intended to be the Zero Width Space (U+200B,
    ``&#8203;``).  This generally takes up no space in rendering, but
    does copy as a space, and in monospace contexts usually takes up
    space.

    The element is modified in place; the tail of ``el`` itself is
    not processed.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Honour the caller's list; the module-level default used to be
    # consulted here regardless of the argument.
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        present = class_name.split()
        for avoid in avoid_classes:
            if avoid in present:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # A child's tail is text of this element, so it is broken even
        # when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
553
558
def _break_text(text, max_width, break_character):
    """
    Return ``text`` with every word longer than ``max_width`` broken up.

    A word is a run of non-whitespace characters.  Long words are
    passed to ``_insert_break``; all whitespace and all shorter words
    are returned unchanged.
    """
    def _break_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    # Substituting word by word rewrites each token exactly once; a
    # plain ``text.replace(word, ...)`` would also rewrite the word
    # wherever it occurs inside another, longer token.
    return re.compile(r'\S+', re.U).sub(_break_word, text)
566
# Any character that is not an ASCII letter counts as a preferred place to
# break a word.
_break_prefer_re = re.compile(r'[^a-z]', re.I)

def _insert_break(word, width, break_character):
    """
    Return ``word`` with ``break_character`` inserted about every
    ``width`` characters.

    Each chunk is at most ``width`` characters long.  If the chunk
    contains a non-letter, and the last one ends fewer than 10
    characters before the ``width`` limit, the chunk is cut right
    after that character instead.

    A ``width`` below 1 returns ``word`` unchanged, since no progress
    could be made with empty chunks.
    """
    if width < 1:
        return word
    pieces = []
    while len(word) > width:
        chunk = word[:width]
        last_end = 0
        for found in _break_prefer_re.finditer(chunk):
            last_end = found.end()
        # Only step back to a nicer break point if it is close enough
        # to the limit.
        if last_end and last_end > width - 10:
            chunk = word[:last_end]
        pieces.append(chunk)
        pieces.append(break_character)
        word = word[len(chunk):]
    pieces.append(word)
    return ''.join(pieces)
586