import copy
import re

from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring
5
# Compatibility shim: when the interpreter has no builtin ``set``, fall back
# to ``sets.Set`` under the same name.
try:
    set
except NameError:
    from sets import Set as set

# Names exported by ``from <module> import *``.
# NOTE(review): 'word_break_html' is exported here, but no definition of it
# is visible in the reviewed copy of this file -- confirm it exists.
__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
           'word_break', 'word_break_html']
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# Matches a CSS ``expression(...)`` construct up to the first closing
# parenthesis; case-insensitive, and ``.`` also matches newlines.
_css_javascript_re = re.compile(
    r'expression\s*\(.*?\)', re.S|re.I)

# Matches the ``@import`` keyword of a CSS import rule (whitespace allowed
# after the ``@``), case-insensitively.
_css_import_re = re.compile(
    r'@\s*import', re.I)

# Matches optional whitespace followed by one of the listed URL schemes and
# a colon, case-insensitively.  The pattern itself is not anchored.
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
# Matches a run of one or more whitespace characters.
_whitespace_re = re.compile(r'\s+')

# Matches a conditional-comment opener of the form ``[if ...]>``.
_conditional_comment_re = re.compile(
    r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)

# Selects the context element and every descendant carrying a ``style``
# attribute.
_find_styled_elements = etree.XPath(
    "descendant-or-self::*[@style]")

# Selects ``<a>`` elements (context element included) whose ``href`` is
# non-empty after whitespace normalisation and does not start with ``#``.
_find_external_links = etree.XPath(
    "descendant-or-self::a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']")
58
# NOTE(review): the ``def`` line for this function was absent from the
# reviewed copy; the name and signature are reconstructed from the body
# (it reads ``html`` and ``kw``) and from ``__all__`` -- confirm.  The module
# later rebinds ``clean_html`` to ``clean.clean_html``, which supersedes
# this definition once the module has finished importing.
def clean_html(html, **kw):
    """
    Like clean(), but takes a text input document, and returns a text
    document.

    ``html`` is parsed with ``fromstring``, cleaned in place by calling
    ``clean`` with the extra keyword arguments, and serialised again with
    ``tostring``.
    """
    doc = fromstring(html)
    clean(doc, **kw)
    return tostring(doc)
67
# NOTE(review): the ``class`` line and the method ``def`` lines were absent
# from the reviewed copy.  They are reconstructed from visible usage
# (``clean = Cleaner()``, ``self(doc)``, ``self._has_sneaky_javascript(new)``,
# ``clean.clean_html``); the ``object`` base class and the name/default of
# ``_kill_elements`` are presumed -- confirm against the upstream source.
class Cleaner(object):
    """
    Instances clean the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags or attributes.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and
        ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from `feedparser
        <http://feedparser.org/docs/html-sanitization.html>`_).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    This modifies the document *in place*.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False

    # NOTE(review): ``__call__`` relies on ``self.kill_conditional_comments``
    # and ``self._remove_javascript_link``, and the class docstring promises
    # that options can be set in the constructor, but none of those
    # definitions are present in the reviewed copy of this file.  Restore
    # them from the upstream source rather than re-deriving them here: they
    # are part of the sanitiser's security behaviour.

    def __call__(self, doc):
        """
        Cleans the document.

        ``doc`` is an element, or any object with a ``getroot()`` method
        (in which case the root element is cleaned).  The tree is modified
        in place and nothing is returned.

        Raises ``ValueError`` when both ``allow_tags`` and
        ``remove_unknown_tags`` are set.
        """
        # Reject the contradictory configuration before touching the
        # document, so a failed call never leaves it half-cleaned.
        if self.remove_unknown_tags and self.allow_tags:
            raise ValueError(
                "It does not make sense to pass in both allow_tags and remove_unknown_tags")

        if hasattr(doc, 'getroot'):
            # A tree object rather than an element: work on its root.
            doc = doc.getroot()

        # Rename <image> to <img> first so the tag-based rules below see a
        # single spelling (presumably because browsers treat them alike).
        for el in doc.getiterator('image'):
            el.tag = 'img'

        if not self.comments:
            # Comments are being kept, so conditional comments still have
            # to be dealt with separately.
            self.kill_conditional_comments(doc)

        # "Killed" tags are dropped together with their content
        # (drop_tree); "removed" tags are dropped via drop_tag instead.
        kill_tags = set()
        remove_tags = set(self.remove_tags or ())

        if self.scripts:
            kill_tags.add('script')

        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.getiterator():
                attrib = el.attrib
                # Snapshot the names: entries are deleted while looping.
                for aname in list(attrib.keys()):
                    if aname not in safe_attrs:
                        del attrib[aname]

        if self.javascript:
            if not self.safe_attrs_only:
                # The safe-attribute pass above already covers event
                # handlers; only strip ``on*`` attributes when it did not run.
                for el in doc.getiterator():
                    attrib = el.attrib
                    for aname in list(attrib.keys()):
                        if aname.startswith('on'):
                            del attrib[aname]
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # Styles are being kept, so they must be scrubbed of
                # script; when styles are deleted outright this is moot.
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    # Chain on ``new``: substituting on ``old`` again would
                    # throw away the expression() removal just performed.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on: drop the style.
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.getiterator('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # Same chaining as for style attributes above.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on: blank the sheet.
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new

        if self.comments or self.processing_instructions:
            # NOTE(review): comments are killed when *either* option is
            # set; this mirrors the existing behaviour, reason not visible.
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)

        if self.style:
            kill_tags.add('style')
            for el in _find_styled_elements(doc):
                del el.attrib['style']

        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # <link> elements are kept in general, but linked stylesheets
            # must still go when styles or Javascript are being removed.
            for el in list(doc.getiterator('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    # This also catches alternate stylesheets.
                    el.drop_tree()

        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))
        if self.embedded:
            kill_tags.update(('applet', 'param'))
            # These are removed with drop_tag rather than killed.
            remove_tags.update(('iframe', 'object', 'embed', 'layer'))
        if self.frames:
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.getiterator():
            if el.tag in kill_tags:
                _kill.append(el)
            elif el.tag in remove_tags:
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # The root itself cannot be dropped, so turn it into a bare
            # <div> instead.
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # The root itself cannot be dropped, so empty it instead.
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            allow_tags = defs.tags
        if allow_tags:
            # Work on a private copy so the caller's collection is untouched.
            allowed = set(allow_tags)
            # Comments / processing instructions that survived to this
            # point were deliberately kept; do not treat them as unknown
            # tags.
            if not self.comments:
                allowed.add(etree.Comment)
            if not self.processing_instructions:
                allowed.add(etree.ProcessingInstruction)
            bad = [el for el in doc.getiterator() if el.tag not in allowed]
            for el in bad:
                el.drop_tag()

        if self.add_nofollow:
            for el in _find_external_links(doc):
                el.set('rel', 'nofollow')

    def _kill_elements(self, doc, condition, iterate=None):
        """
        Drop (with ``drop_tree``) every element yielded by
        ``doc.getiterator(iterate)`` for which ``condition(el)`` is true.
        """
        # Collect first, then drop: the tree must not change while it is
        # being iterated.
        bad = [el for el in doc.getiterator(iterate) if condition(el)]
        for el in bad:
            el.drop_tree()

    # Matches a ``/* ... */`` CSS comment, including multi-line ones.
    _decomment_re = re.compile(r'/\*.*?\*/', re.S)

    def _has_sneaky_javascript(self, style):
        """
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempts to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.

        Returns True when, after removing CSS comments, backslashes and
        all whitespace and lower-casing, ``style`` contains
        ``javascript:`` or ``expression(``; otherwise False.
        """
        style = self._decomment_re.sub('', style)
        style = style.replace('\\', '')
        style = _whitespace_re.sub('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        """
        Clean ``html`` and return the result in the same form it was given.

        A string is parsed, cleaned and serialised back to a string.  Any
        other value is deep-copied, the copy is cleaned and returned, and
        the caller's object is left untouched.
        """
        if isinstance(html, basestring):
            return_string = True
            doc = fromstring(html)
        else:
            return_string = False
            doc = copy.deepcopy(html)
        self(doc)
        if return_string:
            return tostring(doc)
        return doc
365
# Module-level cleaner instance using the class-level default options;
# calling ``clean(doc)`` cleans ``doc`` in place.
clean = Cleaner()
# Bound ``clean_html`` method of the default cleaner, exposed at module level.
clean_html = clean.clean_html


# Patterns recognised as links.  ``_link_text`` reads the named groups
# ``body`` (the text shown for the link) and ``host`` (checked against the
# avoid-host patterns), so every entry must define both.
_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),

    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_._]+[a-z]))', re.I),
    ]

# Tag names whose text is never auto-linked.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Host patterns that are never turned into links.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
    ]

# CSS class names that switch auto-linking off for an element.
_avoid_classes = ['nolink']
388
# NOTE(review): the ``def`` lines were absent from the reviewed copy.  The
# parameter names come from the recursive call below; the defaults are
# presumed to be the like-named module constants -- confirm.
def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn any URLs into links.

    It will search for links identified by the given regular
    expressions (by default mailto and http(s) links).

    It won't link text in an element in avoid_elements, or an element
    with a class in avoid_classes.  It won't link to anything with a
    host that matches one of the regular expressions in avoid_hosts
    (default localhost and 127.0.0.1).

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    """
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        class_names = class_attr.split()
        for avoided in avoid_classes:
            if avoided in class_names:
                return
    # Iterate over a snapshot: new <a> siblings are inserted as we go.
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if child.tail:
            # The child's tail belongs to ``el``, so links found in it are
            # inserted as siblings right after the child.
            text, tail_children = _link_text(
                child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
            if tail_children:
                child.tail = text
                index = el.index(child)
                el[index+1:index+1] = tail_children
    if el.text:
        # Links found in the leading text become the first children.
        text, pre_children = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if pre_children:
            el.text = text
            el[:0] = pre_children
433
434 -def _link_text(text, link_regexes, avoid_hosts, factory):
435 leading_text = ''
436 links = []
437 last_pos = 0
438 while 1:
439 best_match, best_pos = None, None
440 for regex in link_regexes:
441 regex_pos = last_pos
442 while 1:
443 match = regex.search(text, pos=regex_pos)
444 if match is None:
445 break
446 host = match.group('host')
447 for host_regex in avoid_hosts:
448 if host_regex.search(host):
449 regex_pos = match.end()
450 break
451 else:
452 break
453 if match is None:
454 continue
455 if best_pos is None or match.start() < best_pos:
456 best_match = match
457 best_pos = match.start()
458 if best_match is None:
459
460 if links:
461 assert not links[-1].tail
462 links[-1].tail = text
463 else:
464 assert not leading_text
465 leading_text = text
466 break
467 link = best_match.group(0)
468 end = best_match.end()
469 if link.endswith('.') or link.endswith(','):
470
471 end -= 1
472 link = link[:-1]
473 prev_text = text[:best_match.start()]
474 if links:
475 assert not links[-1].tail
476 links[-1].tail = prev_text
477 else:
478 assert not leading_text
479 leading_text = prev_text
480 anchor = factory('a')
481 anchor.set('href', link)
482 body = best_match.group('body')
483 if not body:
484 body = link
485 if body.endswith('.') or body.endswith(','):
486 body = body[:-1]
487 anchor.text = body
488 links.append(anchor)
489 text = text[end:]
490 return leading_text, links
491
# NOTE(review): the ``def`` line was absent from the reviewed copy; the name
# comes from the ``__doc__`` assignment below and the parameters from the
# names the body reads -- confirm.
def autolink_html(html, *args, **kw):
    # No docstring here on purpose: ``__doc__`` is copied from ``autolink``
    # right after the definition.
    #
    # A string is parsed, auto-linked and serialised back to a string.
    # Anything else is deep-copied first, so the caller's tree is left
    # untouched, and the processed copy is returned.
    if isinstance(html, basestring):
        doc = fromstring(html)
        return_string = True
    else:
        doc = copy.deepcopy(html)
        return_string = False
    autolink(doc, *args, **kw)
    if return_string:
        return tostring(doc)
    return doc

autolink_html.__doc__ = autolink.__doc__
506
507
508
509
510
# Tag names whose text is never broken, and CSS class names that switch
# word breaking off for an element.
_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']


# NOTE(review): the ``def`` lines were absent from the reviewed copy.  The
# parameter names come from the recursive call below.  The list defaults are
# presumed to be the constants above, ``break_character`` follows the
# docstring (zero width space), and ``max_width=40`` is presumed -- confirm
# against the upstream source.
def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=u'\u200b'):
    """
    Breaks any long words found in the body of the text (not attributes).

    Doesn't affect any of the tags in avoid_elements, by default
    ``<textarea>``, ``<pre>`` and ``<code>``, nor elements carrying one
    of the classes in avoid_classes.

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    """
    # Honour the caller-supplied list; the module-level default is only
    # the fallback expressed in the signature.
    if el.tag in avoid_elements:
        return
    class_attr = el.get('class')
    if class_attr:
        class_names = class_attr.split()
        for avoid in avoid_classes:
            if avoid in class_names:
                return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        # A child's tail is text of *this* element, so it is broken even
        # when the child itself was skipped.
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)
554
559
560 -def _break_text(text, max_width, break_character):
561 words = text.split()
562 for word in words:
563 if len(word) > max_width:
564 replacement = _insert_break(word, max_width, break_character)
565 text = text.replace(word, replacement)
566 return text
567
568 _break_prefer_re = re.compile(r'[^a-z]', re.I)
569
571 orig_word = word
572 result = ''
573 while len(word) > width:
574 start = word[:width]
575 breaks = list(_break_prefer_re.finditer(start))
576 if breaks:
577 last_break = breaks[-1]
578
579 if last_break.end() > width-10:
580
581
582 start = word[:last_break.end()]
583 result += start + break_character
584 word = word[len(start):]
585 result += word
586 return result
587