Package lxml :: Package html
[hide private]
[frames] | [no frames]

Source Code for Package lxml.html

   1  import threading 
   2  import re 
   3  import urlparse 
   4  import copy 
   5  from lxml import etree 
   6  from lxml.html import defs 
   7  from lxml import cssselect 
   8  from lxml.html.setmixin import SetMixin 
   9  try: 
  10      from UserDict import DictMixin 
  11  except ImportError: 
  12      # DictMixin was introduced in Python 2.4 
  13      from lxml.html._dictmixin import DictMixin 
  14  import sets 
  15   
  16  __all__ = [ 
  17      'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 
  18      'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 
  19      'find_rel_links', 'find_class', 'make_links_absolute', 
  20      'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] 
  21   
  22  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") 
  23  #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) 
  24  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  25  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  26  _collect_string_content = etree.XPath("string()") 
  27  _css_url_re = re.compile(r'url\((.*?)\)', re.I) 
  28  _css_import_re = re.compile(r'@import "(.*?)"') 
  29  _label_xpath = etree.XPath("//label[@for=$id]") 
  30   
31 -class HtmlMixin(object):
32
def base_url(self):
    """
    The URL recorded in the docinfo of this element's tree, i.e. the
    base URL given when the page was parsed.

    Combine it with a link through
    ``urlparse.urljoin(el.base_url, href)`` to get an absolute URL.
    """
    tree = self.getroottree()
    return tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    A list of every ``form`` element yielded by ``getiterator('form')``.
    """
    form_iterator = self.getiterator('form')
    return list(form_iterator)
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.
    """
    matches = self.xpath('//body')
    return matches[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    Return the <head> element.  Can be called from a child element
    to get the document's head.
    """
    matches = self.xpath('//head')
    return matches[0]
head = property(head, doc=head.__doc__)
def label__get(self):
    """
    Get or set any <label> element associated with this element.

    The label is looked up through this element's ``id``; None is
    returned when the element has no id or no label refers to it.
    """
    element_id = self.get('id')
    if element_id:
        matches = _label_xpath(self, id=element_id)
        if matches:
            return matches[0]
    return None

def label__set(self, label):
    # A label is linked through its ``for`` attribute, so the target
    # element needs an id to point at.
    element_id = self.get('id')
    if not element_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if not (label.tag == 'label'):
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', element_id)

def label__del(self):
    # Detach the current label (if any) by dropping its ``for`` attribute.
    current = self.label
    if current is None:
        return
    del current.attrib['for']

label = property(label__get, label__set, label__del, doc=label__get.__doc__)
def drop_tree(self):
    """
    Remove this element, together with its children and text, from
    the tree.  Its tail text is kept: it is appended to the tail of
    the previous sibling, or to the parent's text when there is no
    previous sibling.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(self)
110
def drop_tag(self):
    """
    Remove the tag, but not its children or text.  The children and text
    are merged into the parent.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('//b').drop_tag()
        >>> print tostring(h)
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    sibling = self.getprevious()
    if self.text and isinstance(self.tag, basestring):
        # not a Comment, etc.
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + self.text
        else:
            parent.text = (parent.text or '') + self.text
    if self.tail:
        if len(self):
            # The children are moved up, so the tail now follows the
            # last of them.
            final_child = self[-1]
            final_child.tail = (final_child.tail or '') + self.tail
        elif sibling is not None:
            sibling.tail = (sibling.tail or '') + self.tail
        else:
            parent.text = (parent.text or '') + self.tail
    # Replace this element by its own children, in place.
    position = parent.index(self)
    parent[position:position+1] = self[:]
142 150
def find_class(self, class_name):
    """
    Return the result of evaluating the module's pre-compiled class
    XPath on this element with the given class name.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
156
def get_element_by_id(self, id, *default):
    """
    Get the first element in a document with the given id.  If none is
    found, return the default argument if provided or raise KeyError
    otherwise.

    Note that there can be more than one element with the same id,
    and this isn't uncommon in HTML documents found in the wild.
    Browsers return only the first match, and this function does
    the same.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        return _id_xpath(self, id=id)[0]
    except IndexError:
        if default:
            return default[0]
        # Exception-call syntax instead of the ``raise KeyError, id``
        # statement form, which only Python 2 accepts.
        raise KeyError(id)
177
def text_content(self):
    """
    Return the text content of this element, including the text of
    any children (the XPath ``string()`` value of the element).
    """
    content = _collect_string_content(self)
    return content
183
def cssselect(self, expr):
    """
    Evaluate the CSS expression against this element and return what
    the selector returns.

    This is the same as ``lxml.cssselect.CSSSelect(expr)(self)``.  The
    selector is compiled on every call, so pre-compiling it yourself
    can provide a substantial speedup when it is used repeatedly.
    """
    selector = cssselect.CSSSelect(expr)
    return selector(self)
194 195 ######################################## 196 ## Link functions 197 ######################################## 198 218 self.rewrite_links(link_repl)
219
def resolve_base_href(self):
    """
    Apply the document's ``<base href>`` to all of its links and
    remove the ``<base>`` tags.

    Every ``<base>`` tag with an ``href`` is dropped from the tree;
    when several are present the href of the last one is used.
    Nothing else happens if no non-empty href was found.
    """
    href = None
    for base_tag in self.xpath('//base[@href]'):
        href = base_tag.get('href')
        base_tag.drop_tree()
    if href:
        # The base tags are already gone, so do not resolve them again.
        self.make_links_absolute(href, resolve_base_href=False)
234 259 306 307
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        """
        ``name`` is the method to look up on the document, ``copy`` the
        default for copying element inputs before calling it, and
        ``source_class`` the class whose method docstring is reused.
        """
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc``.

        A string ``doc`` is parsed with ``fromstring`` first; the
        ``copy`` keyword is rejected for strings with a TypeError.  An
        element ``doc`` is deep-copied first when ``copy`` (keyword, or
        the default given at construction) is true.

        Returns the method's result, or, when that is None, the
        document itself (serialized if the input was a string).
        """
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            return_string = True
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be called ``copy``: that would shadow the
            # ``copy`` module and break the deepcopy() call below.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            return_string = False
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            if return_string:
                return tostring(doc)
            else:
                return doc
        else:
            return result
# Function forms of the HtmlMixin methods.  The wrappers created with
# copy=True call the method on a deep copy of an element argument by
# default; the others call it on the element that was passed in.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    """Comment node class that also provides the HtmlMixin methods."""
356
class HtmlElement(etree.ElementBase, HtmlMixin):
    """Element class that also provides the HtmlMixin methods."""
359
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    """Processing instruction class that also provides the HtmlMixin methods."""
362
class HtmlEntity(etree.EntityBase, HtmlMixin):
    """Entity node class that also provides the HtmlMixin methods."""
365 366
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be given either as a mapping or as an iterable of
    ``(tag name, mixin class)`` pairs.
    """
    # Tag name -> Element class; used when no ``classes`` argument is given.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, which
            # cannot be unpacked into (name, value); go through items().
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    # '*' applies to every tag that has a class registered
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so that they take precedence over
                # the element class they are mixed into.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """
        Return the class to use for a node of the given type, or None
        to fall through to the normal lookup.
        """
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
407 408 ################################################################################ 409 # parsing 410 ################################################################################ 411
def document_fromstring(html, **kw):
    """
    Parse ``html`` with ``etree.HTML`` and the module's ``html_parser``,
    passing any keyword arguments through.

    Raises ``etree.ParserError`` when the parser returns None.
    """
    root = etree.HTML(html, html_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
418
def fragments_fromstring(html, no_leading_text=False, **kw):
    """
    Parse several HTML elements and return them as a list.

    If the fragment starts with text, that text is the first item of
    the list (whitespace-only text is dropped).  With no_leading_text
    true, leading text is an error (``etree.ParserError``) and the
    result contains only elements.
    """
    # FIXME: check what happens when you give html with a body, head, etc.
    prefix = html[:20].lstrip().lower()
    looks_like_document = (prefix.startswith('<html')
                           or prefix.startswith('<!doctype'))
    if not looks_like_document:
        # Wrap the fragment so that it is parsed as a document body
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, **kw)
    assert doc.tag == 'html'
    bodies = [child for child in doc if child.tag == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading_text = body.text
    has_leading_text = leading_text and leading_text.strip()
    if no_leading_text and has_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % body.text)
    elements = []
    if has_leading_text:
        elements.append(leading_text)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
447
def fragment_fromstring(html, create_parent=False, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    Extra keyword arguments are passed on to ``fragments_fromstring``.
    Raises ``etree.ParserError`` when the input does not consist of
    exactly one element.
    """
    if create_parent:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        return fragment_fromstring('<%s>%s</%s>' % (
            create_parent, html, create_parent), **kw)
    # Forward **kw here as well; it used to be silently dropped on this
    # path although the create_parent path passed it along.
    elements = fragments_fromstring(html, no_leading_text=True, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el
476
def fromstring(html, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    The parsed document is returned when the input looks like a full
    document, or has a ``<head>``, or has no ``<body>`` at all.  A body
    holding exactly one element and no surrounding text yields that
    element.  Otherwise the body is returned, renamed to ``div`` or
    ``span`` depending on whether it contains block-level tags.
    """
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <head> nor <body>: nothing to unwrap, and the checks
        # below would fail on None, so hand back the document as parsed.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
531
def parse(filename, parser=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document.

    The module's ``html_parser`` is used unless ``parser`` is given.
    Keyword arguments are handed to ``etree.parse``; for instance
    ``base_url='http://...'`` sets the base URL.
    """
    if parser is None:
        return etree.parse(filename, html_parser, **kw)
    return etree.parse(filename, parser, **kw)
542
def _contains_block_level_tag(el):
    """
    Return True if any node yielded by ``el.getiterator()`` has a tag
    listed in ``defs.block_tags``, False otherwise.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.getiterator():
        if node.tag in block_tags:
            return True
    return False
550
def _element_name(el):
    """
    Return a short name for a parsed item: ``'comment'`` for comment
    nodes, ``'string'`` for text, and the tag for anything else.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag
558 559 ################################################################################ 560 # form handling 561 ################################################################################ 562
class FormElement(HtmlElement):
    """
    Element class for ``<form>`` tags.
    """

    def inputs(self):
        """
        An `InputGetter` for this form, giving access to all of its
        input elements.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def fields__get(self):
        """
        Dictionary-like object (a `FieldsDict`) over the fields of this
        form.  Setting a value in it changes the corresponding input.
        """
        return FieldsDict(self.inputs)

    def fields__set(self, value):
        # Assign every given field, then reset to None all named fields
        # that the new mapping did not mention.
        fields = self.fields
        stale_keys = fields.keys()
        for key, item in value.iteritems():
            if key in stale_keys:
                stale_keys.remove(key)
            fields[key] = item
        for key in stale_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(fields__get, fields__set, doc=fields__get.__doc__)

    def _name(self):
        # Label for use in reprs: the name, else '#' + id, else the
        # index of this form among the body's ``form`` children.
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        return str(self.body.findall('form').index(self))

    def form_values(self):
        """
        Return the field values of the form as a list of
        ``(name, value)`` tuples, suitable for ``urllib.urlencode()``.

        Inputs without a name, unchecked checkable inputs, inputs of
        type submit/image/reset and inputs whose value is None are
        left out; a multi-select contributes one tuple per value.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = el.tag
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    results.extend([(name, v) for v in value])
                elif value is not None:
                    results.append((name, el.value))
            else:
                assert el.tag == 'input', (
                    "Unexpected tag: %r" % el)
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                if el.value is not None:
                    results.append((name, el.value))
        return results

    def action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL, the action that is returned
        is joined with it.
        """
        base_url = self.base_url
        action = self.get('action')
        if not base_url or action is None:
            return action
        return urlparse.urljoin(base_url, action)

    def action__set(self, value):
        self.set('action', value)

    def action__del(self):
        # Deleting a missing attribute is not an error
        if 'action' in self.attrib:
            del self.attrib['action']

    action = property(action__get, action__set, action__del, doc=action__get.__doc__)

    def method__get(self):
        """
        Get/set the form's method.  The value is always upper-cased,
        and ``'GET'`` is returned when the attribute is missing.
        """
        method = self.get('method', 'GET')
        return method.upper()

    def method__set(self, value):
        self.set('method', value.upper())

    method = property(method__get, method__set, doc=method__get.__doc__)

# Use FormElement for every <form> tag by default.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever the opener
    returns; with the default opener that is a file-like object, as
    from ``urllib.urlopen()``, which also has a ``.geturl()`` function
    that shows the URL if there were any redirects.

    You can use this like::

        >>> form = doc.forms[0]
        >>> form.inputs['foo'].value = 'bar' # etc
        >>> response = form.submit()
        >>> doc = parse(response)
        >>> doc.make_links_absolute(response.geturl())

    ``extra_values`` (a mapping or a sequence of ``(name, value)``
    tuples) is appended to the values taken from the form.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    pairs = form.form_values()
    if extra_values:
        extras = extra_values
        if hasattr(extras, 'items'):
            extras = extras.items()
        pairs.extend(extras)
    opener = open_http
    if opener is None:
        opener = open_http_urllib
    return opener(form.method, form.action, pairs)
698
def open_http_urllib(method, url, values):
    """
    Default opener for `submit_form`, based on ``urllib``.

    For ``'GET'`` the url-encoded values are appended to the URL
    (after ``?``, or ``&`` if the URL already contains a ``?``); for
    any other method they are passed to ``urllib.urlopen`` as data.
    """
    import urllib
    ## FIXME: should test that it's not a relative URL or something
    encoded = urllib.urlencode(values)
    if method != 'GET':
        return urllib.urlopen(url, encoded)
    if '?' in url:
        separator = '&'
    else:
        separator = '?'
    return urllib.urlopen(url + separator + encoded, None)
712
class FieldsDict(DictMixin):
    """
    Dictionary-like view on the values of a form's inputs.

    Reading and writing a key reads and writes the ``value`` of the
    input(s) that the wrapped inputs object returns for that name.
    Keys cannot be deleted.
    """

    def __init__(self, inputs):
        # The object the inputs are looked up in, by name.
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)
733
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    Fields are fetched by name, as in ``form.inputs['field_name']``.
    Several checkboxes sharing a name come back as one list-like
    `CheckboxGroup` (which also allows value setting); several radio
    inputs sharing a name come back as a `RadioGroup`.

    Iterating over this object yields every input element
    individually, so checkboxes and radios are not grouped there and
    the result differs from looking up each name.
    """

    # select/input/textarea descendants carrying the requested name
    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
    # all select/input/textarea descendants
    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        # The type of the first match decides how duplicates are grouped
        input_type = results[0].get('type')
        if len(results) > 1:
            group = None
            if input_type == 'radio':
                group = RadioGroup(results)
            elif input_type == 'checkbox':
                group = CheckboxGroup(results)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return results[0]

    def __contains__(self, name):
        if self._name_xpath(self.form, name=name):
            return True
        return False

    def keys(self):
        # Distinct names of all inputs, in no particular order
        names = sets.Set([el.name for el in self])
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)
797
class InputMixin(object):

    """
    Behaviour shared by all input elements (input, select, and
    textarea): the ``name`` property and a descriptive repr.
    """

    def name__get(self):
        """
        Get/set the name of the element (its ``name`` attribute).
        """
        return self.get('name')

    def name__set(self, value):
        self.set('name', value)

    def name__del(self):
        # Deleting a missing name is not an error
        if 'name' in self.attrib:
            del self.attrib['name']

    name = property(name__get, name__set, name__del, doc=name__get.__doc__)

    def __repr__(self):
        # Only classes that have a (non-empty) ``type`` show it
        input_type = getattr(self, 'type', None)
        if input_type:
            type_part = ' type=%r' % input_type
        else:
            type_part = ''
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
825
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  ``.name`` gives the name and ``.value``
    gets or sets the value.
    """

    def value__get(self):
        """
        Get/set the value, which is the text content of this element
        (``''`` when the element has no text).
        """
        text = self.text
        return text or ''

    def value__set(self, value):
        self.text = value

    def value__del(self):
        self.text = ''

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

# Use TextareaElement for every <textarea> tag by default.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` is the value of the selected option, or, for a
    multi-select element (``<select multiple>``), a set-like object.
    In either case ``.value_options`` gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.  Otherwise it is the
        ``value`` attribute of the first option marked as selected,
        or None when no option is selected.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selected = self.value
            selected.clear()
            selected.update(value)
            return
        # Find the option to select before anything is modified, so
        # that an unknown value leaves the selection untouched.
        chosen = None
        if value is not None:
            for option in self.getiterator('option'):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        values = []
        for option in self.getiterator('option'):
            values.append(option.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib

    def multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']

    multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)

# Use SelectElement for every <select> tag by default.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        # The <select> element whose options are represented.
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return self.select.getiterator('option')
    options = property(options)

    def __iter__(self):
        """
        Iterate over the values of the options that are selected.
        """
        for option in self.options:
            # Only selected options are members of this set; yielding
            # every option would report unselected values as selected.
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """
        Select the first option whose value is ``item``; raise
        ValueError if there is no such option.
        """
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``; raise
        ValueError if it is not selected or does not exist.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is not option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
977
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the list behaviour, the ``.value`` property checks and
    unchecks the inputs, and ``.value_options`` lists the possible
    values.
    """

    def value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).  Reading gives the value of the
        first checked radio, or None if none is checked.
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def value__set(self, value):
        # Find the radio to check first, so that an unknown value
        # raises before any radio has been unchecked.
        chosen = None
        if value is not None:
            for radio in self:
                if radio.get('value') == value:
                    chosen = radio
                    break
            if chosen is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if chosen is not None:
            chosen.set('checked', '')

    def value__del(self):
        # Deleting the value unchecks every radio
        self.value = None

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for radio in self:
            options.append(radio.get('value'))
        return options
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, list.__repr__(self))
1029
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def value__set(self, value):
        # Validate before clearing, so that a rejected assignment does
        # not leave every checkbox unchecked as a side effect.
        if not hasattr(value, '__iter__'):
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (self[0].name, value))
        checked = self.value
        checked.clear()
        checked.update(value)

    def value__del(self):
        self.value.clear()

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1061
class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group
    of checkboxes with the same name.
    """

    def __init__(self, group):
        # The group (a sequence of checkbox elements) being viewed.
        self.group = group

    def __iter__(self):
        checked_values = []
        for box in self.group:
            if 'checked' in box.attrib:
                checked_values.append(box.get('value'))
        return iter(checked_values)

    def add(self, value):
        # Check the first checkbox carrying this value
        for box in self.group:
            if box.get('value') == value:
                box.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        # Uncheck the first checkbox carrying this value; it is an
        # error if that checkbox is not checked.
        for box in self.group:
            if box.get('value') != value:
                continue
            if 'checked' not in box.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del box.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        shown_values = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            shown_values,
            self.group.name)
1104
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the lower-cased ``type`` attribute (``'text'`` when
    the attribute is absent).

    ``.value`` gets, sets and deletes the value.

    ``.checkable`` is True for checkboxes and radios and False for every
    other type; checkable inputs also have a boolean ``.checked``.
    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        For a checkable input (checkbox or radio): returns None when it
        is not checked, and falls back to ``'on'`` when it is checked
        but has no (or an empty) ``value`` attribute.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # Only string values replace the ``value`` attribute; any other
            # truthy value (e.g. True) just checks the input.
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def value__del(self):
        if self.checkable:
            # Deleting the value of a checkable input unchecks it and
            # leaves the ``value`` attribute alone.
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def type__get(self):
        """
        Return the type of this element (the lower-cased ``type``
        attribute, ``'text'`` if it is missing).
        """
        declared_type = self.get('type', 'text')
        return declared_type.lower()

    def type__set(self, value):
        self.set('type', value)

    type = property(type__get, type__set, doc=type__get.__doc__)

    def checkable__get(self):
        """
        Boolean: can this element be checked?
        """
        return self.type in ('checkbox', 'radio')

    checkable = property(checkable__get, doc=checkable__get.__doc__)

    def checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        Raises AttributeError on input types that are not checkable.
        """
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        return 'checked' in self.attrib

    def checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(checked__get, checked__set, doc=checked__get.__doc__)

# Map the 'input' tag name to InputElement in the class table held by
# HtmlElementClassLookup (presumably consulted when the parser builds
# element objects -- confirm against HtmlElementClassLookup).
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def for_element__get(self):
        """
        Get/set/delete the element this label points to.

        Getting returns None when the label has no (or an empty) ``for``
        attribute; otherwise it returns the result of looking that id up
        with ``self.body.get_element_by_id``.
        NOTE(review): behaviour when no element carries that id depends
        on ``get_element_by_id`` -- confirm whether it returns None or
        raises.

        Setting copies the ``id`` of the given element into ``for`` and
        raises TypeError if that element has no ``id``.

        Deleting removes the ``for`` attribute, if present.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    def for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def for_element__del(self):
        # The link is stored in ``for``; the label's own ``id`` attribute
        # is unrelated and must be left untouched.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(for_element__get, for_element__set, for_element__del,
                           doc=for_element__get.__doc__)

# Map the 'label' tag name to LabelElement in the class table held by
# HtmlElementClassLookup.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# (Bound ``sub`` method of the compiled pattern, so it is called as
# ``__replace_meta_content_type(replacement, text)``; the non-greedy
# ``.*?>`` stops at the first ``>`` after the http-equiv attribute.)
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub

def tostring(doc, pretty_print=False, include_meta_content_type=False):
    """
    Return the HTML string representation of the given document.

    ``doc`` is serialized with ``etree.tostring(..., method="html")``;
    ``pretty_print`` is passed straight through to that call.

    Note: the serialized output can contain a
    ``<meta http-equiv="Content-Type" ...>`` tag.  Unless
    ``include_meta_content_type`` is true, every such tag is stripped
    from the result, which also removes one that was already present in
    the document.
    """
    assert doc is not None
    serialized = etree.tostring(doc, method="html", pretty_print=pretty_print)
    if include_meta_content_type:
        return serialized
    return __replace_meta_content_type('', serialized)
1243
def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    ``doc`` is either an object with a ``write`` method (used as is) or
    an element, which is wrapped in an ``etree.ElementTree`` first.

    The ``file://`` URL of the temporary file is printed before the
    browser is launched.  The temporary file is not removed afterwards.
    """
    import os
    import tempfile
    import webbrowser
    try:
        write_doc = doc.write
    except AttributeError:
        write_doc = etree.ElementTree(element=doc).write
    # mkstemp() creates and opens a uniquely named file in one step.
    # os.tempnam() only returned a name, leaving a window in which another
    # process could create (or symlink) that path before we wrote to it.
    handle, fn = tempfile.mkstemp(suffix='.html')
    f = os.fdopen(handle, 'wb')
    try:
        write_doc(f, method="html")
    finally:
        # The file itself is intentionally left on disk for the browser,
        # but the descriptor must not leak.
        f.close()
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1260 1261 ################################################################################ 1262 # configure Element class lookup 1263 ################################################################################ 1264
class HTMLParser(etree.HTMLParser):
    """
    ``etree.HTMLParser`` subclass that installs a fresh
    ``HtmlElementClassLookup`` as its element class lookup.

    All keyword arguments are forwarded unchanged to
    ``etree.HTMLParser``.
    """

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        class_lookup = HtmlElementClassLookup()
        self.setElementClassLookup(class_lookup)
1269
def Element(*args, **kw):
    """
    Create a new element by forwarding all positional and keyword
    arguments to ``makeelement`` of the module-level ``html_parser``.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level HTMLParser instance; Element() creates elements through
# its makeelement() method.
html_parser = HTMLParser()
