Package lxml :: Package html
[hide private]
[frames] | [no frames]

Source Code for Package lxml.html

   1  import threading 
   2  import re 
   3  import urlparse 
   4  import copy 
   5  from lxml import etree 
   6  from lxml.html import defs 
   7  from lxml import cssselect 
   8  from lxml.html.setmixin import SetMixin 
   9  try: 
  10      from UserDict import DictMixin 
  11  except ImportError: 
  12      # DictMixin was introduced in Python 2.4 
  13      from lxml.html._dictmixin import DictMixin 
  14  import sets 
  15   
  16  __all__ = ['document_fromstring', 'tostring', 'Element', 'defs', 
  17             'find_rel_links', 'find_class', 'make_links_absolute', 
  18             'resolve_base_href', 'iterlinks', 'rewrite_links', 'open_in_browser'] 
  19   
  20  _rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]") 
  21  #_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'}) 
  22  _class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]") 
  23  _id_xpath = etree.XPath("descendant-or-self::*[@id=$id]") 
  24  _collect_string_content = etree.XPath("string()") 
  25  _css_url_re = re.compile(r'url\((.*?)\)', re.I) 
  26  _css_import_re = re.compile(r'@import "(.*?)"') 
  27  _label_xpath = etree.XPath("//label[@for=$id]") 
  28   
29 -class HtmlMixin(object):
30
def base_url(self):
    """
    The URL recorded on the document this element belongs to
    (``getroottree().docinfo.URL``).

    Combine it with a link via ``urlparse.urljoin(el.base_url, href)``
    to obtain an absolute URL.
    """
    tree = self.getroottree()
    return tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
def forms(self):
    """
    List of every element produced by ``self.getiterator('form')``.
    """
    found = []
    for form in self.getiterator('form'):
        found.append(form)
    return found
forms = property(forms, doc=forms.__doc__)
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    Raises IndexError when the ``//body`` query matches nothing.
    """
    return self.xpath('//body')[0]
body = property(body, doc=body.__doc__)
def head(self):
    """
    The first match of the ``//head`` query, i.e. the document's <head>
    element.  Works from any element of the document.
    """
    matches = self.xpath('//head')
    return matches[0]
head = property(head, doc=head.__doc__)
def label__get(self):
    """
    Get or set any <label> element associated with this element.
    """
    own_id = self.get('id')
    if own_id:
        # Labels are matched through their ``for`` attribute.
        matches = _label_xpath(self, id=own_id)
        if matches:
            return matches[0]
    return None
def label__set(self, label):
    own_id = self.get('id')
    if not own_id:
        # Without an id there is nothing a label could point at.
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if label.tag != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', own_id)
def label__del(self):
    current = self.label
    if current is not None:
        del current.attrib['for']
label = property(label__get, label__set, label__del, doc=label__get.__doc__)
def drop_tree(self):
    """
    Detach this element (with its children and text) from its parent.
    Any tail text is appended to the previous sibling's tail, or to the
    parent's text when there is no previous sibling.
    """
    parent = self.getparent()
    assert parent is not None
    trailing = self.tail
    if trailing:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + trailing
        else:
            parent.text = (parent.text or '') + trailing
    parent.remove(self)
108
def drop_tag(self):
    """
    Remove the tag, but not its children or text.  The children and text
    are merged into the parent.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print tostring(h)
        <div>Hello World!</div>
    """
    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()
    if self.text and isinstance(self.tag, basestring):
        # not a Comment, etc.
        # The element's own text goes where the element started.
        if previous is None:
            parent.text = (parent.text or '') + self.text
        else:
            previous.tail = (previous.tail or '') + self.text
    if self.tail:
        if len(self):
            # The tail follows the last child once the children are hoisted.
            last = self[-1]
            last.tail = (last.tail or '') + self.tail
        elif previous is None:
            parent.text = (parent.text or '') + self.tail
        else:
            previous.tail = (previous.tail or '') + self.tail
    # Replace this element by its children, in place.
    index = parent.index(self)
    parent[index:index+1] = self[:]
140 148
def find_class(self, class_name):
    """
    Return the result of the module's class-matching XPath query for
    ``class_name``, evaluated with this element as context.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
154
def get_element_by_id(self, id, *default):
    """
    Get the first element in a document with the given id.  If none is
    found, return the default argument if provided or raise KeyError
    otherwise.

    Note that there can be more than one element with the same id,
    and this isn't uncommon in HTML documents found in the wild.
    Browsers return only the first match, and this function does
    the same.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        return _id_xpath(self, id=id)[0]
    except IndexError:
        # ``*default`` distinguishes "no default given" from an explicit
        # default of None.
        if default:
            return default[0]
        raise KeyError(id)
175
def text_content(self):
    """
    Return the XPath ``string()`` value of this element: its text
    together with the text of its children.
    """
    content = _collect_string_content(self)
    return content
181
def cssselect(self, expr):
    """
    Evaluate the CSS expression ``expr`` against this element and its
    children and return the matches.

    Same as ``lxml.cssselect.CSSSelect(expr)(self)``; the selector is
    compiled on every call, so pre-compiling the expression can provide
    a substantial speedup.
    """
    selector = cssselect.CSSSelect(expr)
    return selector(self)
192 193 ######################################## 194 ## Link functions 195 ######################################## 196 216 self.rewrite_links(link_repl)
217
def resolve_base_href(self):
    """
    Apply the document's ``<base href>`` to all links and remove the tag.

    Every ``<base>`` element with an ``href`` is dropped from the tree;
    the href of the last one visited is the one applied.  Nothing else
    happens when no non-empty href was found.
    """
    base_href = None
    for base_tag in self.xpath('//base[@href]'):
        base_href = base_tag.get('href')
        base_tag.drop_tree()
    if base_href:
        # The base tags are already gone, so skip resolving them again.
        self.make_links_absolute(base_href, resolve_base_href=False)
232 257 304 305
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """

    def __init__(self, name, copy=False, source_class=HtmlMixin):
        # name: name of the method to look up on the document/element.
        # copy: default for whether element inputs are deep-copied first.
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__

    def __call__(self, doc, *args, **kw):
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            return_string = True
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be named ``copy``: a local of that name
            # would hide the ``copy`` module used right below.
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            return_string = False
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            if return_string:
                return tostring(doc)
            else:
                return doc
        else:
            return result


# Function forms of the HtmlMixin link/class helpers.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    # Comment class: etree.CommentBase combined with the HtmlMixin helpers.
    pass
354
class HtmlElement(etree.ElementBase, HtmlMixin):
    # Element class: etree.ElementBase combined with the HtmlMixin helpers.
    pass
357
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    # Processing-instruction class: etree.PIBase combined with HtmlMixin.
    pass
360
class HtmlEntity(etree.EntityBase, HtmlMixin):
    # Entity class: etree.EntityBase combined with the HtmlMixin helpers.
    pass
363 364
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may also be given as a sequence of ``(tag name, Mixin class)``
    pairs.
    """
    # Tag name -> Element class; used when no ``classes`` argument is given.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping directly yields only its keys, so a dict
            # (as documented) has to be walked through items().
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                cur = classes.get(name, HtmlElement)
                # Mixins come first so they take precedence over the
                # element class they are mixed into.
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
# Module-level HTML parser instance.
html_parser = etree.HTMLParser()
def document_fromstring(html, **kw):
    """
    Parse ``html`` with ``etree.HTML`` and the module's ``html_parser``
    and return the result.

    Extra keyword arguments are handed to ``etree.HTML``.  Raises
    ``etree.ParserError`` when the parse result is None.
    """
    root = etree.HTML(html, html_parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
415
def fragments_fromstring(html, no_leading_text=False, **kw):
    """
    Parse several HTML elements and return them as a list.

    When the body starts with non-whitespace text, that text is the
    first item of the list.  With ``no_leading_text`` true such text is
    an error (``etree.ParserError``) and the list holds only elements.
    """
    # FIXME: check what happens when you give html with a body, head, etc.
    start = html[:20].lstrip().lower()
    looks_complete = (start.startswith('<html')
                      or start.startswith('<!doctype'))
    if not looks_complete:
        # Wrap bare fragments so the parser always produces a body.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, **kw)
    assert doc.tag == 'html'
    bodies = [e for e in doc if e.tag == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    leading = body.text
    has_leading_text = bool(leading and leading.strip())
    if has_leading_text and no_leading_text:
        raise etree.ParserError(
            "There is leading text: %r" % leading)
    elements = []
    if has_leading_text:
        elements.append(leading)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
444
def fragment_fromstring(html, create_parent=False, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.

    Extra keyword arguments are passed on to `fragments_fromstring`.
    Raises ``etree.ParserError`` when the input does not hold exactly
    one element.
    """
    if create_parent:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        return fragment_fromstring('<%s>%s</%s>' % (
            create_parent, html, create_parent), **kw)
    # Forward **kw; otherwise the caller's parsing options are lost.
    elements = fragments_fromstring(html, no_leading_text=True, **kw)
    if not elements:
        raise etree.ParserError(
            "No elements found")
    if len(elements) > 1:
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join([_element_name(e) for e in elements]))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    # Whitespace-only tail is discarded.
    el.tail = None
    return el
473
def fromstring(html, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    Returns the whole document when the input looks like a full document,
    when a <head> was produced, or when no <body> was produced; a single
    element when the body holds exactly one element and no other text;
    otherwise the body itself, renamed to ``div`` or ``span``.
    """
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, **kw)
    bodies = doc.findall('body')
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither <body> nor <head>: there is nothing to unwrap, and
        # len(None) below would raise TypeError.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
528
def parse(filename, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    using the module's ``html_parser``.

    Keyword arguments go to ``etree.parse``; pass
    ``base_url='http://...'`` to set the base URL.
    """
    tree = etree.parse(filename, html_parser, **kw)
    return tree
537
def _contains_block_level_tag(el):
    """
    Tell whether ``el.getiterator()`` yields any node whose tag is listed
    in ``defs.block_tags``.
    """
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    for node in el.getiterator():
        if node.tag in defs.block_tags:
            return True
    return False
545
def _element_name(el):
    """
    Short label for one parsed item: ``'comment'`` for comment nodes,
    ``'string'`` for text, otherwise the element's tag.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return el.tag
553
def Element(*args, **kw):
    """
    Create an element with ``html_parser.makeelement``; all arguments
    are passed through unchanged.
    """
    return html_parser.makeelement(*args, **kw)
557
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        An `InputGetter` giving access to the input elements of this form.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def fields__get(self):
        """
        Dictionary-like object (a `FieldsDict`) over the fields of this
        form.  Assigning values through it changes the form.
        """
        return FieldsDict(self.inputs)
    def fields__set(self, value):
        fields = self.fields
        stale_keys = fields.keys()
        for key, field_value in value.iteritems():
            if key in stale_keys:
                stale_keys.remove(key)
            fields[key] = field_value
        for key in stale_keys:
            # FIXME: but right now I don't even allow
            # deleting, and I'm not sure what it would
            # mean if I did.
            del fields[key]

    fields = property(fields__get, fields__set, doc=fields__get.__doc__)

    def _name(self):
        # Identify the form by name, else by '#id', else by its position
        # among the <form> children of the body.
        name = self.get('name')
        if name:
            return name
        form_id = self.get('id')
        if form_id:
            return '#' + form_id
        return str(self.body.findall('form').index(self))

    def form_values(self):
        """
        Return the field values of the form as a list of
        ``(name, value)`` tuples, suitable for ``urllib.urlencode()``.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                # Unnamed fields are left out.
                continue
            tag = el.tag
            if tag == 'textarea':
                results.append((name, el.value))
                continue
            if tag == 'select':
                value = el.value
                if el.multiple:
                    results.extend([(name, v) for v in value])
                elif value is not None:
                    results.append((name, value))
                continue
            assert tag == 'input', (
                "Unexpected tag: %r" % el)
            if el.checkable and not el.checked:
                continue
            if el.type in ('submit', 'image', 'reset'):
                continue
            value = el.value
            if value is not None:
                results.append((name, value))
        return results

    def action__get(self):
        """
        Get/set the form's ``action`` attribute.
        """
        base_url = self.base_url
        action = self.get('action')
        if action is None or not base_url:
            return action
        # Resolve the action against the document's base URL.
        return urlparse.urljoin(base_url, action)
    def action__set(self, value):
        self.set('action', value)
    def action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(action__get, action__set, action__del, doc=action__get.__doc__)

    def method__get(self):
        """
        Get/set the form's method.  Always returned upper-cased;
        ``'GET'`` when the attribute is missing.
        """
        method = self.get('method', 'GET')
        return method.upper()
    def method__set(self, value):
        self.set('method', value.upper())
    method = property(method__get, method__set, doc=method__get.__doc__)


HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    You can use this like::

        >>> form = doc.forms[0]
        >>> form.inputs['foo'].value = 'bar' # etc
        >>> response = submit_form(form)
        >>> doc = parse(response)
        >>> doc.make_links_absolute(response.geturl())

    ``extra_values`` may be a mapping or a sequence of ``(name, value)``
    tuples; its entries are appended to the form's own values.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is one of 'GET' or 'POST', the URL is the target URL as a
    string, and the values are a sequence of ``(name, value)`` tuples with the
    form data.
    """
    values = form.form_values()
    if extra_values:
        if hasattr(extra_values, 'items'):
            extra_values = extra_values.items()
        values.extend(extra_values)
    if open_http is None:
        open_http = open_http_urllib
    return open_http(form.method, form.action, values)
692
def open_http_urllib(method, url, values):
    """
    Default requester for `submit_form`, built on ``urllib``.

    For ``'GET'`` the url-encoded values are appended to the URL as a
    query string; for any other method they are passed to
    ``urllib.urlopen`` as the request data.
    """
    import urllib
    ## FIXME: should test that it's not a relative URL or something
    encoded = urllib.urlencode(values)
    if method != 'GET':
        return urllib.urlopen(url, encoded)
    if '?' in url:
        separator = '&'
    else:
        separator = '?'
    return urllib.urlopen(url + separator + encoded, None)
706
class FieldsDict(DictMixin):
    """
    Dictionary view over the inputs of a form: keys are input names,
    values are the ``.value`` of the matching input.
    """

    def __init__(self, inputs):
        # inputs: the accessor used to look inputs up by name.
        self.inputs = inputs

    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value

    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value

    def __delitem__(self, item):
        # Removing fields is not supported.
        raise KeyError(
            "You cannot remove keys from ElementDict")

    def keys(self):
        return self.inputs.keys()

    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        class_name = self.__class__.__name__
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (class_name, form_name)
727
class InputGetter(object):

    """
    Accessor for the input fields (``input``, ``select``, ``textarea``)
    of a form.

    ``form.inputs['field_name']`` looks a field up by name.  Several
    checkboxes sharing a name come back as a `CheckboxGroup`, several
    radio inputs sharing a name as a `RadioGroup`; both are lists that
    also allow value setting.

    Iterating yields every input element individually, so grouped
    checkboxes and radios appear one by one rather than once per name.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (name(.) = 'select' or name(.) = 'input' or name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[name() = 'select' or name() = 'input' or name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        class_name = self.__class__.__name__
        return '<%s for form %s>' % (class_name, self.form._name())

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        first = results[0]
        input_type = first.get('type')
        if len(results) > 1:
            group = None
            if input_type == 'radio':
                group = RadioGroup(results)
            elif input_type == 'checkbox':
                group = CheckboxGroup(results)
            if group is not None:
                group.name = name
                return group
        # I don't like throwing away elements like this
        return first

    def __contains__(self, name):
        return bool(self._name_xpath(self.form, name=name))

    def keys(self):
        # Each distinct name once; a set removes the duplicates.
        names = sets.Set([el.name for el in self])
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        elements = self._all_xpath(self.form)
        return iter(elements)
791
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """

    def name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')
    def name__set(self, value):
        self.set('name', value)
    def name__del(self):
        # Deleting a missing name is silently accepted.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(name__get, name__set, name__del, doc=name__get.__doc__)

    def __repr__(self):
        # Not every input class has a ``type`` attribute.
        input_type = getattr(self, 'type', None)
        suffix = ''
        if input_type:
            suffix = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, suffix)
819
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  ``.name`` gives the name and ``.value``
    reads or writes the value.
    """

    def value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        text = self.text
        if text:
            return text
        return ''
    def value__set(self, value):
        self.text = value
    def value__del(self):
        # Deleting the value empties the element's text.
        self.text = ''
    value = property(value__get, value__set, value__del, doc=value__get.__doc__)


HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  ``.name`` gives the name.

    ``.value`` is the value of the selected option, or for a
    multi-select (``<select multiple>``) a set-like object.  In both
    cases ``.value_options`` lists the possible values.

    The boolean attribute ``.multiple`` tells whether this is a
    multi-select.
    """

    def value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                # FIXME: If value is None, what to return?, get_text()?
                return option.get('value')
        return None

    def value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            self.value.clear()
            self.value.update(value)
            return
        chosen = None
        if value is not None:
            # Find the option first, so a bad value leaves the current
            # selection untouched.
            for option in self.getiterator('option'):
                # FIXME: also if el.get('value') is None?
                if option.get('value') == value:
                    chosen = option
                    break
            if chosen is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        for option in self.getiterator('option'):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if chosen is not None:
            chosen.set('selected', '')

    def value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements).
        """
        values = []
        for option in self.getiterator('option'):
            values.append(option.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib
    def multiple__set(self, value):
        if value:
            self.set('multiple', '')
            return
        if 'multiple' in self.attrib:
            del self.attrib['multiple']
    multiple = property(multiple__get, multiple__set, doc=multiple__get.__doc__)


HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return self.select.getiterator('option')
    options = property(options)

    def __iter__(self):
        # Only selected options are members of the set; unselected ones
        # are merely candidates for add().
        for option in self.options:
            if 'selected' in option.attrib:
                yield option.get('value')

    def add(self, item):
        """
        Select the first option whose value is ``item``; raise ValueError
        if there is no such option.
        """
        for option in self.options:
            if option.get('value') == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``; raise ValueError
        if it is not selected or does not exist.
        """
        for option in self.options:
            if option.get('value') == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
971
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements sharing one name.

    Besides the list behaviour, ``.value`` checks/unchecks the inputs
    and ``.value_options`` lists the possible values.
    """

    def value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' in radio.attrib:
                return radio.get('value')
        return None

    def value__set(self, value):
        chosen = None
        if value is not None:
            # Look the target up first, so a bad value changes nothing.
            for radio in self:
                if radio.get('value') == value:
                    chosen = radio
                    break
            if chosen is None:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if chosen is not None:
            chosen.set('checked', '')

    def value__del(self):
        # Deleting the value unchecks every radio.
        self.value = None

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        options = []
        for radio in self:
            options.append(radio.get('value'))
        return options
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (class_name, list.__repr__(self))
1023
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)
    def value__set(self, value):
        # Do not use ``self.value |= value`` here: augmented assignment
        # stores the result back into the property, which calls this
        # setter again and never terminates.
        values = self.value
        values.clear()
        values.update(value)
    def value__del(self):
        self.value.clear()
    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1051
class CheckboxValues(SetMixin):

    """
    Set-like view of the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        self.group = group

    def __iter__(self):
        # Build the list first, so iteration works on a snapshot.
        checked_values = []
        for el in self.group:
            if 'checked' in el.attrib:
                checked_values.append(el.get('value'))
        return iter(checked_values)

    def add(self, value):
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                return
        raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        for el in self.group:
            if el.get('value') != value:
                continue
            if 'checked' not in el.attrib:
                raise KeyError(
                    "The checkbox with value %r was already unchecked" % value)
            del el.attrib['checked']
            return
        raise KeyError(
            "No checkbox with value %r" % value)

    def __repr__(self):
        members = ', '.join([repr(v) for v in self])
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            members,
            self.group.name)
1094
class InputElement(InputMixin, HtmlElement):
    """
    Wrapper for an ``<input>`` element.

    ``.type`` gives the input type, lower-cased, with ``'text'`` used
    when the element has no ``type`` attribute.

    ``.value`` reads and writes the element's value.

    ``.checkable`` is true only for checkboxes and radio buttons;
    those inputs also expose the boolean ``.checked`` attribute.
    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def value__get(self):
        """
        Get/set the value of this element via its ``value`` attribute.

        For a checked checkbox or radio without a (non-empty) value
        this yields ``'on'``; for an unchecked checkbox or radio it
        yields None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if value:
            self.checked = True
            # Non-string truthy values (e.g. True) only tick the box;
            # the existing value attribute is left untouched.
            if isinstance(value, basestring):
                self.set('value', value)
        else:
            self.checked = False

    def value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']

    value = property(value__get, value__set, value__del, doc=value__get.__doc__)

    def type__get(self):
        """
        The input type, read from the ``type`` attribute and lower-cased
        (``'text'`` when the attribute is missing).
        """
        declared_type = self.get('type', 'text')
        return declared_type.lower()

    def type__set(self, value):
        self.set('type', value)

    type = property(type__get, type__set, doc=type__get.__doc__)

    def checkable__get(self):
        """
        Boolean: true when this input is a checkbox or a radio button.
        """
        return self.type in ('checkbox', 'radio')

    checkable = property(checkable__get, doc=checkable__get.__doc__)

    def checked__get(self):
        """
        Boolean attribute reflecting the presence of the ``checked``
        attribute; assigning to it adds or removes that attribute.

        Only valid on checkable input types; anything else raises
        AttributeError.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
        elif 'checked' in self.attrib:
            del self.attrib['checked']

    checked = property(checked__get, checked__set, doc=checked__get.__doc__)

# Store InputElement under the 'input' key of the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def for_element__get(self):
        """
        Get/set/delete the element this label points to.

        Getting returns None when the label has no (non-empty) ``for``
        attribute; otherwise it returns whatever
        ``self.body.get_element_by_id()`` yields for that id.

        Setting requires the target element to have an ``id`` attribute
        (TypeError is raised otherwise) and stores that id in ``for``.

        Deleting removes the ``for`` attribute, if present.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        return self.body.get_element_by_id(target_id)

    def for_element__set(self, other):
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def for_element__del(self):
        # The link to the target lives in the ``for`` attribute (see the
        # getter and setter), so that is what must be removed here; the
        # label's own ``id`` attribute is unrelated and is left alone.
        if 'for' in self.attrib:
            del self.attrib['for']

    for_element = property(for_element__get, for_element__set, for_element__del,
                           doc=for_element__get.__doc__)

# Store LabelElement under the 'label' key of the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

# Identity stylesheet: copies the whole input document and emits it with
# the XSLT "html" output method, encoded as UTF-8.
_html_xsl = """\
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="html" encoding="UTF-8" />
  <xsl:template match="/">
    <xsl:copy-of select="."/>
  </xsl:template>
</xsl:transform>
"""

# Same stylesheet as above, with indent="yes" added to the output options.
_pretty_html_xsl = """\
<xsl:transform xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:output method="html" encoding="UTF-8" indent="yes" />
  <xsl:template match="/">
    <xsl:copy-of select="."/>
  </xsl:template>
</xsl:transform>
"""

# Compiled transforms are kept in thread-local storage.  The assignments
# below only populate the storage of the thread that imports this module;
# tostring() compiles them on demand when the attributes are missing.
_local_transforms = threading.local()
# FIXME: should we just lazily compile these?
_local_transforms.html_transform = etree.XSLT(etree.XML(_html_xsl))
_local_transforms.pretty_html_transform = etree.XSLT(etree.XML(_pretty_html_xsl))

# This isn't a general match, but it's a match for what XSLT specifically creates:
# (bound directly to the compiled pattern's .sub method, so it is called
# as __replace_meta_content_type(replacement, text))
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub

def tostring(doc, pretty=False, include_meta_content_type=False):
    """
    Serialize the given document to an HTML string.

    With ``pretty`` true, the indenting stylesheet is used instead of
    the plain one.

    The XSLT serialization inserts a ``<meta http-equiv="Content-Type">``
    tag; unless ``include_meta_content_type`` is true, every such tag is
    stripped from the result (including any that were already present).
    """
    assert doc is not None
    if pretty:
        slot, stylesheet = 'pretty_html_transform', _pretty_html_xsl
    else:
        slot, stylesheet = 'html_transform', _html_xsl
    # Transforms are cached per thread; compile on first use in a thread
    # that has not populated its slot yet.
    transform = getattr(_local_transforms, slot, None)
    if transform is None:
        transform = etree.XSLT(etree.XML(stylesheet))
        setattr(_local_transforms, slot, transform)
    serialized = str(transform(doc))
    if include_meta_content_type:
        return serialized
    return __replace_meta_content_type('', serialized)
1266
def open_in_browser(doc):
    """
    Open the HTML document in a web browser (saving it to a temporary
    file to open it).

    The document is serialized with ``tostring()`` (keeping the
    Content-Type meta tag), written to a freshly created ``.html``
    temporary file, and the resulting ``file://`` URL is printed and
    handed to ``webbrowser.open()``.  The temporary file is not removed
    afterwards, since the browser still needs to read it.
    """
    import os
    import tempfile
    import webbrowser
    # mkstemp() creates the file itself under a unique name, whereas
    # os.tempnam() only suggested a name and left a window in which
    # another process could create that path first.
    handle, fn = tempfile.mkstemp(suffix='.html')
    f = os.fdopen(handle, 'wb')
    try:
        f.write(tostring(doc, include_meta_content_type=True))
    finally:
        # Always release the file handle, even if serialization fails.
        f.close()
    url = 'file://' + fn.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)

################################################################################
# configure Element class lookup

# Hand a fresh HtmlElementClassLookup instance to html_parser (defined
# earlier in this module) -- presumably so that elements it parses use
# the element classes registered above.
html_parser.setElementClassLookup(HtmlElementClassLookup())
