Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames] | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os 
  9   
 10  from common_imports import StringIO, etree, fileInTestDir 
 11  from common_imports import SillyFileLike, HelperTestCase 
 12   
class HtmlParserTestCaseBase(HelperTestCase):
    """HTML parser test cases.

    Exercises ``etree.HTML()``, ``etree.HTMLParser`` and ``etree.parse()``
    with a well-formed document, a broken document and a document that
    contains non-ASCII text.
    """
    etree = etree

    # Well-formed reference document; also the expected serialisation of
    # the recovered broken document below.
    html_str = "<html><head><title>test</title></head><body><h1>page title</h1></body></html>"
    # Same content with unclosed / mismatched tags.
    broken_html_str = "<html><head><title>test<body><h1>page title</h3></p></html>"
    # Unicode variant with non-ASCII characters in title and heading.
    uhtml_str = u"<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>"

    def tearDown(self):
        # NOTE(review): the body of this method was missing from the
        # extracted listing.  Resetting the default parser is assumed here
        # because one test below installs an HTML parser as the default;
        # confirm against the upstream file.
        self.etree.setDefaultParser()

    def test_module_HTML(self):
        # Parsing and re-serialising a well-formed document is a round trip.
        element = self.etree.HTML(self.html_str)
        self.assertEqual(self.etree.tostring(element),
                         self.html_str)

    def test_module_HTML_unicode(self):
        # Compare as unicode objects after serialising to UTF-8.
        element = self.etree.HTML(self.uhtml_str)
        self.assertEqual(unicode(self.etree.tostring(element, 'UTF8'), 'UTF8'),
                         unicode(self.uhtml_str.encode('UTF8'), 'UTF8'))

    # NOTE(review): the ``def`` line of this test was missing from the
    # extracted listing; the name is reconstructed - confirm upstream.
    def test_module_parse_html_error(self):
        # With recover=False a stray closing tag is expected to raise.
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        f = StringIO("<html></body>")
        self.assertRaises(self.etree.XMLSyntaxError,
                          parse, f, parser)

    # NOTE(review): the ``def`` line of this test was missing from the
    # extracted listing; the name is reconstructed - confirm upstream.
    def test_module_parse_html_norecover(self):
        # With recover=False the broken document is expected to raise.
        parser = self.etree.HTMLParser(recover=False)
        parse = self.etree.parse
        f = StringIO(self.broken_html_str)
        self.assertRaises(self.etree.XMLSyntaxError,
                          parse, f, parser)

    def test_module_HTML_broken(self):
        # etree.HTML() repairs the broken document into the reference one.
        element = self.etree.HTML(self.broken_html_str)
        self.assertEqual(self.etree.tostring(element),
                         self.html_str)

    def test_module_HTML_cdata(self):
        # by default, libxml2 generates CDATA nodes for <script> content
        # NOTE(review): the fixture uses <style>, not <script>; presumably
        # both are treated alike - confirm.  Either way the content must be
        # readable through ``.text``.
        html = '<html><head><style>foo</style></head></html>'
        element = self.etree.HTML(html)
        self.assertEqual(element[0][0].text, "foo")

    def test_module_HTML_access(self):
        # root -> <head> -> <title>
        element = self.etree.HTML(self.html_str)
        self.assertEqual(element[0][0].tag, 'title')

    def test_module_parse_html(self):
        # Parse from a real file object on disk.
        parser = self.etree.HTMLParser()
        # mkstemp() creates the file atomically; mktemp() only returns a
        # name and is open to a race before the file is created.
        handle, filename = tempfile.mkstemp(suffix=".html")
        try:
            f = os.fdopen(handle, 'wb')
            try:
                f.write(self.html_str)
            finally:
                f.close()
            f = open(filename, 'r')
            try:
                tree = self.etree.parse(f, parser)
            finally:
                # close even when parsing fails so the file can be removed
                f.close()
            self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str)
        finally:
            os.remove(filename)

    # NOTE(review): the ``def`` line of this test was missing from the
    # extracted listing; the name is reconstructed - confirm upstream.
    def test_module_parse_html_filelike(self):
        # Parse from a minimal file-like object instead of a real file.
        parser = self.etree.HTMLParser()
        f = SillyFileLike(self.html_str)
        tree = self.etree.parse(f, parser)
        html = self.etree.tostring(tree.getroot(), 'UTF-8')
        self.assertEqual(html, self.html_str)

##     def test_module_parse_html_filelike_unicode(self):
##         parser = self.etree.HTMLParser()
##         f = SillyFileLike(self.uhtml_str)
##         tree = self.etree.parse(f, parser)
##         html = self.etree.tostring(tree.getroot(), 'UTF-8')
##         self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str)

    def test_html_file_error(self):
        # A missing file is reported as IOError, not as a parse error.
        parser = self.etree.HTMLParser()
        parse = self.etree.parse
        self.assertRaises(IOError,
                          parse, "__some_hopefully_nonexisting_file__.html",
                          parser)

    # NOTE(review): the ``def`` line of this test was missing from the
    # extracted listing; the name is reconstructed - confirm upstream.
    def test_default_parser_HTML_broken(self):
        # The stock default parser rejects the broken document.
        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, StringIO(self.broken_html_str))

        self.etree.setDefaultParser(self.etree.HTMLParser())
        try:
            tree = self.etree.parse(StringIO(self.broken_html_str))
            self.assertEqual(self.etree.tostring(tree.getroot()),
                             self.html_str)
        finally:
            # Always restore the stock default parser, even when the
            # assertion above fails, so other tests are not affected.
            self.etree.setDefaultParser()

        # After the reset the broken document is rejected again.
        self.assertRaises(self.etree.XMLSyntaxError,
                          self.etree.parse, StringIO(self.broken_html_str))
111
def test_suite():
    """Return a ``unittest.TestSuite`` with all HTML parser test cases."""
    suite = unittest.TestSuite()
    # unittest.makeSuite() is deprecated (removed in Python 3.13); the
    # loader method collects the same ``test*`` methods of the class.
    loader = unittest.TestLoader()
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCaseBase)])
    return suite
if __name__ == '__main__':
    # This module is not meant to be run directly; point at the runner.
    # The parenthesised form prints the same text on Python 2 and 3.
    print('to test use test.py %s' % __file__)