Package lxml :: Package tests :: Module test_htmlparser
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_htmlparser

  1  # -*- coding: utf-8 -*- 
  2   
  3  """ 
  4  HTML parser test cases for etree 
  5  """ 
  6   
  7  import unittest 
  8  import tempfile, os 
  9   
 10  from common_imports import StringIO, etree, fileInTestDir 
 11  from common_imports import SillyFileLike, HelperTestCase 
 12   
13 -class HtmlParserTestCaseBase(HelperTestCase):
14 """HTML parser test cases 15 """ 16 etree = etree 17 18 html_str = "<html><head><title>test</title></head><body><h1>page title</h1></body></html>" 19 broken_html_str = "<html><head><title>test<body><h1>page title</h3></p></html>" 20 uhtml_str = u"<html><head><title>test á\uF8D2</title></head><body><h1>page á\uF8D2 title</h1></body></html>" 21
22 - def tearDown(self):
24
def test_module_HTML(self):
    """Parse ``html_str`` with ``etree.HTML()`` and check that serialising
    the resulting element gives back exactly the same string."""
    root = self.etree.HTML(self.html_str)
    serialised = self.etree.tostring(root)
    self.assertEqual(serialised, self.html_str)
29
def test_module_HTML_unicode(self):
    """Parse the non-ASCII unicode document, serialise it as UTF-8 and
    compare the decoded output with the (re-encoded and decoded) input."""
    root = self.etree.HTML(self.uhtml_str)
    serialised = self.etree.tostring(root, 'UTF8')
    expected_bytes = self.uhtml_str.encode('UTF8')
    # Both sides are decoded so the comparison happens on unicode objects.
    self.assertEqual(unicode(serialised, 'UTF8'),
                     unicode(expected_bytes, 'UTF8'))
34
36 parser = self.etree.HTMLParser(recover=False) 37 parse = self.etree.parse 38 f = StringIO("<html></body>") 39 self.assertRaises(self.etree.XMLSyntaxError, 40 parse, f, parser)
41
43 parser = self.etree.HTMLParser() 44 Element = parser.makeelement 45 46 el = Element('name') 47 self.assertRaises(ValueError, Element, '{}') 48 self.assertRaises(ValueError, setattr, el, 'tag', '{}') 49 50 self.assertRaises(ValueError, Element, '{test}') 51 self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
52
54 parser = self.etree.HTMLParser() 55 Element = parser.makeelement 56 57 pname = Element('p:name') 58 self.assertEquals(pname.tag, 'p:name') 59 60 pname = Element('{test}p:name') 61 self.assertEquals(pname.tag, '{test}p:name') 62 63 pname = Element('name') 64 pname.tag = 'p:name' 65 self.assertEquals(pname.tag, 'p:name')
66
68 parser = self.etree.HTMLParser() 69 Element = parser.makeelement 70 71 self.assertRaises(ValueError, Element, 'p"name') 72 self.assertRaises(ValueError, Element, "na'me") 73 self.assertRaises(ValueError, Element, '{test}"name') 74 self.assertRaises(ValueError, Element, "{test}name'") 75 76 el = Element('name') 77 self.assertRaises(ValueError, setattr, el, 'tag', "pname'") 78 self.assertRaises(ValueError, setattr, el, 'tag', '"pname') 79 self.assertEquals(el.tag, "name")
80
82 parser = self.etree.HTMLParser() 83 Element = parser.makeelement 84 85 self.assertRaises(ValueError, Element, ' name ') 86 self.assertRaises(ValueError, Element, 'na me') 87 self.assertRaises(ValueError, Element, '{test} name') 88 89 el = Element('name') 90 self.assertRaises(ValueError, setattr, el, 'tag', ' name ') 91 self.assertEquals(el.tag, "name")
92
94 parser = self.etree.HTMLParser() 95 Element = parser.makeelement 96 97 SubElement = self.etree.SubElement 98 99 el = Element('name') 100 self.assertRaises(ValueError, SubElement, el, '{}') 101 self.assertRaises(ValueError, SubElement, el, '{test}')
102
104 parser = self.etree.HTMLParser() 105 Element = parser.makeelement 106 SubElement = self.etree.SubElement 107 108 el = Element('name') 109 pname = SubElement(el, 'p:name') 110 self.assertEquals(pname.tag, 'p:name') 111 112 pname = SubElement(el, '{test}p:name') 113 self.assertEquals(pname.tag, '{test}p:name')
114
116 parser = self.etree.HTMLParser() 117 Element = parser.makeelement 118 SubElement = self.etree.SubElement 119 120 el = Element('name') 121 self.assertRaises(ValueError, SubElement, el, "name'") 122 self.assertRaises(ValueError, SubElement, el, 'na"me') 123 self.assertRaises(ValueError, SubElement, el, "{test}na'me") 124 self.assertRaises(ValueError, SubElement, el, '{test}"name')
125
127 parser = self.etree.HTMLParser() 128 Element = parser.makeelement 129 SubElement = self.etree.SubElement 130 131 el = Element('name') 132 self.assertRaises(ValueError, SubElement, el, ' name ') 133 self.assertRaises(ValueError, SubElement, el, 'na me') 134 self.assertRaises(ValueError, SubElement, el, '{test} name')
135
137 parser = self.etree.HTMLParser(recover=False) 138 parse = self.etree.parse 139 f = StringIO(self.broken_html_str) 140 self.assertRaises(self.etree.XMLSyntaxError, 141 parse, f, parser)
142
144 text = u'Søk på nettet' 145 html_latin1 = (u'<p>%s</p>' % text).encode('iso-8859-1') 146 147 tree = self.etree.parse( 148 StringIO(html_latin1), 149 self.etree.HTMLParser(encoding="iso-8859-1")) 150 p = tree.find("//p") 151 self.assertEquals(p.text, text)
152
154 text = u'Søk på nettet' 155 wrong_head = ''' 156 <head> 157 <meta http-equiv="Content-Type" 158 content="text/html; charset=UTF-8" /> 159 </head>''' 160 html_latin1 = (u'<html>%s<body><p>%s</p></body></html>' % (wrong_head, 161 text) 162 ).encode('iso-8859-1') 163 164 self.assertRaises(self.etree.ParseError, 165 self.etree.parse, 166 StringIO(html_latin1)) 167 168 tree = self.etree.parse( 169 StringIO(html_latin1), 170 self.etree.HTMLParser(encoding="iso-8859-1")) 171 p = tree.find("//p") 172 self.assertEquals(p.text, text)
173
def test_module_HTML_broken(self):
    """Feed ``broken_html_str`` to ``etree.HTML()`` and expect the
    serialised result to equal the well-formed ``html_str``."""
    repaired = self.etree.HTML(self.broken_html_str)
    serialised = self.etree.tostring(repaired)
    self.assertEqual(serialised, self.html_str)
178
def test_module_HTML_cdata(self):
    """Check that the text inside a ``<style>`` element is available
    through the element's ``.text`` attribute after HTML parsing."""
    # by default, libxml2 generates CDATA nodes for <script> content
    # NOTE(review): the document below uses <style>, not <script>;
    # presumably both are treated alike -- confirm against libxml2.
    html = '<html><head><style>foo</style></head></html>'
    element = self.etree.HTML(html)
    # element[0] is the first child of the root (<head>), and
    # element[0][0] its first child (<style>).
    self.assertEqual(element[0][0].text, "foo")
184
def test_module_HTML_access(self):
    """Check index-based child access on a parsed HTML tree: the first
    grandchild of the root of ``html_str`` must be the ``title`` element."""
    root = self.etree.HTML(self.html_str)
    first_child = root[0]
    first_grandchild = first_child[0]
    self.assertEqual(first_grandchild.tag, 'title')
188
def test_module_parse_html(self):
    """Parse ``html_str`` from a real file on disk with an HTMLParser and
    check that serialising the root element reproduces the input.

    The temporary file is always removed, and both file handles are
    closed even when writing, parsing or the assertion fails.
    """
    parser = self.etree.HTMLParser()
    # mkstemp() creates and opens the file atomically, unlike mktemp(),
    # which only returns a name that another process could claim first.
    fd, filename = tempfile.mkstemp(suffix=".html")
    try:
        out = os.fdopen(fd, 'wb')
        try:
            out.write(self.html_str)
        finally:
            # Close explicitly so the data is flushed before re-opening.
            out.close()

        f = open(filename, 'r')
        try:
            tree = self.etree.parse(f, parser)
        finally:
            f.close()
        self.assertEqual(self.etree.tostring(tree.getroot()), self.html_str)
    finally:
        os.remove(filename)
200
202 parser = self.etree.HTMLParser() 203 f = SillyFileLike(self.html_str) 204 tree = self.etree.parse(f, parser) 205 html = self.etree.tostring(tree.getroot(), 'UTF-8') 206 self.assertEqual(html, self.html_str)
207 208 ## def test_module_parse_html_filelike_unicode(self): 209 ## parser = self.etree.HTMLParser() 210 ## f = SillyFileLike(self.uhtml_str) 211 ## tree = self.etree.parse(f, parser) 212 ## html = self.etree.tostring(tree.getroot(), 'UTF-8') 213 ## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str) 214
def test_html_file_error(self):
    """Parsing a file name that does not exist must raise IOError."""
    html_parser = self.etree.HTMLParser()
    missing_file = "__some_hopefully_nonexisting_file__.html"
    self.assertRaises(IOError, self.etree.parse, missing_file, html_parser)
221
223 self.assertRaises(self.etree.XMLSyntaxError, 224 self.etree.parse, StringIO(self.broken_html_str)) 225 226 self.etree.setDefaultParser( self.etree.HTMLParser() ) 227 228 tree = self.etree.parse(StringIO(self.broken_html_str)) 229 self.assertEqual(self.etree.tostring(tree.getroot()), 230 self.html_str) 231 232 self.etree.setDefaultParser() 233 234 self.assertRaises(self.etree.XMLSyntaxError, 235 self.etree.parse, StringIO(self.broken_html_str))
236
def test_html_iterparse(self):
    """Run ``iterparse(..., html=True)`` over a small in-memory document.

    Before iteration ``iterator.root`` must be None; afterwards it must
    be set, and the collected events must be exactly one ``'end'`` event
    per element in the order listed in the expected sequence below.
    """
    iterparse = self.etree.iterparse
    f = StringIO(
        '<html><head><title>TITLE</title><body><p>P</p></body></html>')

    iterator = iterparse(f, html=True)
    # Nothing has been parsed yet, so no root element is known.
    self.assertEqual(None, iterator.root)

    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)
    self.assertEqual(
        [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
         ('end', root[1]), ('end', root)],
        events)
252
def test_html_iterparse_file(self):
    """Run ``iterparse(..., html=True)`` over the ``css_shakespear.html``
    fixture in the test directory.

    ``iterator.root`` must be None before iteration and set afterwards;
    249 events are expected and every one of them must be an ``'end'``
    event.
    """
    iterparse = self.etree.iterparse
    iterator = iterparse(fileInTestDir("css_shakespear.html"),
                         html=True)

    self.assertEqual(None, iterator.root)
    events = list(iterator)
    root = iterator.root
    self.assertTrue(root is not None)
    self.assertEqual(249, len(events))
    # Any event type other than 'end' would show up in this list.
    self.assertEqual(
        [],
        [event for (event, element) in events if event != 'end'])
266
def test_suite():
    """Return a ``unittest.TestSuite`` holding all tests of
    ``HtmlParserTestCaseBase``."""
    suite = unittest.TestSuite()
    # loadTestsFromTestCase() collects the same 'test*' methods as the
    # deprecated unittest.makeSuite() helper did.
    loader = unittest.defaultTestLoader
    suite.addTests([loader.loadTestsFromTestCase(HtmlParserTestCaseBase)])
    return suite
if __name__ == '__main__':
    # This module is not meant to be run directly; point at the runner.
    # A single parenthesised argument prints the same text on Python 2
    # and Python 3.
    print('to test use test.py %s' % __file__)