1""" 

2:mod:`pandas.io.html` is a module containing functionality for dealing with 

3HTML IO. 

4 

5""" 

6 

7from collections import abc 

8import numbers 

9import os 

10import re 

11 

12from pandas.compat._optional import import_optional_dependency 

13from pandas.errors import AbstractMethodError, EmptyDataError 

14 

15from pandas.core.dtypes.common import is_list_like 

16 

17from pandas.core.construction import create_series_with_explicit_dtype 

18 

19from pandas.io.common import is_url, urlopen, validate_header_arg 

20from pandas.io.formats.printing import pprint_thing 

21from pandas.io.parsers import TextParser 

22 

23_IMPORTS = False 

24_HAS_BS4 = False 

25_HAS_LXML = False 

26_HAS_HTML5LIB = False 

27 

28 

29def _importers(): 

30 # import things we need 

31 # but make this done on a first use basis 

32 

33 global _IMPORTS 

34 if _IMPORTS: 

35 return 

36 

37 global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB 

38 bs4 = import_optional_dependency("bs4", raise_on_missing=False, on_version="ignore") 

39 _HAS_BS4 = bs4 is not None 

40 

41 lxml = import_optional_dependency( 

42 "lxml.etree", raise_on_missing=False, on_version="ignore" 

43 ) 

44 _HAS_LXML = lxml is not None 

45 

46 html5lib = import_optional_dependency( 

47 "html5lib", raise_on_missing=False, on_version="ignore" 

48 ) 

49 _HAS_HTML5LIB = html5lib is not None 

50 

51 _IMPORTS = True 

52 

53 

54############# 

55# READ HTML # 

56############# 

57_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") 

58 

59 

60def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str: 

61 """ 

62 Replace extra whitespace inside of a string with a single space. 

63 

64 Parameters 

65 ---------- 

66 s : str or unicode 

67 The string from which to remove extra whitespace. 

68 regex : re.Pattern 

69 The regular expression to use to remove extra whitespace. 

70 

71 Returns 

72 ------- 

73 subd : str or unicode 

74 `s` with all extra whitespace replaced with a single space. 

75 """ 

76 return regex.sub(" ", s.strip()) 
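
# Illustrative behavior (a comment-only sketch, not part of the original
# module): runs of whitespace and newlines collapse to single spaces.
#     _remove_whitespace("  a   b\r\nc  ")  ->  "a b c"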


def _get_skiprows(skiprows):
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return skiprows
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
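
# Illustrative conversions (comment-only sketch, not part of the original
# module):
#     _get_skiprows(slice(2, 10, 2))  ->  [2, 4, 6, 8]
#     _get_skiprows(3)                ->  3   (i.e. "skip 3 rows")
#     _get_skiprows(None)             ->  0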


def _read(obj):
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, bytes, or file-like

    Returns
    -------
    raw_text : str
    """
    if is_url(obj):
        with urlopen(obj) as url:
            text = url.read()
    elif hasattr(obj, "read"):
        text = obj.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
        try:
            if os.path.isfile(text):
                with open(text, "rb") as f:
                    return f.read()
        except (TypeError, ValueError):
            pass
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols, or a file-like object.

    match : str or regex
        The text to match in the document.

    attrs : dict
        Dictionary of HTML <table> element attributes to match.

    encoding : str
        Encoding to be used by parser.

    displayed_only : bool
        Whether or not items with "display:none" should be ignored.

        .. versionadded:: 0.23.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object

    match : regex
        The text to match in the raw HTML

    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.

    encoding : str
        Encoding to be used by parser.

    displayed_only : bool
        Whether or not items with "display:none" should be ignored.

        .. versionadded:: 0.23.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`
    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(self, io, match, attrs, encoding, displayed_only):
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        list of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        attr : str
            The attribute, such as "colspan"

        Returns
        -------
        str
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.

        match : str or regular expression
            The text to search for in the DOM tree.

        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`.
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
        header, body, and footer, otherwise:
            - Put all rows into body
            - Move rows from top of body to header only if
              all elements inside row are <th>
            - Move rows from bottom of body to footer only if
              all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows)
        body = self._expand_colspan_rowspan(body_rows)
        footer = self._expand_colspan_rowspan(footer_rows)

        return header, body, footer
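
    # Illustrative header promotion (comment-only sketch using a hypothetical
    # table): given <tr><th>A</th><th>B</th></tr><tr><td>1</td><td>2</td></tr>
    # and no <thead>, the leading all-<th> row is moved into the header:
    #     header == [["A", "B"]], body == [["1", "2"]], footer == []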

    def _expand_colspan_rowspan(self, rows):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s

        Returns
        -------
        list of list
            Each returned row is a list of str text.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        remainder = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
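
    # Illustrative span expansion (comment-only sketch using hypothetical rows):
    #     <tr><td colspan="2">A</td><td rowspan="2">B</td></tr>
    #     <tr><td>C</td><td>D</td></tr>
    # expands to [["A", "A", "B"], ["C", "D", "B"]].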

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements.

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        element_name = self._strainer.name
        tables = doc.find_all(element_name, attrs=attrs)

        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(text=match) is not None:
                result.append(table)
            unique_tables.add(table)

        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding
        return BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : str
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"


_re_namespace = {"re": "http://exslt.org/regular-expressions"}
_valid_schemes = "http", "file", "ftp"


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # 2. go up the tree until we find a table
        xpath_expr = f"//table//*[re:test(text(), {repr(pattern)})]/ancestor::table"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body):
    data = [len(elem) for elem in body]
    lens = create_series_with_explicit_dtype(data, dtype_if_empty=object)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    tp = TextParser(body, header=header, **kwargs)
    df = tp.read()
    return df
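
# Illustrative header inference in _data_to_frame when ``header`` is None
# (comment-only sketch):
#     head == [["A", "B"]]              ->  header = 0
#     head == [["A", "B"], ["C", "D"]]  ->  header = [0, 1]  (empty rows skipped)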


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor):
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841

    else:
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")
    return _valid_parsers[flavor]
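
# Illustrative dispatch (comment-only sketch, assuming the backends are
# installed):
#     _parser_dispatch("lxml")  is  _LxmlFrameParser
#     _parser_dispatch("bs4")   is  _BeautifulSoupHtml5LibFrameParser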


def _print_as_set(s) -> str:
    arg = ", ".join(pprint_thing(el) for el in s)
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
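
# Illustrative normalization (comment-only sketch):
#     _validate_flavor(None)    ->  ("lxml", "bs4")
#     _validate_flavor("lxml")  ->  ("lxml",)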


def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a "
                    "different flavor."
                )

            retained = caught
        else:
            break
    else:
        raise retained

    ret = []
    for table in tables:
        try:
            ret.append(_data_to_frame(data=table, **kwargs))
        except EmptyDataError:  # empty table
            continue
    return ret
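
# Note on the for/else above: a successful parse breaks out of the loop; the
# else clause runs only when every requested flavor fails, re-raising the last
# ValueError. Non-seekable file-like inputs fail fast because they cannot be
# rewound for a second attempt.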


def read_html(
    io,
    match=".+",
    flavor=None,
    header=None,
    index_col=None,
    skiprows=None,
    attrs=None,
    parse_dates=False,
    thousands=",",
    encoding=None,
    decimal=".",
    converters=None,
    na_values=None,
    keep_default_na=True,
    displayed_only=True,
):
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object or file-like object
        A URL, a file-like object, or a raw string containing HTML. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.

    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.

    flavor : str or None
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.

    header : int or list-like or None, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.

    index_col : int or list-like or None, optional
        The column (or list of columns) to use to create the index.

    skiprows : int or list-like or slice or None, optional
        Number of rows to skip after parsing the column header. 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.

    attrs : dict or None, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <http://www.w3.org/TR/html-markup/global-attributes.html>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <http://www.w3.org/TR/html-markup/table.html>`__. It contains the
        latest information on table attributes for the modern web.

    parse_dates : bool, optional
        See :func:`~read_csv` for more details.

    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.

    encoding : str or None, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).

    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).

    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.

    na_values : iterable, default None
        Custom NA values.

    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.

    displayed_only : bool, default True
        Whether to parse only elements that are displayed; when True, elements
        styled with "display: none" are ignored.

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and, within each table,
    only for ``<tr>`` rows and the ``<th>`` and ``<td>`` cells they contain.
    ``<td>`` stands for "table data". This function attempts to properly
    handle ``colspan`` and ``rowspan`` attributes. If the table has a
    ``<thead>``, it is used to construct the header, otherwise the function
    attempts to find the header within the body (by putting rows with only
    ``<th>`` elements into the header).

    .. versionadded:: 0.21.0

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
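
    A minimal illustrative call (the URL is a placeholder, so the example is
    skipped by doctest):

    >>> dfs = pd.read_html("https://example.com/page.html",  # doctest: +SKIP
    ...                    match="Population", header=0)
    >>> dfs[0].head()  # doctest: +SKIP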

1074 """ 

1075 _importers() 

1076 

1077 # Type check here. We don't want to parse only to fail because of an 

1078 # invalid value of an integer skiprows. 

1079 if isinstance(skiprows, numbers.Integral) and skiprows < 0: 

1080 raise ValueError( 

1081 "cannot skip rows starting from the end of the " 

1082 "data (you passed a negative value)" 

1083 ) 

1084 validate_header_arg(header) 

1085 return _parse( 

1086 flavor=flavor, 

1087 io=io, 

1088 match=match, 

1089 header=header, 

1090 index_col=index_col, 

1091 skiprows=skiprows, 

1092 parse_dates=parse_dates, 

1093 thousands=thousands, 

1094 attrs=attrs, 

1095 encoding=encoding, 

1096 decimal=decimal, 

1097 converters=converters, 

1098 na_values=na_values, 

1099 keep_default_na=keep_default_na, 

1100 displayed_only=displayed_only, 

1101 )