# Source code for jeevesagent.loader.html
"""HTML loader → markdown.
Uses ``beautifulsoup4`` (lazy import) to walk the DOM and emit
markdown that preserves heading + paragraph + list structure.
Strips ``<script>`` / ``<style>`` content. Drops most attributes;
the goal is to keep the textual structure, not pixel-perfect
rendering.
"""
from __future__ import annotations
from pathlib import Path
from .base import Document
_BLOCK_TAGS_TO_HEADINGS: dict[str, int] = {
"h1": 1,
"h2": 2,
"h3": 3,
"h4": 4,
"h5": 5,
"h6": 6,
}
def _inline_text(node: object) -> str:
    """Return the visible text under ``node`` collapsed onto one line.

    Every run of whitespace (including newlines coming from hard-wrapped
    HTML source) becomes a single space, so the result is safe to embed
    in single-line markdown constructs such as headings, list items and
    table cells.
    """
    raw = node.get_text(separator=" ")  # type: ignore[attr-defined]
    return " ".join(raw.split())


def _walk(node: object, parts: list[str]) -> None:
    """Recursively walk a BeautifulSoup node and append markdown
    fragments to ``parts``.

    Handling by node kind:

    * text nodes -- stripped and appended; consecutive inline fragments
      are separated by one space. Comments, doctypes, CDATA sections and
      processing instructions are skipped.
    * ``script`` / ``style`` -- dropped entirely.
    * ``h1``..``h6`` -- emitted as ATX headings (``#`` .. ``######``).
    * ``p`` -- emitted as one paragraph of flattened text.
    * ``div`` / ``section`` / ``article`` / ``main`` -- walked
      recursively so nested headings, lists, tables and paragraphs keep
      their structure; the result is set off by blank lines.
    * ``ul`` / ``ol`` -- direct ``li`` children become ``-`` or ``N.``
      items (nested lists are flattened into the item text).
    * ``table`` -- emitted as a pipe table whose first row is the header.
    * ``br`` -- a newline.
    * anything else -- treated as a transparent container.

    Args:
        node: A BeautifulSoup tag or string node.
        parts: Output list, mutated in place. Fragments are meant to be
            joined with ``"".join(parts)``.
    """
    name = getattr(node, "name", None)

    # Text node
    if name is None:
        # Lazy import keeps bs4 optional at module import time; callers
        # only reach this function after bs4 was imported successfully.
        from bs4.element import PreformattedString  # type: ignore[import-not-found, import-untyped]

        # Comments, doctypes, CDATA and processing instructions are string
        # nodes too, but they are not visible page text.
        if isinstance(node, PreformattedString):
            return
        text = str(node).strip()
        if text:
            # Fragments are stripped, so re-insert one separating space
            # between consecutive inline pieces (mirrors the
            # ``separator=" "`` used for block-level text).
            if parts and not parts[-1].endswith((" ", "\n")):
                parts.append(" ")
            parts.append(text)
        return

    if name in ("script", "style"):
        return

    if name in _BLOCK_TAGS_TO_HEADINGS:
        level = _BLOCK_TAGS_TO_HEADINGS[name]
        text = _inline_text(node)
        if text:
            parts.append(f"\n\n{'#' * level} {text}\n")
        return

    if name == "p":
        text = _inline_text(node)
        if text:
            parts.append(f"\n\n{text}\n")
        return

    if name in ("div", "section", "article", "main"):
        # Structural wrappers routinely contain headings, lists and
        # tables; descend instead of flattening so that structure
        # survives. Collect into a scratch list first so an empty
        # wrapper contributes nothing.
        inner: list[str] = []
        for child in getattr(node, "children", []):
            _walk(child, inner)
        if "".join(inner).strip():
            parts.append("\n\n")
            parts.extend(inner)
            parts.append("\n")
        return

    if name in ("ul", "ol"):
        ordered = name == "ol"
        items: list[str] = []
        for i, li in enumerate(
            node.find_all("li", recursive=False),  # type: ignore[attr-defined]
            start=1,
        ):
            text = _inline_text(li)
            if text:
                marker = f"{i}." if ordered else "-"
                items.append(f"{marker} {text}\n")
        if items:
            # Start on a fresh line so the first marker is never glued
            # to preceding inline text.
            parts.append("\n\n")
            parts.extend(items)
            parts.append("\n")
        return

    if name in ("table",):
        rows: list[list[str]] = []
        for tr in node.find_all("tr"):  # type: ignore[attr-defined]
            # Escape literal pipes so they are not read as cell borders.
            cells = [
                _inline_text(td).replace("|", "\\|")
                for td in tr.find_all(["th", "td"])
            ]
            if cells:
                rows.append(cells)
        if rows:
            header = rows[0]
            parts.append(
                "\n\n| "
                + " | ".join(header)
                + " |\n| "
                + " | ".join("---" for _ in header)
                + " |\n"
            )
            for row in rows[1:]:
                # Pad short rows to the header width; a non-positive
                # multiplier leaves longer rows untouched.
                padded = row + [""] * (len(header) - len(row))
                parts.append("| " + " | ".join(padded) + " |\n")
        return

    if name in ("br",):
        parts.append("\n")
        return

    # Generic container — descend
    for child in getattr(node, "children", []):
        _walk(child, parts)
# [docs] (documentation link marker left over from the rendered source page)
def load_html(path: str | Path) -> Document:
    """Load an HTML file → markdown.

    Requires ``beautifulsoup4``:
    ``pip install 'jeevesagent[loader-html]'``.

    The file is read as UTF-8 (undecodable bytes are replaced), parsed
    with the stdlib-backed ``html.parser``, and the ``<body>`` (or the
    whole document when there is no body) is converted to markdown.
    The output starts with a ``# <title>`` heading.

    Args:
        path: Location of the HTML file.

    Returns:
        A ``Document`` whose ``content`` is the markdown text (always
        ending in a single newline) and whose ``metadata`` carries
        ``source`` (the path as a string), ``format`` (``"html"``) and
        ``title``.

    Raises:
        ImportError: If ``beautifulsoup4`` is not installed.
    """
    try:
        from bs4 import BeautifulSoup  # type: ignore[import-not-found, import-untyped]
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "beautifulsoup4 is not installed. "
            "Install with: pip install 'jeevesagent[loader-html]'."
        ) from exc

    p = Path(path)
    raw = p.read_text(encoding="utf-8", errors="replace")
    soup = BeautifulSoup(raw, "html.parser")

    title_tag = soup.find("title")
    # Collapse whitespace so a hard-wrapped <title> stays on one heading
    # line, and fall back to the file stem when the tag is missing OR
    # empty (an empty title would otherwise yield a bare "# " heading).
    title = " ".join(title_tag.get_text().split()) if title_tag else ""
    if not title:
        title = p.stem

    body = soup.find("body") or soup
    parts: list[str] = [f"# {title}\n"]
    _walk(body, parts)

    # Collapse runs of blank lines and strip trailing whitespace.
    text = "".join(parts)
    out_lines: list[str] = []
    prev_blank = False
    for line in text.splitlines():
        line = line.rstrip()
        is_blank = not line
        if is_blank and prev_blank:
            continue
        out_lines.append(line)
        prev_blank = is_blank
    content = "\n".join(out_lines).strip() + "\n"

    return Document(
        content=content,
        metadata={
            "source": str(p),
            "format": "html",
            "title": title,
        },
    )