Source code for jeevesagent.loader.pdf

"""PDF loader → markdown.

Uses ``pypdf`` (lazy import). Each page becomes a section
``## Page N`` in the markdown output. Page-level whitespace is
normalized; otherwise the text comes through as the PDF's
extractable layer reports it.

PDFs vary wildly in extractability — scanned image PDFs return
empty text; layout-heavy PDFs lose column structure. For
production use cases needing OCR / table extraction, swap this
loader for ``pdfplumber`` or ``unstructured`` (kept out of the
default dependency footprint).
"""

from __future__ import annotations

import re
from pathlib import Path

from .base import Document

# Matches any run of one or more whitespace characters.
_WHITESPACE_RE = re.compile(r"\s+")


def _normalize_page_text(text: str) -> str:
    """Strip absurd spacing PDFs sometimes have."""
    # Collapse any run of 3+ whitespace into a paragraph break.
    out = re.sub(r"\s*\n\s*\n\s*", "\n\n", text)
    # Strip trailing whitespace from each line.
    out = "\n".join(line.rstrip() for line in out.splitlines())
    return out.strip()


def load_pdf(path: str | Path) -> Document:
    """Load a PDF and convert it to markdown.

    The output opens with a ``# <title>`` heading taken from the PDF's
    ``/Title`` metadata entry (falling back to the file stem), followed
    by one ``## Page N`` section per page holding that page's
    normalized extracted text. A page that yields no text, or whose
    extraction raises, gets the placeholder ``(no extractable text)``.

    The returned ``Document`` carries the markdown in ``content`` and
    ``source``, ``format`` (``"pdf"``), ``page_count`` and ``title``
    (empty string when the PDF declares none) in ``metadata``.

    Requires ``pypdf``: ``pip install 'jeevesagent[loader-pdf]'``.
    An ``ImportError`` with install instructions is raised when it is
    missing.
    """
    try:
        from pypdf import PdfReader  # type: ignore[import-not-found, import-untyped]
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "pypdf is not installed. "
            "Install with: pip install 'jeevesagent[loader-pdf]' "
            "(or 'jeevesagent[loader]' for all loader extras)."
        ) from exc

    p = Path(path)
    reader = PdfReader(str(p))

    # Document-level metadata from the PDF (title, author, etc.).
    title = ""
    try:
        raw_title = reader.metadata.get("/Title", "") if reader.metadata else ""
        title = (raw_title or "").strip() if isinstance(raw_title, str) else ""
    except (TypeError, AttributeError):
        title = ""

    parts: list[str] = []
    if title:
        # A markdown ``#`` heading ends at the first line break, so fold
        # any whitespace runs embedded in the title into single spaces.
        # The metadata entry below keeps the title as extracted.
        heading = _WHITESPACE_RE.sub(" ", title)
        parts.append(f"# {heading}\n")
    elif p.stem:
        parts.append(f"# {p.stem}\n")

    for i, page in enumerate(reader.pages, start=1):
        try:
            text = page.extract_text() or ""
        except Exception:  # noqa: BLE001 — PDFs are messy
            # Best effort: one unreadable page must not sink the whole load.
            text = ""
        text = _normalize_page_text(text)
        parts.append(f"## Page {i}\n")
        parts.append(text or "(no extractable text)")
        parts.append("")  # blank line between pages

    content = "\n".join(parts)
    return Document(
        content=content,
        metadata={
            "source": str(p),
            "format": "pdf",
            "page_count": len(reader.pages),
            "title": title,
        },
    )