Source code for scitex_scholar.pdf_highlight.highlighter

"""Thin orchestrator — wires extraction, classification, and annotation.

Public entry points:
- :func:`highlight_pdf` — end-to-end: extract → classify → write annotated PDF.
- :func:`save_with_highlights` — apply pre-classified labels to a document.
- :func:`apply_classifications` — merge offline JSON labels into blocks.

The source PDF bytes are not modified; highlights are PDF annotation
objects compatible with any PDF viewer.
"""

from __future__ import annotations

import datetime as _dt
import importlib.metadata as _md
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Optional

import pymupdf

from ._annotator import add_legend_page, apply_highlights
from ._blocks import Block, extract_blocks
from ._classifier import classify_llm, classify_stub
from ._colors import CATEGORIES, CATEGORY_LABELS, COLOR_RGB

__all__ = [
    "CATEGORIES",
    "COLOR_RGB",
    "CATEGORY_LABELS",
    "Block",
    "HighlightResult",
    "apply_classifications",
    "extract_blocks",
    "highlight_pdf",
    "save_with_highlights",
]


def _package_version() -> str:
    try:
        return _md.version("scitex-scholar")
    except Exception:
        return "unknown"


def _default_signature(model: Optional[str]) -> str:
    ts = _dt.datetime.now().strftime("%Y-%m-%d %H:%M")
    return f"Highlighted by scitex-scholar v{_package_version()} (pdf_highlight) — {ts}"


[docs] def apply_classifications( blocks: list[Block], classifications: list[dict[str, Any]], ) -> int: """Assign offline-produced labels to already-extracted blocks. Each entry must contain at least ``id`` and ``category``; ``confidence`` is optional (defaults to 0.0). Categories outside :data:`CATEGORIES` are silently dropped. Returns the number of blocks that received a label. """ by_id = {b.id: b for b in blocks} n = 0 for item in classifications: b = by_id.get(int(item["id"])) if b is None: continue cat = item.get("category", "none") if cat not in CATEGORIES: continue b.category = cat b.confidence = float(item.get("confidence", 0.0)) n += 1 return n
[docs] def save_with_highlights( doc: pymupdf.Document, blocks: list[Block], output_path: str | os.PathLike, *, add_legend: bool = True, signature: Optional[str] = None, model_label: Optional[str] = None, source_name: Optional[str] = None, ) -> int: """Write ``doc`` with highlight annotations for all labelled blocks. When ``add_legend=True`` (default) a colour legend + signature page is prepended so readers can see which colour means what. Returns the number of highlight annotations added (not counting the legend page). """ n = apply_highlights(doc, blocks) if add_legend: add_legend_page( doc, signature=signature or _default_signature(model_label), model_label=model_label, source_name=source_name or "(unknown)", ) doc.save(Path(output_path), garbage=3, deflate=True) return n
[docs] @dataclass class HighlightResult: input_path: Path output_path: Optional[Path] blocks: list[Block] pages: int annotations_added: int
[docs] def counts(self) -> dict[str, int]: c: dict[str, int] = {} for b in self.blocks: c[b.category or "none"] = c.get(b.category or "none", 0) + 1 return c
[docs] def highlight_pdf( pdf_path: str | os.PathLike, output_path: Optional[str | os.PathLike] = None, *, model: str = "claude-haiku-4-5-20251001", use_stub: bool = False, dry_run: bool = False, max_blocks: int = 0, batch_size: int = 25, min_chars: int = 40, sentence_level: bool = True, add_legend: bool = True, on_info: Optional[Any] = None, on_warning: Optional[Any] = None, ) -> HighlightResult: """Annotate a PDF with rhetorical-role highlights. Args: pdf_path: Input PDF path. output_path: Output PDF. Defaults to ``<input>.highlighted.pdf``. model: Anthropic model ID used by the LLM classifier. use_stub: If True, classify with a keyword heuristic (no API call). dry_run: If True, classify but do not write the output PDF. max_blocks: If >0, truncate to the first N extracted units. batch_size: Classifier batch size (units per API call). min_chars: Minimum text length for an extracted unit. sentence_level: If True (default), classify and highlight at sentence granularity. If False, use paragraph-level (less precise but ~5× cheaper on long papers). add_legend: If True, prepend a colour legend + signature page. Returns: ``HighlightResult`` with the classified units and annotation count. """ info = on_info or (lambda _msg: None) warn = on_warning or (lambda _msg: None) pdf = Path(pdf_path) if not pdf.exists(): raise FileNotFoundError(pdf) out: Optional[Path] if dry_run: out = None else: out = ( Path(output_path) if output_path else pdf.with_name(pdf.stem + ".highlighted.pdf") ) info( f"[1/3] Extracting {'sentences' if sentence_level else 'paragraphs'} from {pdf}" ) doc, blocks = extract_blocks( pdf, min_chars=min_chars, sentence_level=sentence_level ) if max_blocks > 0: blocks = blocks[:max_blocks] info(f" {len(blocks)} candidate units across {doc.page_count} pages") info(f"[2/3] Classifying ({'stub' if use_stub else model})") if use_stub: classify_stub(blocks) else: if not os.environ.get("ANTHROPIC_API_KEY"): raise RuntimeError( "ANTHROPIC_API_KEY is not set (or call with use_stub=True)" ) classify_llm(blocks, model=model, batch_size=batch_size, on_warning=warn) added = 0 if dry_run: info("[3/3] dry-run — not writing PDF") else: assert out is not None info(f"[3/3] Writing highlights to {out}") added = save_with_highlights( doc, blocks, out, add_legend=add_legend, model_label=(None if use_stub else model), source_name=pdf.name, ) info(f" added {added} highlight annotations") return HighlightResult( input_path=pdf, output_path=out, blocks=blocks, pages=doc.page_count, annotations_added=added, )