Source code for scitex_scholar.pdf_highlight._annotator

"""PDF annotation — tight per-sentence highlights + legend/signature page."""

from __future__ import annotations

from typing import Any, Iterable, Optional

import pymupdf

from ._blocks import Block
from ._colors import CATEGORIES, CATEGORY_LABELS, COLOR_RGB


def _chunk_quads_for_sentence(
    page: pymupdf.Page, rect: pymupdf.Rect, sentence: str
) -> list[pymupdf.Quad]:
    """Find a sentence on the page by probing it in short word windows.

    Searching for a whole sentence tends to fail once the sentence wraps,
    because the on-page text carries line breaks / hyphenation that the
    whitespace-normalised ``sentence`` string lacks. The sentence is therefore
    cut into consecutive windows of whole words, each grown until its joined
    text reaches 60 characters (or the words run out). Such a window nearly
    always sits on one line, so ``search_for`` can match it and hand back a
    tight quad. The quads of all windows together cover just the sentence's
    glyphs rather than the paragraph around it.

    Args:
        page: page to search; only ``search_for`` is called on it.
        rect: clip rectangle forwarded to ``search_for``.
        sentence: sentence text; split on whitespace into words.

    Returns:
        All quads reported for the windows, in window order. Windows with no
        hit contribute nothing. The walk stops at the first window shorter
        than 8 characters, so a very short tail is not searched.
    """
    tokens = sentence.split()
    n_tokens = len(tokens)
    hits: list[pymupdf.Quad] = []

    cursor = 0
    while cursor < n_tokens:
        start = cursor
        # ``width`` mirrors len(" ".join(tokens[start:cursor])): the word
        # lengths plus one separator between neighbouring words.
        width = 0
        while cursor < n_tokens and width < 60:
            if cursor > start:
                width += 1
            width += len(tokens[cursor])
            cursor += 1

        probe = " ".join(tokens[start:cursor])
        if len(probe) < 8:
            # Only the last window can be this short; too little text to
            # search for, so stop here.
            break

        matches = page.search_for(probe, clip=rect, quads=True)
        if matches:
            hits.extend(matches)

    return hits
def _search_quads_for_sentence(
    page: pymupdf.Page, rect: pymupdf.Rect, sentence: str
) -> list[pymupdf.Quad]:
    """Locate a sentence's glyphs, tightest-match first.

    1. Prefix probes of at most 120, 80 and 50 characters (the fast path
       for single-line sentences; a sentence of up to 120 characters is
       probed whole).
    2. Word-window chunks (handles sentences that wrap across lines).

    Probes shorter than 20 characters after stripping are not searched.
    For short sentences several prefixes are the same string; each distinct
    probe is searched only once, so a miss is not repeated.

    We deliberately do NOT fall back to the paragraph's line boxes — that
    paints the entire paragraph and is the source of the over-large block
    highlights.

    Args:
        page: page to search; only ``search_for`` is called on it here.
        rect: clip rectangle forwarded to every search.
        sentence: sentence text to locate.

    Returns:
        The hits of the first probe that matches, otherwise whatever the
        word-window search finds. An empty list means the sentence could
        not be located at all; the caller decides whether to skip it.
    """
    # NOTE(review): a hit on a truncated probe is returned as-is, so it
    # presumably covers only that prefix of a longer sentence — confirm this
    # is the intended trade-off.
    tried = set()
    for limit in (120, 80, 50):
        probe = sentence[:limit].strip()
        if len(probe) < 20 or probe in tried:
            continue
        tried.add(probe)
        found = page.search_for(probe, clip=rect, quads=True)
        if found:
            return list(found)
    return _chunk_quads_for_sentence(page, rect, sentence)
def apply_highlights(
    doc: pymupdf.Document,
    blocks: list[Block],
    *,
    min_confidence: float = 0.0,
    on_info: Optional[Any] = None,
) -> int:
    """Overlay one highlight annotation per classified block. Returns count.

    Args:
        doc: document to annotate in place; indexed by each block's ``page``.
        blocks: candidate blocks. Blocks whose ``category`` is ``None`` are
            ignored.
        min_confidence: skips any classified block whose confidence is below
            the threshold, so a reader can thin out low-certainty highlights.
        on_info: optional callable receiving progress messages (strings)
            while the per-sentence text search runs — this phase is
            CPU-bound and otherwise silent, so without it a long PDF looks
            hung.

    Returns:
        Number of highlight annotations actually added. Sentences that
        cannot be located on their page are skipped and not counted.
    """
    info = on_info or (lambda _msg: None)
    candidates = [
        b for b in blocks if b.category is not None and b.confidence >= min_confidence
    ]
    total = len(candidates)
    info(f" locating {total} sentence(s) to highlight across the PDF")

    added = 0
    for i, b in enumerate(candidates, start=1):
        assert b.category is not None  # candidates filter guarantees this
        page = doc[b.page]
        rect = pymupdf.Rect(*b.bbox)
        # Tight, glyph-hugging quads for the sentence's own text only. We do
        # NOT fall back to the paragraph's line boxes: the bbox is the whole
        # paragraph (sentence units share their paragraph's rect), so a line
        # fill would paint every line of the paragraph — the over-large block
        # highlight. If the sentence can't be located, it is simply skipped.
        quads = _search_quads_for_sentence(page, rect, b.text)
        if quads:
            annot = page.add_highlight_annot(quads)
            annot.set_colors(stroke=COLOR_RGB[b.category])
            # No popup note/comment: a bare highlight keeps the page clean
            # (the colour legend already explains what each colour means).
            annot.update(opacity=0.4)
            added += 1
        # Report progress for every candidate, located or not. Skipping an
        # unlocated sentence must not swallow the every-100th / final
        # message, otherwise the run can end without its closing report.
        if i % 100 == 0 or i == total:
            info(f" located {i}/{total} ({added} highlighted)")
    return added
# Compact legend block sized for a corner overlay. Width is chosen so the # category labels fit at 7pt; height covers 5 swatch rows + 2 signature lines. _LEGEND_W = 210 _LEGEND_H = 112 _MARGIN = 24
def _corner_rect(
    page: pymupdf.Page, corner: str, w: float = _LEGEND_W, h: float = _LEGEND_H
) -> pymupdf.Rect:
    """Build a ``w`` x ``h`` rect sitting along the bottom edge of ``page``.

    The rect is always kept ``_MARGIN`` above the bottom of the page.
    ``corner`` picks the horizontal anchor:

    * ``"ll"`` — lower-left, ``_MARGIN`` in from the left edge;
    * ``"lc"`` — lower-centre, horizontally centred;
    * anything else (``"lr"`` by convention) — lower-right, ``_MARGIN`` in
      from the right edge.
    """
    page_w = page.rect.width
    page_h = page.rect.height

    bottom = page_h - _MARGIN
    top = bottom - h

    if corner == "ll":
        return pymupdf.Rect(_MARGIN, top, _MARGIN + w, bottom)
    if corner == "lc":
        left = (page_w - w) / 2
        return pymupdf.Rect(left, top, left + w, bottom)
    # "lr" and any unrecognised value fall through to lower-right.
    return pymupdf.Rect(page_w - _MARGIN - w, top, page_w - _MARGIN, bottom)
def _draw_legend_overlay(
    page: pymupdf.Page,
    rect: pymupdf.Rect,
    *,
    signature: str,
    model_label: Optional[str],
    source_name: str,
) -> None:
    """Draw the compact colour legend into ``rect`` on ``page``.

    Top to bottom the panel holds a "Semantic highlights" title, one colour
    swatch with a short label for every entry of ``CATEGORIES``, and a
    signature of up to two lines. The panel is filled white at 0.92 opacity
    so it stays readable on top of whatever text lies underneath. It is kept
    deliberately small: an unobtrusive reference, not a full page.

    Args:
        page: page to draw on.
        rect: panel bounds. The frame is drawn on ``rect`` itself; the
            contents are positioned from its top-left corner.
        signature: full signature string, e.g.
            "Highlighted by scitex-scholar v1.0.1 (pdf_highlight) — 2026-04-18 20:27".
            Only the text after its last em dash is shown (as the timestamp);
            without an em dash no timestamp is shown.
        model_label: optional model name for the second signature line.
        source_name: source shown on the first signature line.
    """
    label_rgb = (0.2, 0.2, 0.2)
    signature_rgb = (0.45, 0.45, 0.45)
    labels_by_category = {
        "focal_claim": "claim / finding",
        "focal_method": "novel method",
        "focal_limitation": "limitation",
        "related_supportive": "related (supportive)",
        "related_contradictive": "related (contradictive)",
    }

    # Panel frame and background.
    page.draw_rect(
        rect,
        color=(0.6, 0.6, 0.6),
        fill=(1.0, 1.0, 1.0),
        fill_opacity=0.92,
        width=0.4,
    )

    pad = 6
    left = rect.x0 + pad
    top = rect.y0 + pad

    page.insert_text(
        (left, top + 7),
        "Semantic highlights",
        fontname="helv",
        fontsize=7.5,
        color=label_rgb,
    )

    # One row per category: swatch on the left, short label to its right.
    swatch_w, swatch_h = 10, 7
    row_h = 10
    baseline = top + 19
    for cat in CATEGORIES:
        rgb = COLOR_RGB[cat]
        swatch = pymupdf.Rect(
            left, baseline - swatch_h + 2, left + swatch_w, baseline + 2
        )
        page.draw_rect(swatch, color=rgb, fill=rgb, fill_opacity=0.4, width=0.25)
        page.insert_text(
            (left + swatch_w + 5, baseline),
            labels_by_category[cat],
            fontname="helv",
            fontsize=6.5,
            color=label_rgb,
        )
        baseline += row_h

    # Signature at 5.5pt — line 1: source; line 2: model and/or timestamp,
    # omitted entirely when neither is available.
    _, dash, tail = signature.rpartition("—")
    stamp = tail.strip() if dash else ""
    second_line = " · ".join(bit for bit in (model_label, stamp) if bit)

    page.insert_text(
        (left, baseline + 4),
        f"scitex-scholar · {source_name}",
        fontname="helv",
        fontsize=5.5,
        color=signature_rgb,
    )
    if second_line:
        page.insert_text(
            (left, baseline + 11),
            second_line,
            fontname="helv",
            fontsize=5.5,
            color=signature_rgb,
        )

    # CATEGORY_LABELS kept in import for future full-form rendering (e.g.
    # --legend-verbose) even though the short labels are used here.
    _ = CATEGORY_LABELS
def add_legend(
    doc: pymupdf.Document,
    *,
    signature: str,
    model_label: Optional[str],
    source_name: str,
    corner: str = "lr",
) -> None:
    """Stamp the compact legend overlay onto the last page of ``doc``.

    The panel goes in the lower-right corner by default (``"lr"``);
    ``"ll"`` (lower-left) and ``"lc"`` (lower-centre) are the alternatives.
    No pages are added: the panel is drawn over whatever the last page
    already contains, on its own white background.

    ``signature``, ``model_label`` and ``source_name`` are passed through
    unchanged to ``_draw_legend_overlay``.
    """
    last_page = doc[-1]
    _draw_legend_overlay(
        last_page,
        _corner_rect(last_page, corner),
        signature=signature,
        model_label=model_label,
        source_name=source_name,
    )
# Back-compat aliases — older callers referenced add_legend_page and # add_legend_footer; both now collapse to the corner overlay on the # last page, which is what the user confirmed they want. add_legend_page = add_legend add_legend_footer = add_legend