Source code for scitex_scholar.pdf_highlight._classifier

"""LLM and offline classifiers for the semantic highlighter."""

from __future__ import annotations

import json
from typing import Any, Optional

from ._blocks import Block
from ._colors import CATEGORIES

CLASSIFIER_SYSTEM = """You tag sentences from an academic paper into at most one of these rhetorical categories. Emit "none" when nothing clearly fits — precision matters more than recall.

Categories (use these exact strings):
  focal_claim            — THIS paper states, clarifies, suggests, demonstrates, establishes, or
                           reports a finding, result, or interpretation of its own. First-person
                           stance markers ("we show/find/demonstrate/suggest/clarify/establish",
                           "our results", "this finding", "these data indicate") are strong signals.
                           Quantitative results ("β = -1.5", "5/6 subjects", "accuracy 0.85") are
                           usually claims when attached to the paper's own analysis.
  focal_method           — description of THIS paper's own novel method, model, cohort, or analysis
                           pipeline. NOT routine study logistics. NOT background on existing methods.
  focal_limitation       — self-admitted limitation, caveat, confound, or threat to validity of
                           THIS paper. Hedging that names a weakness in the paper's own work.
  related_supportive     — a specific prior/other paper whose finding SUPPORTS this paper's position
                           ("consistent with X (2019)", "as shown by Y", "corroborates").
  related_contradictive  — a specific prior/other paper whose finding CONTRADICTS this paper
                           ("in contrast to X", "unlike Y", "disagrees with").
  none                   — background setup without a stance, reference list entries, headers,
                           figure/table prose, boilerplate. Default when uncertain.

Important: the priority is catching CLAIMS. If a sentence both describes a method and reports
what that method yielded, prefer focal_claim. If a block mentions prior work but does not take a
supportive or contradictive stance toward it, return "none".

Confidence in [0,1]: <0.5 = guess, >0.8 = clear-cut.

Respond with ONLY a JSON array of objects: {"id": int, "category": str, "confidence": float}. Include every input id exactly once."""


def _extract_text_from_message(msg: Any) -> str:
    for block in msg.content:
        if getattr(block, "type", None) == "text":
            return block.text
    return ""


def _strip_code_fence(raw: str) -> str:
    raw = raw.strip()
    if raw.startswith("```"):
        raw = raw.split("```", 2)[1]
        if raw.startswith("json"):
            raw = raw[4:]
        raw = raw.strip()
    return raw


[docs] def classify_llm( blocks: list[Block], model: str, batch_size: int = 25, on_warning: Optional[Any] = None, ) -> None: """Classify blocks in-place by calling the Anthropic Messages API.""" import anthropic client = anthropic.Anthropic() for start in range(0, len(blocks), batch_size): batch = blocks[start : start + batch_size] payload = [{"id": b.id, "text": b.text} for b in batch] msg = client.messages.create( model=model, max_tokens=2048, system=CLASSIFIER_SYSTEM, messages=[ { "role": "user", "content": ( f"Classify these {len(batch)} units:\n\n" f"{json.dumps(payload, ensure_ascii=False)}" ), } ], ) raw = _strip_code_fence(_extract_text_from_message(msg)) try: preds = json.loads(raw) except json.JSONDecodeError as exc: if on_warning is not None: on_warning(f"parse failure at batch {start}: {exc}") continue by_id = {b.id: b for b in batch} for p in preds: b = by_id.get(p.get("id")) if b is None: continue cat = p.get("category", "none") if cat in CATEGORIES: b.category = cat b.confidence = float(p.get("confidence", 0.0))
[docs] def classify_stub(blocks: list[Block]) -> None: """Offline keyword heuristic. No API calls. Useful for smoke tests.""" rules = [ ( "focal_limitation", ("limitation", "caveat", "however, our", "we did not", "a threat to"), ), ( "focal_method", ("we propose", "we introduce", "our method", "our approach", "we develop"), ), ( "focal_claim", ( "we show", "we find", "we demonstrate", "we suggest", "we clarify", "we establish", "our results", "we report", "this finding", ), ), ( "related_contradictive", ("in contrast", "unlike", "disagree", "contrary to", "fails to"), ), ( "related_supportive", ( "consistent with", "in line with", "as shown by", "supports", "corroborat", ), ), ] for b in blocks: low = b.text.lower() for cat, needles in rules: if any(n in low for n in needles): b.category = cat b.confidence = 0.5 break