# Source code for askemblaex.window

"""
askemblaex/window.py

Dynamic extraction windows for windowed entity extraction.

Builds a sliding context window centred on an anchor page, including
surrounding pages for broader context. The window text is formatted
for use as an AI extraction prompt.
"""

from __future__ import annotations

from dataclasses import dataclass, field


@dataclass
class ExtractionWindow:
    """
    Result of assembling the context window around one anchor page.

    Attributes:
        anchor_page: Zero-based number of the page being extracted from
            (the target of the window).
        pages_included: Every page number present in the window, in
            ascending order; this is the anchor together with whatever
            context pages were added.
        text: The concatenated window text, in the form that is sent to
            an AI model.
        char_count: Length of :attr:`text` in characters.
    """

    anchor_page: int
    pages_included: list[int]
    text: str
    char_count: int
def build_dynamic_extraction_window(
    pages: dict[int, str],
    anchor_page: int,
    *,
    context_pages: int = 2,
    max_chars: int = 30_000,
) -> ExtractionWindow:
    """
    Build a text window centred on *anchor_page* with surrounding context.

    The anchor page is rendered first and always appears in the window; if
    its block alone is longer than *max_chars* it is hard-truncated to that
    length. Context pages are then offered alternately from before and
    after the anchor, nearest page first, until the character budget is
    used up or *context_pages* pages on each side have been considered.
    A context page whose block (plus its ``"\\n\\n"`` separator) does not
    fit in the remaining budget is skipped, and pages further out are
    still tried, so the window may contain gaps.

    Args:
        pages: Mapping of page_number → reconciled text for all available
            pages in the document.
        anchor_page: The page to extract entities from (the TARGET). If it
            is missing from *pages* its text is treated as empty.
        context_pages: Maximum number of pages to include on each side.
            Zero or a negative value yields an anchor-only window.
        max_chars: Character budget for the combined window text,
            separators included. Negative values are treated as zero.

    Returns:
        :class:`ExtractionWindow` with the assembled text and metadata.
    """
    separator = "\n\n"

    # Negative limits would silently change the meaning of the slices
    # below (``xs[:-1]`` trims from the end), so clamp them to zero.
    budget = max(max_chars, 0)
    per_side = max(context_pages, 0)

    anchor_text = pages.get(anchor_page, "")
    anchor_block = f"=== PAGE {anchor_page} (TARGET) ===\n{anchor_text}"

    # If the anchor page alone exceeds the budget, hard-truncate it.
    if len(anchor_block) > budget:
        anchor_block = anchor_block[:budget]
    remaining = budget - len(anchor_block)

    sorted_nums = sorted(pages)
    earlier = [n for n in sorted_nums if n < anchor_page]
    later = [n for n in sorted_nums if n > anchor_page]

    # ``earlier[-0:]`` is the *whole* list, not an empty one, so the zero
    # case must be handled explicitly.
    before = earlier[-per_side:] if per_side else []
    after = later[:per_side]

    # Interleave the candidates nearest-first (before, after, before, ...)
    # so that both sides get a fair share of the budget.
    candidates: list[tuple[int, bool]] = []
    for offset in range(max(len(before), len(after))):
        if offset < len(before):
            candidates.append((before[-1 - offset], True))
        if offset < len(after):
            candidates.append((after[offset], False))

    included: list[int] = []
    before_blocks: list[str] = []
    after_blocks: list[str] = []

    for pn, is_before in candidates:
        if remaining <= 0:
            break
        block = f"=== PAGE {pn} ===\n{pages[pn]}"
        # Every context block is joined to its neighbour with a separator,
        # so the separator counts towards whether the block fits.
        cost = len(block) + len(separator)
        if cost > remaining:
            continue
        if is_before:
            before_blocks.append(block)
        else:
            after_blocks.append(block)
        included.append(pn)
        remaining -= cost

    # Preceding pages were collected nearest-first; restore document order.
    before_blocks.reverse()

    parts = before_blocks + [anchor_block] + after_blocks
    text = separator.join(parts)

    return ExtractionWindow(
        anchor_page=anchor_page,
        pages_included=sorted([anchor_page] + included),
        text=text,
        char_count=len(text),
    )