# Source code for askemblaex.window
"""
askemblaex/window.py
Dynamic extraction windows for windowed entity extraction.
Builds a sliding context window centred on an anchor page, including
surrounding pages for broader context. The window text is formatted
for use as an AI extraction prompt.
"""
from __future__ import annotations
from dataclasses import dataclass, field
[docs]
@dataclass
class ExtractionWindow:
    """
    Result of assembling the context window around one anchor page.

    Attributes:
        anchor_page: Page number of the extraction target.
        pages_included: Page numbers that made it into the window, in
            ascending order (the anchor plus any context pages).
        text: The combined window text, ready to send to an AI model.
        char_count: Number of characters in :attr:`text`.
    """

    # Field order is part of the positional constructor signature.
    anchor_page: int
    pages_included: list[int]
    text: str
    char_count: int
[docs]
def build_dynamic_extraction_window(
    pages: dict[int, str],
    anchor_page: int,
    *,
    context_pages: int = 2,
    max_chars: int = 30_000,
) -> ExtractionWindow:
    """
    Build a text window centred on *anchor_page* with surrounding context.

    The anchor page is always placed in the window (hard-truncated to
    *max_chars* if it alone exceeds the budget). Context pages are then
    considered alternately, nearest first, from before and after the
    anchor. A context page that does not fit in the remaining budget is
    skipped, and the search continues with the next page further out.

    Args:
        pages: Mapping of page_number → reconciled text for all
            available pages in the document.
        anchor_page: The page to extract entities from (the TARGET). If it
            is absent from *pages*, its text is treated as empty.
        context_pages: Maximum number of pages to include on each side.
            Zero or a negative value means "no context pages".
        max_chars: Soft character budget for the combined window text.
            The fit check ignores the ``"\\n\\n"`` separator, so the result
            may overshoot the budget by the length of one separator.

    Returns:
        :class:`ExtractionWindow` with the assembled text and metadata.
    """
    sorted_nums = sorted(pages)
    anchor_text = pages.get(anchor_page, "")
    anchor_block = f"=== PAGE {anchor_page} (TARGET) ===\n{anchor_text}"

    # If the anchor page alone exceeds the budget, hard-truncate it.
    if len(anchor_block) > max_chars:
        anchor_block = anchor_block[:max_chars]
    remaining = max_chars - len(anchor_block)

    # Guard the slices: ``seq[-0:]`` is the *whole* sequence, so an
    # unguarded ``[-context_pages:]`` would pull in every earlier page
    # when context_pages == 0 (and negative values slice nonsensically).
    if context_pages > 0:
        before = [n for n in sorted_nums if n < anchor_page][-context_pages:]
        after = [n for n in sorted_nums if n > anchor_page][:context_pages]
    else:
        before = []
        after = []

    included: list[int] = []
    before_blocks: list[str] = []
    after_blocks: list[str] = []

    # Alternate adding from before/after to keep the budget balanced.
    # ``bi`` walks ``before`` from the page nearest the anchor outwards;
    # ``ai`` walks ``after`` in the same nearest-first order.
    bi, ai = len(before) - 1, 0
    while remaining > 0 and (bi >= 0 or ai < len(after)):
        if bi >= 0:
            pn = before[bi]
            block = f"=== PAGE {pn} ===\n{pages[pn]}"
            if len(block) <= remaining:
                # Prepend so earlier pages stay in document order.
                before_blocks.insert(0, block)
                included.append(pn)
                remaining -= len(block) + 2  # +2 for the "\n\n" separator
            # Advance even when the page was too large to fit.
            bi -= 1
        if ai < len(after) and remaining > 0:
            pn = after[ai]
            block = f"=== PAGE {pn} ===\n{pages[pn]}"
            if len(block) <= remaining:
                after_blocks.append(block)
                included.append(pn)
                remaining -= len(block) + 2
            ai += 1

    parts = before_blocks + [anchor_block] + after_blocks
    text = "\n\n".join(parts)
    return ExtractionWindow(
        anchor_page=anchor_page,
        pages_included=sorted([anchor_page] + included),
        text=text,
        char_count=len(text),
    )