Coverage for src/dataknobs_xization/markdown/enrichment.py: 66%
61 statements
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-18 17:41 -0700
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-18 17:41 -0700
1"""Heading enrichment utilities for RAG-optimized chunk embeddings.
3This module provides utilities to enrich chunk content with heading context
4for improved semantic search, while keeping headings out of the displayed content.
5"""
7from __future__ import annotations
9from dataclasses import dataclass
10from typing import Any
def is_multiword(heading: str) -> bool:
    """Report whether a heading is made up of two or more words.

    Words are the whitespace-delimited tokens produced by ``str.split()``,
    so hyphenated text such as "Chain-of-Thought" counts as a single word
    and leading/trailing whitespace is ignored.

    Args:
        heading: The heading text to check

    Returns:
        True if the heading has more than one word
    """
    words = heading.split()
    return len(words) >= 2
def format_heading_display(
    heading_path: list[str],
    separator: str = " > ",
) -> str:
    """Join a heading path into a single display string.

    Args:
        heading_path: List of headings from root to chunk
        separator: Text placed between consecutive headings

    Returns:
        The headings joined by ``separator``, or an empty string when
        ``heading_path`` is empty
    """
    return separator.join(heading_path) if heading_path else ""
def get_dynamic_heading_display(
    heading_path: list[str],
    content_length: int,
    small_threshold: int = 200,
    medium_threshold: int = 800,
) -> str:
    """Get heading display based on content length.

    Dynamic heading inclusion:
    - Small chunks (< small_threshold): Full heading path
    - Medium chunks (< medium_threshold): Last 2 headings
    - Large chunks: No headings

    Both thresholds are exclusive upper bounds, which is the same rule
    ``get_relevant_headings_for_display`` applies, so the two functions
    select the same headings for any given content length.

    Args:
        heading_path: List of headings from root to chunk
        content_length: Length of chunk content in characters
        small_threshold: Exclusive upper bound (chars) for "small" chunks
        medium_threshold: Exclusive upper bound (chars) for "medium" chunks

    Returns:
        The selected headings joined with " > ", or an empty string when
        there are no headings or the chunk is large
    """
    if not heading_path:
        return ""

    if content_length < small_threshold:
        # Small chunks: include full heading path
        shown = heading_path
    elif content_length < medium_threshold:
        # Medium chunks: include last 2 heading levels. Slicing already
        # copes with paths shorter than two entries.
        shown = heading_path[-2:]
    else:
        # Large chunks: omit headings
        return ""

    return " > ".join(shown)
@dataclass
class EnrichedChunkData:
    """Result record pairing a chunk's clean text with its heading context.

    Attributes:
        content: Clean content text (no headings)
        embedding_text: Heading-enriched text intended for embedding
        heading_path: List of headings from root to chunk
        heading_display: Heading path rendered as a single display string
        content_length: Length of the clean content in characters
    """

    content: str
    embedding_text: str
    heading_path: list[str]
    heading_display: str
    content_length: int
def build_enriched_text(heading_path: list[str], content: str) -> str:
    """Build text for embedding with relevant heading context.

    Headings are collected upward from the chunk (deepest first) until and
    including the first multi-word heading. This provides semantic context
    without over-weighting deep, single-word labels like "Example".

    A heading is "multi-word" when ``str.split()`` yields more than one
    token, so hyphenated headings such as "Chain-of-Thought" count as a
    single word. If no heading in the path is multi-word, the whole path is
    used.

    Args:
        heading_path: List of heading texts from root to chunk
        content: The chunk content text

    Returns:
        The selected headings and the content joined with ": ", or the
        content unchanged when ``heading_path`` is empty

    Examples:
        >>> build_enriched_text(["Guide", "Prompt Patterns", "Example"], "code here")
        'Prompt Patterns: Example: code here'

        >>> build_enriched_text(["Patterns", "Chain-of-Thought", "Example"], "code here")
        'Patterns: Chain-of-Thought: Example: code here'

        >>> build_enriched_text(["API Reference", "Authentication", "OAuth 2.0"], "...")
        'OAuth 2.0: ...'

        >>> build_enriched_text(["Setup"], "install steps")
        'Setup: install steps'

        >>> build_enriched_text([], "standalone content")
        'standalone content'
    """
    if not heading_path:
        return content

    # Walk from the deepest heading towards the root, collecting in that
    # order; appending and reversing once avoids repeated front-insertion.
    collected = []
    for heading in reversed(heading_path):
        collected.append(heading)
        # Stop after including a multi-word heading
        if len(heading.split()) > 1:
            break
    collected.reverse()

    # heading_path is non-empty here, so at least one heading was collected.
    prefix = ": ".join(collected)
    return f"{prefix}: {content}"
def extract_heading_metadata(
    headings: list[str],
    heading_levels: list[int],
    separator: str = " > ",
) -> dict[str, Any]:
    """Bundle a chunk's heading information into a metadata dictionary.

    The input lists are stored as-is (not copied) under their keys.

    Args:
        headings: List of heading texts from root to chunk
        heading_levels: Corresponding heading levels (1-6)
        separator: Separator for display string

    Returns:
        Dictionary with the keys ``heading_path``, ``heading_levels``,
        ``heading_display`` (headings joined by ``separator``, empty string
        when there are none) and ``heading_depth`` (number of headings)
    """
    return dict(
        heading_path=headings,
        heading_levels=heading_levels,
        heading_display=separator.join(headings) if headings else "",
        heading_depth=len(headings),
    )
def get_relevant_headings_for_display(
    heading_path: list[str],
    content_length: int,
    small_threshold: int = 200,
    medium_threshold: int = 800,
) -> list[str]:
    """Select which headings accompany a chunk, based on its length.

    Selection rule:
    - Small chunks (< small_threshold): the full heading path
    - Medium chunks (< medium_threshold): the last 2 heading levels
    - Large chunks: no headings

    When nothing is trimmed, the ``heading_path`` list object itself is
    returned rather than a copy.

    Args:
        heading_path: List of heading texts from root to chunk
        content_length: Length of chunk content in characters
        small_threshold: Exclusive upper bound (chars) for "small" chunks
        medium_threshold: Exclusive upper bound (chars) for "medium" chunks

    Returns:
        List of headings to display
    """
    if not heading_path:
        return []

    # Small chunk: the whole path is kept for context.
    if content_length < small_threshold:
        return heading_path

    # Medium chunk: keep only the two deepest levels.
    if content_length < medium_threshold:
        needs_trim = len(heading_path) > 2
        return heading_path[-2:] if needs_trim else heading_path

    # Large chunk: no headings.
    return []
def format_heading_for_display(
    headings: list[str],
    heading_levels: list[int] | None = None,
    format_style: str = "markdown",
) -> str:
    """Render headings as text for inclusion in LLM context.

    Markdown output is produced only when ``format_style`` is "markdown"
    and ``heading_levels`` is non-empty; every other combination (including
    unrecognised styles) yields the " > " separated path.

    Args:
        headings: List of heading texts to display
        heading_levels: Corresponding levels (used for markdown format).
            Headings and levels are paired positionally; if the lists
            differ in length, the extra entries of the longer one are
            ignored.
        format_style: "markdown" for # syntax, "path" for > separated

    Returns:
        Formatted heading string, or an empty string when ``headings`` is
        empty
    """
    if not headings:
        return ""

    if format_style == "markdown" and heading_levels:
        # One line per heading, prefixed with `level` '#' characters.
        return "\n".join(
            f"{'#' * depth} {text}"
            for text, depth in zip(headings, heading_levels)
        )

    # "path" style, markdown without levels, or any other style.
    return " > ".join(headings)
def enrich_chunk(
    content: str,
    headings: list[str],
    heading_levels: list[int],
) -> EnrichedChunkData:
    """Assemble an ``EnrichedChunkData`` record from a chunk's raw parts.

    Convenience wrapper: the embedding text comes from
    ``build_enriched_text`` and the display string is the heading path
    joined with " > ".

    Args:
        content: Raw chunk content text
        headings: List of heading texts from root to chunk
        heading_levels: Corresponding heading levels. Accepted for
            interface symmetry; this function does not currently read it.

    Returns:
        EnrichedChunkData with all computed fields
    """
    enriched = build_enriched_text(headings, content)
    display = " > ".join(headings) if headings else ""
    length = len(content)

    return EnrichedChunkData(
        content=content,
        embedding_text=enriched,
        heading_path=headings,
        heading_display=display,
        content_length=length,
    )