Coverage for src / dataknobs_xization / markdown / md_chunker.py: 66%
157 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-26 16:15 -0700
1"""Markdown chunker for generating RAG-optimized chunks from tree structures.
3This module provides functionality to traverse markdown tree structures and
4generate chunks suitable for RAG (Retrieval-Augmented Generation) applications.
5"""
7from __future__ import annotations
9from dataclasses import dataclass, field
10from enum import Enum
11from typing import Any, Iterator
13from dataknobs_structures.tree import Tree
15from dataknobs_xization.markdown.md_parser import MarkdownNode
16from dataknobs_xization.markdown.enrichment import build_enriched_text
17from dataknobs_xization.markdown.filters import ChunkQualityConfig, ChunkQualityFilter
class ChunkFormat(Enum):
    """Output format for chunk text."""

    MARKDOWN = "markdown"  # Include headings as markdown ('#'-prefixed) lines
    PLAIN = "plain"  # Plain text without markdown formatting
    DICT = "dict"  # Return chunk as a dictionary
class HeadingInclusion(Enum):
    """Strategy for including headings in chunks."""

    IN_TEXT = "in_text"  # Include headings in chunk text only
    IN_METADATA = "in_metadata"  # Include headings only in metadata
    BOTH = "both"  # Include in both text and metadata
    NONE = "none"  # Don't include headings
@dataclass
class ChunkMetadata:
    """Metadata attached to a single document chunk.

    Attributes:
        headings: Heading texts along the path from root to the chunk
        heading_levels: Heading levels parallel to ``headings``
        line_number: Starting line number in the source document
        chunk_index: Sequential index of this chunk in the output
        chunk_size: Length of the chunk text in characters
        content_length: Length of the content excluding headings (used for
            quality decisions)
        heading_display: Formatted heading path for display
        embedding_text: Heading-enriched text for embedding (optional)
        custom: Additional caller-supplied metadata
    """

    headings: list[str] = field(default_factory=list)
    heading_levels: list[int] = field(default_factory=list)
    line_number: int = 0
    chunk_index: int = 0
    chunk_size: int = 0
    content_length: int = 0
    heading_display: str = ""
    embedding_text: str = ""
    custom: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a flat dictionary view of this metadata."""
        payload: dict[str, Any] = {
            "headings": self.headings,
            "heading_levels": self.heading_levels,
            "line_number": self.line_number,
            "chunk_index": self.chunk_index,
            "chunk_size": self.chunk_size,
            "content_length": self.content_length,
            "heading_display": self.heading_display,
        }
        # Custom entries may deliberately override the standard keys.
        payload.update(self.custom)
        # embedding_text is omitted unless it was actually generated.
        if self.embedding_text:
            payload["embedding_text"] = self.embedding_text
        return payload

    def get_heading_path(self, separator: str = " > ") -> str:
        """Join the heading hierarchy into a single display string.

        Args:
            separator: String placed between consecutive headings

        Returns:
            Formatted heading path
        """
        return separator.join(self.headings)
@dataclass
class Chunk:
    """A piece of document text paired with its metadata.

    Attributes:
        text: The chunk text content
        metadata: Metadata for this chunk
    """

    text: str
    metadata: ChunkMetadata

    def to_dict(self) -> dict[str, Any]:
        """Return a dictionary with the text and serialized metadata."""
        return {"text": self.text, "metadata": self.metadata.to_dict()}

    def to_markdown(self, include_headings: bool = True) -> str:
        """Render the chunk as markdown, optionally with its heading path.

        Args:
            include_headings: Whether to prepend the heading hierarchy

        Returns:
            Markdown-formatted string
        """
        if not include_headings or not self.metadata.headings:
            return self.text

        # Heading lines from root to this chunk, rendered as markdown.
        parts = [
            f"{'#' * depth} {title}"
            for title, depth in zip(
                self.metadata.headings, self.metadata.heading_levels
            )
        ]
        # Separate body text from the headings with a blank line.
        if self.text:
            parts.extend(["", self.text])
        return "\n".join(parts)
class MarkdownChunker:
    """Chunker for generating chunks from markdown tree structures.

    Traverses a Tree built from markdown and generates chunks with
    configurable size, heading inclusion, and output format.
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        chunk_overlap: int = 100,
        heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
        chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
        combine_under_heading: bool = True,
        quality_filter: ChunkQualityConfig | None = None,
        generate_embeddings: bool = False,
    ):
        """Initialize the markdown chunker.

        Args:
            max_chunk_size: Maximum size of chunk text in characters
            chunk_overlap: Number of characters to overlap between chunks
            heading_inclusion: How to include headings in chunks
            chunk_format: Output format for chunks
            combine_under_heading: Whether to combine body text under same heading
            quality_filter: Optional config for filtering low-quality chunks
            generate_embeddings: Whether to generate heading-enriched embedding text
        """
        self.max_chunk_size = max_chunk_size
        self.chunk_overlap = chunk_overlap
        self.heading_inclusion = heading_inclusion
        self.chunk_format = chunk_format
        self.combine_under_heading = combine_under_heading
        self.generate_embeddings = generate_embeddings
        self._chunk_index = 0
        # The quality filter is only built when a config was supplied.
        self._quality_filter = (
            ChunkQualityFilter(quality_filter) if quality_filter is not None else None
        )

    def chunk(self, tree: Tree) -> Iterator[Chunk]:
        """Generate chunks from a markdown tree.

        Args:
            tree: Tree structure built from markdown

        Yields:
            Chunk objects with text and metadata
        """
        self._chunk_index = 0

        # Leaf content nodes only: skip headings and the synthetic root.
        leaves = tree.collect_terminal_nodes(
            accept_node_fn=lambda n: (
                isinstance(n.data, MarkdownNode)
                and not n.data.is_heading()
                and n.data.node_type != "root"
            )
        )

        producer = (
            self._chunk_by_heading(leaves)
            if self.combine_under_heading
            else self._chunk_individually(leaves)
        )

        # Drop low-quality chunks when a filter was configured.
        for candidate in producer:
            if self._quality_filter is None or self._quality_filter.is_valid(candidate):
                yield candidate

    def _chunk_by_heading(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
        """Group nodes under same heading and chunk them.

        Args:
            terminal_nodes: List of terminal tree nodes

        Yields:
            Chunk objects
        """
        # Bucket leaves by their immediate parent (insertion order preserved).
        groups: dict[Tree, list[Tree]] = {}
        for leaf in terminal_nodes:
            groups.setdefault(leaf.parent, []).append(leaf)

        for parent, members in groups.items():
            headings, levels = self._get_heading_path(parent)

            # Atomic constructs (code blocks, tables, ...) stay whole;
            # regular body text may be merged and re-split.
            atomics = [m for m in members if m.data.is_atomic()]
            bodies = [m for m in members if not m.data.is_atomic()]

            if bodies:
                merged = "\n".join(
                    m.data.text for m in bodies if m.data.text.strip()
                )
                if merged.strip():
                    start_line = bodies[0].data.line_number if bodies else 0
                    for piece in self._split_text(merged):
                        yield self._create_chunk(
                            text=piece,
                            headings=headings,
                            heading_levels=levels,
                            line_number=start_line,
                        )

            for atom in atomics:
                # Never split atomic constructs, even past max_chunk_size.
                yield self._create_chunk(
                    text=atom.data.text,
                    headings=headings,
                    heading_levels=levels,
                    line_number=atom.data.line_number,
                    metadata=atom.data.metadata,
                    node_type=atom.data.node_type,
                )

    def _chunk_individually(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
        """Process each terminal node individually.

        Args:
            terminal_nodes: List of terminal tree nodes

        Yields:
            Chunk objects
        """
        for leaf in terminal_nodes:
            if not leaf.data.text.strip():
                continue

            headings, levels = self._get_heading_path(leaf.parent)

            if leaf.data.is_atomic():
                # Atomic constructs are emitted as a single chunk.
                yield self._create_chunk(
                    text=leaf.data.text,
                    headings=headings,
                    heading_levels=levels,
                    line_number=leaf.data.line_number,
                    metadata=leaf.data.metadata,
                    node_type=leaf.data.node_type,
                )
                continue

            # Plain body text may be split to honor max_chunk_size.
            for piece in self._split_text(leaf.data.text):
                yield self._create_chunk(
                    text=piece,
                    headings=headings,
                    heading_levels=levels,
                    line_number=leaf.data.line_number,
                )

    def _get_heading_path(self, node: Tree | None) -> tuple[list[str], list[int]]:
        """Get the heading path from root to this node.

        Args:
            node: Tree node to get path for

        Returns:
            Tuple of (heading_texts, heading_levels)
        """
        texts: list[str] = []
        depths: list[int] = []

        # Walk upward collecting headings, then reverse to root-first order.
        cursor = node
        while cursor is not None:
            data = cursor.data
            if isinstance(data, MarkdownNode) and data.is_heading():
                texts.append(data.text)
                depths.append(data.level)
            cursor = cursor.parent

        texts.reverse()
        depths.reverse()
        return texts, depths

    def _split_text(self, text: str) -> list[str]:
        """Split text into chunks respecting max_chunk_size.

        Args:
            text: Text to split

        Returns:
            List of text chunks
        """
        if len(text) <= self.max_chunk_size:
            return [text]

        pieces: list[str] = []
        pos = 0
        length = len(text)

        while pos < length:
            cut = pos + self.max_chunk_size

            # Unless this is the final chunk, prefer a natural boundary.
            if cut < length:
                # 1) paragraph boundary (double newline)
                para = text.rfind("\n\n", pos, cut)
                if para > pos:
                    cut = para + 2
                else:
                    # 2) sentence boundary
                    for mark in (". ", "! ", "? ", ".\n", "!\n", "?\n"):
                        hit = text.rfind(mark, pos, cut)
                        if hit > pos:
                            cut = hit + len(mark)
                            break
                    else:
                        # 3) word boundary
                        space = text.rfind(" ", pos, cut)
                        if space > pos:
                            cut = space + 1

            pieces.append(text[pos:cut].strip())

            # Advance, overlapping by chunk_overlap but always progressing.
            pos = max(pos + 1, cut - self.chunk_overlap)

        return [p for p in pieces if p]  # drop empty chunks

    def _create_chunk(
        self,
        text: str,
        headings: list[str],
        heading_levels: list[int],
        line_number: int,
        metadata: dict[str, Any] | None = None,
        node_type: str = "body",
    ) -> Chunk:
        """Create a chunk with appropriate format and metadata.

        Args:
            text: Body text for chunk
            headings: List of heading texts
            heading_levels: List of heading levels
            line_number: Source line number
            metadata: Optional metadata from the source node
            node_type: Type of node ('body', 'code', 'list', 'table', etc.)

        Returns:
            Formatted Chunk object
        """
        # Content length is measured before any headings are prepended.
        content_length = len(text)
        chunk_text = text

        if self.heading_inclusion in (HeadingInclusion.IN_TEXT, HeadingInclusion.BOTH):
            prefix = []
            for title, depth in zip(headings, heading_levels):
                if self.chunk_format == ChunkFormat.MARKDOWN:
                    prefix.append(f"{'#' * depth} {title}")
                else:
                    prefix.append(title)
            if prefix:
                chunk_text = "\n".join(prefix) + "\n\n" + text

        # Node type plus any source-node metadata travels in `custom`.
        custom_metadata = {"node_type": node_type}
        if metadata:
            custom_metadata.update(metadata)

        heading_display = " > ".join(headings) if headings else ""

        embedding_text = ""
        if self.generate_embeddings:
            embedding_text = build_enriched_text(headings, text)

        # Headings land in metadata only for IN_METADATA / BOTH.
        include_headings = self.heading_inclusion in (
            HeadingInclusion.IN_METADATA,
            HeadingInclusion.BOTH,
        )

        chunk_metadata = ChunkMetadata(
            headings=headings if include_headings else [],
            heading_levels=heading_levels if include_headings else [],
            line_number=line_number,
            chunk_index=self._chunk_index,
            chunk_size=len(chunk_text),
            content_length=content_length,
            heading_display=heading_display,
            embedding_text=embedding_text,
            custom=custom_metadata,
        )

        self._chunk_index += 1

        return Chunk(text=chunk_text, metadata=chunk_metadata)
def chunk_markdown_tree(
    tree: Tree,
    max_chunk_size: int = 1000,
    chunk_overlap: int = 100,
    heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
    chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
    combine_under_heading: bool = True,
    quality_filter: ChunkQualityConfig | None = None,
    generate_embeddings: bool = False,
) -> list[Chunk]:
    """Generate chunks from a markdown tree.

    Convenience function that builds a MarkdownChunker and drains it.

    Args:
        tree: Tree structure built from markdown
        max_chunk_size: Maximum size of chunk text in characters
        chunk_overlap: Number of characters to overlap between chunks
        heading_inclusion: How to include headings in chunks
        chunk_format: Output format for chunks
        combine_under_heading: Whether to combine body text under same heading
        quality_filter: Optional config for filtering low-quality chunks
        generate_embeddings: Whether to generate heading-enriched embedding text

    Returns:
        List of Chunk objects
    """
    return list(
        MarkdownChunker(
            max_chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            heading_inclusion=heading_inclusion,
            chunk_format=chunk_format,
            combine_under_heading=combine_under_heading,
            quality_filter=quality_filter,
            generate_embeddings=generate_embeddings,
        ).chunk(tree)
    )