Coverage for src / dataknobs_xization / markdown / md_chunker.py: 66%

157 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-26 16:15 -0700

1"""Markdown chunker for generating RAG-optimized chunks from tree structures. 

2 

3This module provides functionality to traverse markdown tree structures and 

4generate chunks suitable for RAG (Retrieval-Augmented Generation) applications. 

5""" 

6 

7from __future__ import annotations 

8 

9from dataclasses import dataclass, field 

10from enum import Enum 

11from typing import Any, Iterator 

12 

13from dataknobs_structures.tree import Tree 

14 

15from dataknobs_xization.markdown.md_parser import MarkdownNode 

16from dataknobs_xization.markdown.enrichment import build_enriched_text 

17from dataknobs_xization.markdown.filters import ChunkQualityConfig, ChunkQualityFilter 

18 

19 

class ChunkFormat(Enum):
    """Supported output formats for the text of a generated chunk."""

    # Headings rendered with '#' markers, body kept as markdown.
    MARKDOWN = "markdown"
    # Headings and body emitted as unformatted plain text.
    PLAIN = "plain"
    # Chunk returned as a dictionary representation.
    DICT = "dict"

26 

27 

class HeadingInclusion(Enum):
    """Strategies for where heading context is attached to a chunk."""

    # Headings prepended to the chunk text only.
    IN_TEXT = "in_text"
    # Headings recorded only in the chunk metadata.
    IN_METADATA = "in_metadata"
    # Headings placed in both the text and the metadata.
    BOTH = "both"
    # Headings omitted entirely.
    NONE = "none"

35 

36 

@dataclass
class ChunkMetadata:
    """Metadata describing one document chunk.

    Attributes:
        headings: Heading texts on the path from the root down to the chunk
        heading_levels: Heading levels parallel to ``headings``
        line_number: Starting line number in the source document
        chunk_index: Position of this chunk in the generated sequence
        chunk_size: Length of the final chunk text in characters
        content_length: Length of the body content excluding headings
            (used for quality decisions)
        heading_display: Pre-formatted heading path for display
        embedding_text: Heading-enriched text for embedding, if generated
        custom: Arbitrary extra metadata merged into ``to_dict`` output
    """

    headings: list[str] = field(default_factory=list)
    heading_levels: list[int] = field(default_factory=list)
    line_number: int = 0
    chunk_index: int = 0
    chunk_size: int = 0
    content_length: int = 0
    heading_display: str = ""
    embedding_text: str = ""
    custom: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the metadata as a flat dictionary.

        Custom metadata entries are merged in at the top level, and
        ``embedding_text`` is included only when it is non-empty.
        """
        data: dict[str, Any] = {
            "headings": self.headings,
            "heading_levels": self.heading_levels,
            "line_number": self.line_number,
            "chunk_index": self.chunk_index,
            "chunk_size": self.chunk_size,
            "content_length": self.content_length,
            "heading_display": self.heading_display,
        }
        data.update(self.custom)
        # Empty embedding text means enrichment was not generated; omit the key.
        if self.embedding_text:
            data["embedding_text"] = self.embedding_text
        return data

    def get_heading_path(self, separator: str = " > ") -> str:
        """Return the heading hierarchy joined into a single string.

        Args:
            separator: String placed between consecutive headings

        Returns:
            The joined heading path (empty string when there are no headings)
        """
        return separator.join(self.headings)

90 

91 

@dataclass
class Chunk:
    """A piece of document text paired with its metadata.

    Attributes:
        text: The chunk's text content
        metadata: The associated ChunkMetadata
    """

    text: str
    metadata: ChunkMetadata

    def to_dict(self) -> dict[str, Any]:
        """Return a dictionary with the text and serialized metadata."""
        return {"text": self.text, "metadata": self.metadata.to_dict()}

    def to_markdown(self, include_headings: bool = True) -> str:
        """Render the chunk as markdown.

        Args:
            include_headings: Whether to prepend the heading hierarchy

        Returns:
            Markdown string: heading lines (when requested and available),
            a blank separator line, then the body text
        """
        # Without headings (requested or available) the body alone suffices.
        if not include_headings or not self.metadata.headings:
            return self.text

        parts = [
            f"{'#' * level} {title}"
            for title, level in zip(
                self.metadata.headings, self.metadata.heading_levels
            )
        ]
        # A blank line separates the headings from a non-empty body.
        if self.text:
            parts.append("")
            parts.append(self.text)
        return "\n".join(parts)

136 

137 

class MarkdownChunker:
    """Chunker for generating chunks from markdown tree structures.

    Traverses a Tree built from markdown and generates chunks with
    configurable size, heading inclusion, and output format.
    """

    def __init__(
        self,
        max_chunk_size: int = 1000,
        chunk_overlap: int = 100,
        heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
        chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
        combine_under_heading: bool = True,
        quality_filter: ChunkQualityConfig | None = None,
        generate_embeddings: bool = False,
    ):
        """Initialize the markdown chunker.

        Args:
            max_chunk_size: Maximum size of chunk text in characters
            chunk_overlap: Number of characters to overlap between chunks
            heading_inclusion: How to include headings in chunks
            chunk_format: Output format for chunks
            combine_under_heading: Whether to combine body text under same heading
            quality_filter: Optional config for filtering low-quality chunks
            generate_embeddings: Whether to generate heading-enriched embedding text
        """
        self.max_chunk_size = max_chunk_size
        self.chunk_overlap = chunk_overlap
        self.heading_inclusion = heading_inclusion
        self.chunk_format = chunk_format
        self.combine_under_heading = combine_under_heading
        self.generate_embeddings = generate_embeddings
        self._chunk_index = 0

        # Filtering is opt-in: only build a filter when a config is supplied.
        self._quality_filter = None
        if quality_filter is not None:
            self._quality_filter = ChunkQualityFilter(quality_filter)

    def chunk(self, tree: Tree) -> Iterator[Chunk]:
        """Generate chunks from a markdown tree.

        Args:
            tree: Tree structure built from markdown

        Yields:
            Chunk objects with text and metadata
        """
        # Restart numbering so repeated calls produce consistent indices.
        self._chunk_index = 0

        # Collect terminal (leaf) content nodes — headings and the root are
        # structural and never become chunks themselves.
        terminal_nodes = tree.collect_terminal_nodes(
            accept_node_fn=lambda n: (
                isinstance(n.data, MarkdownNode)
                and not n.data.is_heading()
                and n.data.node_type != "root"
            )
        )

        if self.combine_under_heading:
            # Group terminal nodes by their parent heading
            chunk_iter = self._chunk_by_heading(terminal_nodes)
        else:
            # Process each terminal node individually
            chunk_iter = self._chunk_individually(terminal_nodes)

        # Apply quality filter if configured
        for chunk in chunk_iter:
            if self._quality_filter is None or self._quality_filter.is_valid(chunk):
                yield chunk

    def _chunk_by_heading(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
        """Group nodes under same heading and chunk them.

        Args:
            terminal_nodes: List of terminal tree nodes

        Yields:
            Chunk objects
        """
        # Group nodes by their immediate parent heading node.
        parent_groups: dict[Tree, list[Tree]] = {}
        for node in terminal_nodes:
            parent_groups.setdefault(node.parent, []).append(node)

        for parent, nodes in parent_groups.items():
            # Heading path is shared by every node in this group.
            headings, levels = self._get_heading_path(parent)

            # Atomic constructs (e.g. code blocks, tables) must stay intact;
            # regular body text may be combined and re-split.
            atomic_nodes = [n for n in nodes if n.data.is_atomic()]
            body_nodes = [n for n in nodes if not n.data.is_atomic()]

            if body_nodes:
                combined_text = "\n".join(
                    node.data.text for node in body_nodes if node.data.text.strip()
                )

                if combined_text.strip():
                    for chunk_text in self._split_text(combined_text):
                        yield self._create_chunk(
                            text=chunk_text,
                            headings=headings,
                            heading_levels=levels,
                            # body_nodes is non-empty here, so the first
                            # node's line number is always available.
                            line_number=body_nodes[0].data.line_number,
                        )

            # Emit atomic constructs whole, even past max_chunk_size.
            for atomic_node in atomic_nodes:
                yield self._create_chunk(
                    text=atomic_node.data.text,
                    headings=headings,
                    heading_levels=levels,
                    line_number=atomic_node.data.line_number,
                    metadata=atomic_node.data.metadata,
                    node_type=atomic_node.data.node_type,
                )

    def _chunk_individually(self, terminal_nodes: list[Tree]) -> Iterator[Chunk]:
        """Process each terminal node individually.

        Args:
            terminal_nodes: List of terminal tree nodes

        Yields:
            Chunk objects
        """
        for node in terminal_nodes:
            # Skip whitespace-only nodes entirely.
            if not node.data.text.strip():
                continue

            headings, levels = self._get_heading_path(node.parent)

            if node.data.is_atomic():
                # Atomic constructs are kept whole.
                yield self._create_chunk(
                    text=node.data.text,
                    headings=headings,
                    heading_levels=levels,
                    line_number=node.data.line_number,
                    metadata=node.data.metadata,
                    node_type=node.data.node_type,
                )
            else:
                # Regular body text can be split to respect max_chunk_size.
                for chunk_text in self._split_text(node.data.text):
                    yield self._create_chunk(
                        text=chunk_text,
                        headings=headings,
                        heading_levels=levels,
                        line_number=node.data.line_number,
                    )

    def _get_heading_path(self, node: Tree | None) -> tuple[list[str], list[int]]:
        """Get the heading path from root to this node.

        Args:
            node: Tree node to get path for

        Returns:
            Tuple of (heading_texts, heading_levels), ordered root-first
        """
        headings: list[str] = []
        levels: list[int] = []

        # Walk ancestors upward, inserting at the front so the result is
        # ordered from root down to the nearest heading.
        current = node
        while current is not None:
            if isinstance(current.data, MarkdownNode):
                if current.data.is_heading():
                    headings.insert(0, current.data.text)
                    levels.insert(0, current.data.level)
            current = current.parent

        return headings, levels

    def _split_text(self, text: str) -> list[str]:
        """Split text into chunks respecting max_chunk_size.

        Prefers breaking at paragraph boundaries, then sentence ends, then
        word boundaries; consecutive chunks overlap by chunk_overlap chars.

        Args:
            text: Text to split

        Returns:
            List of non-empty, stripped text chunks
        """
        if len(text) <= self.max_chunk_size:
            return [text]

        chunks: list[str] = []
        start = 0
        text_len = len(text)  # loop-invariant, hoisted

        while start < text_len:
            end = start + self.max_chunk_size

            # If not at the end, try to break at a good boundary
            if end < text_len:
                # Try to break at paragraph boundary (double newline)
                break_pos = text.rfind("\n\n", start, end)
                if break_pos > start:
                    end = break_pos + 2
                else:
                    # Try to break at sentence boundary
                    for punct in (". ", "! ", "? ", ".\n", "!\n", "?\n"):
                        break_pos = text.rfind(punct, start, end)
                        if break_pos > start:
                            end = break_pos + len(punct)
                            break
                    else:
                        # Try to break at word boundary
                        break_pos = text.rfind(" ", start, end)
                        if break_pos > start:
                            end = break_pos + 1

            chunks.append(text[start:end].strip())

            # Step back by chunk_overlap for context continuity; the
            # max(start + 1, ...) guard guarantees forward progress.
            start = max(start + 1, end - self.chunk_overlap)

        return [c for c in chunks if c]  # Filter out empty chunks

    def _create_chunk(
        self,
        text: str,
        headings: list[str],
        heading_levels: list[int],
        line_number: int,
        metadata: dict[str, Any] | None = None,
        node_type: str = "body",
    ) -> Chunk:
        """Create a chunk with appropriate format and metadata.

        Args:
            text: Body text for chunk
            headings: List of heading texts
            heading_levels: List of heading levels
            line_number: Source line number
            metadata: Optional metadata from the source node
            node_type: Type of node ('body', 'code', 'list', 'table', etc.)

        Returns:
            Formatted Chunk object
        """
        # Record body length before headings are prepended so quality
        # decisions can ignore heading overhead.
        content_length = len(text)

        chunk_text = text

        if self.heading_inclusion in (HeadingInclusion.IN_TEXT, HeadingInclusion.BOTH):
            # Prepend the heading hierarchy to the text.
            heading_lines = []
            for heading, level in zip(headings, heading_levels):
                if self.chunk_format == ChunkFormat.MARKDOWN:
                    heading_lines.append(f"{'#' * level} {heading}")
                else:
                    heading_lines.append(heading)

            if heading_lines:
                chunk_text = "\n".join(heading_lines) + "\n\n" + text

        # Merge node type with any node-supplied metadata.
        custom_metadata: dict[str, Any] = {"node_type": node_type}
        if metadata:
            custom_metadata.update(metadata)

        heading_display = " > ".join(headings) if headings else ""

        # Optional heading-enriched text for embedding models.
        embedding_text = ""
        if self.generate_embeddings:
            embedding_text = build_enriched_text(headings, text)

        # Headings go into metadata only for the metadata-bearing modes.
        include_headings = self.heading_inclusion in (
            HeadingInclusion.IN_METADATA,
            HeadingInclusion.BOTH,
        )

        chunk_metadata = ChunkMetadata(
            headings=headings if include_headings else [],
            heading_levels=heading_levels if include_headings else [],
            line_number=line_number,
            chunk_index=self._chunk_index,
            chunk_size=len(chunk_text),
            content_length=content_length,
            heading_display=heading_display,
            embedding_text=embedding_text,
            custom=custom_metadata,
        )

        self._chunk_index += 1

        return Chunk(text=chunk_text, metadata=chunk_metadata)

440 

441 

def chunk_markdown_tree(
    tree: Tree,
    max_chunk_size: int = 1000,
    chunk_overlap: int = 100,
    heading_inclusion: HeadingInclusion = HeadingInclusion.BOTH,
    chunk_format: ChunkFormat = ChunkFormat.MARKDOWN,
    combine_under_heading: bool = True,
    quality_filter: ChunkQualityConfig | None = None,
    generate_embeddings: bool = False,
) -> list[Chunk]:
    """Generate chunks from a markdown tree in one call.

    Convenience wrapper that builds a MarkdownChunker with the given
    settings and drains its chunk iterator into a list.

    Args:
        tree: Tree structure built from markdown
        max_chunk_size: Maximum size of chunk text in characters
        chunk_overlap: Number of characters to overlap between chunks
        heading_inclusion: How to include headings in chunks
        chunk_format: Output format for chunks
        combine_under_heading: Whether to combine body text under same heading
        quality_filter: Optional config for filtering low-quality chunks
        generate_embeddings: Whether to generate heading-enriched embedding text

    Returns:
        List of Chunk objects
    """
    return list(
        MarkdownChunker(
            max_chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            heading_inclusion=heading_inclusion,
            chunk_format=chunk_format,
            combine_under_heading=combine_under_heading,
            quality_filter=quality_filter,
            generate_embeddings=generate_embeddings,
        ).chunk(tree)
    )