Coverage for src / dataknobs_xization / markdown / enrichment.py: 43%

61 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-26 16:15 -0700

1"""Heading enrichment utilities for RAG-optimized chunk embeddings. 

2 

3This module provides utilities to enrich chunk content with heading context 

4for improved semantic search, while keeping headings out of the displayed content. 

5""" 

6 

7from __future__ import annotations 

8 

9from dataclasses import dataclass 

10from typing import Any 

11 

12 

def is_multiword(heading: str) -> bool:
    """Report whether a heading is made up of two or more words.

    Words are the whitespace-delimited tokens produced by ``str.split()``,
    so hyphenated text such as ``"Chain-of-Thought"`` counts as one word.

    Args:
        heading: The heading text to inspect

    Returns:
        True when the heading splits into at least two tokens
    """
    words = heading.split()
    return len(words) >= 2

23 

24 

def format_heading_display(
    heading_path: list[str],
    separator: str = " > ",
) -> str:
    """Join a heading path into a single display string.

    Args:
        heading_path: Headings ordered from the document root down to the chunk
        separator: Text placed between consecutive headings

    Returns:
        The joined headings, or an empty string when the path is empty
    """
    return separator.join(heading_path) if heading_path else ""

41 

42 

def get_dynamic_heading_display(
    heading_path: list[str],
    content_length: int,
    small_threshold: int = 200,
    medium_threshold: int = 800,
) -> str:
    """Choose how much heading context to display for a chunk of a given size.

    The amount of heading context shrinks as the chunk grows. Both
    thresholds are inclusive upper bounds:
        - content_length <= small_threshold: the full heading path
        - content_length <= medium_threshold: the last two headings
        - anything longer: no headings

    Args:
        heading_path: Headings ordered from the document root down to the chunk
        content_length: Size of the chunk content in characters
        small_threshold: Largest length still treated as a "small" chunk
        medium_threshold: Largest length still treated as a "medium" chunk

    Returns:
        The selected headings formatted by ``format_heading_display``, or an
        empty string when there are no headings or the chunk is large
    """
    if not heading_path:
        return ""

    if content_length <= small_threshold:
        # Small chunk: show every level of the path
        shown = heading_path
    elif content_length <= medium_threshold:
        # Medium chunk: a two-element tail (the whole path if it is shorter)
        shown = heading_path[-2:]
    else:
        # Large chunk: headings are left out entirely
        return ""

    return format_heading_display(shown)

78 

79 

@dataclass
class EnrichedChunkData:
    """Container bundling a chunk's text with its heading context.

    Attributes:
        content: The chunk text itself, without heading text
        embedding_text: Heading-enriched text intended for embedding
        heading_path: Headings ordered from the document root to the chunk
        heading_display: The heading path rendered as one display string
        content_length: Character count of ``content``
    """

    # Text fields
    content: str
    embedding_text: str

    # Heading context
    heading_path: list[str]
    heading_display: str

    # Size of `content` in characters
    content_length: int

97 

98 

def build_enriched_text(heading_path: list[str], content: str) -> str:
    """Build text for embedding with relevant heading context.

    Headings are collected from the deepest one upward, stopping after the
    first multi-word heading has been included. If no heading is multi-word,
    the entire path is used. This provides semantic context without
    over-weighting deep, single-word labels like "Example".

    A heading is multi-word when ``str.split()`` yields more than one token,
    so hyphenated headings such as "Chain-of-Thought" count as a single word.

    Args:
        heading_path: List of heading texts from root to chunk
        content: The chunk content text

    Returns:
        The selected headings and the content joined with ": ", or the
        content unchanged when there are no headings

    Examples:
        >>> build_enriched_text(["Prompt Patterns", "Reasoning", "Example"], "code here")
        'Prompt Patterns: Reasoning: Example: code here'

        >>> build_enriched_text(["Patterns", "Chain-of-Thought", "Example"], "code here")
        'Patterns: Chain-of-Thought: Example: code here'

        >>> build_enriched_text(["Setup"], "install steps")
        'Setup: install steps'

        >>> build_enriched_text(["API Reference", "Authentication", "OAuth 2.0"], "...")
        'OAuth 2.0: ...'

        >>> build_enriched_text([], "standalone content")
        'standalone content'
    """
    if not heading_path:
        return content

    # Scan from the deepest heading upward for the nearest multi-word one.
    # Defaulting to 0 keeps the whole path when every heading is one word.
    start = 0
    for index in range(len(heading_path) - 1, -1, -1):
        if len(heading_path[index].split()) > 1:
            start = index
            break

    prefix = ": ".join(heading_path[start:])
    return f"{prefix}: {content}"

143 

144 

def extract_heading_metadata(
    headings: list[str],
    heading_levels: list[int],
    separator: str = " > ",
) -> dict[str, Any]:
    """Package heading information into a metadata dictionary.

    Args:
        headings: Heading texts ordered from the document root to the chunk
        heading_levels: Level of each heading, parallel to ``headings``
        separator: Text placed between headings in the display string

    Returns:
        Dictionary with the keys ``heading_path``, ``heading_levels``,
        ``heading_display`` and ``heading_depth``. The two list values are
        the objects passed in, not copies.
    """
    if headings:
        display = separator.join(headings)
    else:
        display = ""

    metadata: dict[str, Any] = {}
    metadata["heading_path"] = headings
    metadata["heading_levels"] = heading_levels
    metadata["heading_display"] = display
    metadata["heading_depth"] = len(headings)
    return metadata

166 

167 

def get_relevant_headings_for_display(
    heading_path: list[str],
    content_length: int,
    small_threshold: int = 200,
    medium_threshold: int = 800,
) -> list[str]:
    """Select which headings accompany a chunk, based on its size.

    Less heading context is returned as the chunk grows. Both thresholds
    are exclusive upper bounds:
        - content_length < small_threshold: the full heading path
        - content_length < medium_threshold: the last two headings
        - otherwise: no headings

    Args:
        heading_path: Heading texts ordered from the document root to the chunk
        content_length: Size of the chunk content in characters
        small_threshold: Length below which a chunk is "small"
        medium_threshold: Length below which a chunk is "medium"

    Returns:
        The headings to display. When the whole path is selected, the
        ``heading_path`` object itself is returned rather than a copy.
    """
    if not heading_path:
        return []

    if content_length < small_threshold:
        # Small chunk: keep the whole path
        return heading_path

    if content_length < medium_threshold:
        # Medium chunk: at most the two deepest headings
        return heading_path if len(heading_path) <= 2 else heading_path[-2:]

    # Large chunk: no headings
    return []

202 

203 

def format_heading_for_display(
    headings: list[str],
    heading_levels: list[int] | None = None,
    format_style: str = "markdown",
) -> str:
    """Render headings as text for inclusion in LLM context.

    Args:
        headings: Heading texts to render
        heading_levels: Level of each heading; only used by the markdown style
        format_style: "markdown" renders one ``#``-prefixed line per heading;
            "path" joins the headings with " > "

    Returns:
        The rendered headings, or an empty string when ``headings`` is empty.
        The markdown style is applied only when ``heading_levels`` is
        non-empty; headings and levels are paired with ``zip``, so headings
        beyond the length of ``heading_levels`` are not emitted. Every other
        combination yields the " > " joined path.
    """
    if not headings:
        return ""

    if format_style == "markdown" and heading_levels:
        # One markdown heading line per (heading, level) pair
        return "\n".join(
            f"{'#' * level} {heading}"
            for heading, level in zip(headings, heading_levels)
        )

    # "path" style, markdown without levels, and unrecognized styles
    return " > ".join(headings)

233 

234 

def enrich_chunk(
    content: str,
    headings: list[str],
    heading_levels: list[int],
) -> EnrichedChunkData:
    """Assemble an ``EnrichedChunkData`` from a chunk's raw parts.

    Convenience wrapper that computes the embedding text, the display
    string and the content length in one call.

    Args:
        content: Raw chunk content text
        headings: Heading texts ordered from the document root to the chunk
        heading_levels: Level of each heading; accepted but not read by
            this function

    Returns:
        EnrichedChunkData whose ``embedding_text`` comes from
        ``build_enriched_text`` and whose ``heading_display`` is the
        headings joined with " > "
    """
    enriched = build_enriched_text(headings, content)
    display_path = " > ".join(headings) if headings else ""

    return EnrichedChunkData(
        content=content,
        embedding_text=enriched,
        heading_path=headings,
        heading_display=display_path,
        content_length=len(content),
    )