Coverage for src/dataknobs_xization/markdown/filters.py: 26%

86 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-26 15:46 -0700

1"""Quality filters for markdown chunks. 

2 

3This module provides filtering utilities to identify and remove low-quality 

4chunks that would not contribute meaningful content to RAG retrieval. 

5""" 

6 

7from __future__ import annotations 

8 

9import re 

10from dataclasses import dataclass 

11from typing import TYPE_CHECKING 

12 

13if TYPE_CHECKING: 

14 from dataknobs_xization.markdown.md_chunker import Chunk 

15 

16 

@dataclass
class ChunkQualityConfig:
    """Configuration for chunk quality filtering.

    Attributes:
        min_content_chars: Minimum characters of non-heading content
        min_alphanumeric_ratio: Minimum ratio of alphanumeric to total chars
        skip_heading_only: Skip chunks with only headings (no body content)
        min_words: Minimum word count for content
        allow_code_blocks: Allow short code blocks that would otherwise be filtered
        allow_tables: Allow short tables that would otherwise be filtered
    """

    # Length threshold applied to the chunk text after heading lines are removed.
    min_content_chars: int = 50
    # Fraction of characters for which ``str.isalnum()`` must be true.
    min_alphanumeric_ratio: float = 0.3
    # When true, chunks whose non-heading text is blank are rejected.
    skip_heading_only: bool = True
    # Minimum number of whitespace-separated tokens in the non-heading text.
    min_words: int = 5
    # When true, "code" chunks use the lenient code-block check.
    allow_code_blocks: bool = True
    # When true, "table" chunks use the lenient table check.
    allow_tables: bool = True

36 

37 

class ChunkQualityFilter:
    """Filter for identifying and removing low-quality chunks.

    This filter helps ensure that only meaningful content is indexed
    for RAG retrieval, reducing noise and improving retrieval quality.

    ``get_rejection_reason`` holds the single copy of the decision logic;
    ``is_valid`` and ``filter_chunks`` are derived from it so the boolean
    verdict and the reported reason can never disagree.
    """

    # A line counts as a markdown heading when it starts with one or more
    # ``#`` characters followed by whitespace. Compiled once, not per line.
    _HEADING_LINE = re.compile(r"^#+\s+")

    def __init__(self, config: ChunkQualityConfig | None = None):
        """Initialize the quality filter.

        Args:
            config: Quality configuration, uses defaults if not provided
        """
        self.config = config if config is not None else ChunkQualityConfig()

    def is_valid(self, chunk: Chunk) -> bool:
        """Check if a chunk meets quality thresholds.

        Args:
            chunk: The chunk to evaluate

        Returns:
            True if chunk should be kept, False if it should be filtered
        """
        # A chunk is valid exactly when no rejection reason applies.
        return self.get_rejection_reason(chunk) is None

    def _extract_content_text(self, text: str) -> str:
        """Extract content text, removing markdown heading lines.

        Args:
            text: Raw chunk text

        Returns:
            The text with every heading line dropped; the remaining lines
            are re-joined with ``"\\n"`` and otherwise left untouched.
        """
        is_heading = self._HEADING_LINE.match
        return "\n".join(
            line for line in text.split("\n") if not is_heading(line)
        )

    def _meets_alphanumeric_threshold(self, text: str) -> bool:
        """Check if text meets minimum alphanumeric ratio.

        Args:
            text: Text to check

        Returns:
            True if the share of alphanumeric characters is at least
            ``config.min_alphanumeric_ratio``. Empty text has no ratio; it
            passes only when the threshold is disabled (``<= 0``).
        """
        threshold = self.config.min_alphanumeric_ratio
        if not text:
            # Avoid dividing by zero, and do not let a disabled check
            # reject a chunk just because it has no body text.
            return threshold <= 0

        alphanumeric_count = sum(1 for char in text if char.isalnum())
        return alphanumeric_count / len(text) >= threshold

    def _meets_word_count(self, text: str) -> bool:
        """Check if text meets minimum word count.

        Args:
            text: Text to check

        Returns:
            True if the number of whitespace-separated tokens is at least
            ``config.min_words``
        """
        return len(text.split()) >= self.config.min_words

    def _is_valid_code_block(self, chunk: Chunk) -> bool:
        """Check if a code block chunk is valid.

        Code blocks are given more lenient filtering since they may be
        short but still valuable (e.g., single function definitions).

        Args:
            chunk: Code block chunk

        Returns:
            True if code block should be kept
        """
        # Any non-whitespace text implies at least one non-blank line,
        # which is the only requirement for code blocks.
        return bool(chunk.text.strip())

    def _is_valid_table(self, chunk: Chunk) -> bool:
        """Check if a table chunk is valid.

        Tables are given more lenient filtering since they may be
        compact but information-rich.

        Args:
            chunk: Table chunk

        Returns:
            True if table should be kept
        """
        # Tables should have at least a header row and one more row.
        non_blank_rows = sum(
            1 for line in chunk.text.split("\n") if line.strip()
        )
        return non_blank_rows >= 2

    def filter_chunks(self, chunks: list[Chunk]) -> list[Chunk]:
        """Filter a list of chunks, keeping only valid ones.

        Args:
            chunks: List of chunks to filter

        Returns:
            List of chunks that pass quality thresholds, in input order
        """
        return [chunk for chunk in chunks if self.is_valid(chunk)]

    def get_rejection_reason(self, chunk: Chunk) -> str | None:
        """Get the reason a chunk would be rejected.

        Useful for debugging and understanding filtering behavior.
        Checks run in a fixed order and the first failing one is reported.

        Args:
            chunk: The chunk to evaluate

        Returns:
            Rejection reason string, or None if chunk is valid
        """
        config = self.config
        # Chunks without an explicit node type are treated as body text.
        node_type = chunk.metadata.custom.get("node_type", "body")

        # Code blocks and tables bypass the text thresholds when allowed.
        if node_type == "code" and config.allow_code_blocks:
            if not self._is_valid_code_block(chunk):
                return "Empty code block"
            return None

        if node_type == "table" and config.allow_tables:
            if not self._is_valid_table(chunk):
                return "Empty or single-row table"
            return None

        content = self._extract_content_text(chunk.text)

        if config.skip_heading_only and not content.strip():
            return "Heading-only chunk (no body content)"

        if len(content) < config.min_content_chars:
            return f"Content too short ({len(content)} < {config.min_content_chars} chars)"

        if not self._meets_alphanumeric_threshold(content):
            return f"Alphanumeric ratio below threshold ({config.min_alphanumeric_ratio})"

        if not self._meets_word_count(content):
            words = len(content.split())
            return f"Word count too low ({words} < {config.min_words} words)"

        return None