Coverage for src/dataknobs_xization/markdown/filters.py: 26%
86 statements
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-18 19:51 -0700
« prev ^ index » next coverage.py v7.11.3, created at 2025-11-18 19:51 -0700
1"""Quality filters for markdown chunks.
3This module provides filtering utilities to identify and remove low-quality
4chunks that would not contribute meaningful content to RAG retrieval.
5"""
7from __future__ import annotations
9import re
10from dataclasses import dataclass
11from typing import TYPE_CHECKING
13if TYPE_CHECKING:
14 from dataknobs_xization.markdown.md_chunker import Chunk
@dataclass
class ChunkQualityConfig:
    """Configuration for chunk quality filtering.

    Attributes:
        min_content_chars: Minimum characters of non-heading content.
        min_alphanumeric_ratio: Minimum ratio of alphanumeric characters to
            total characters in the non-heading content.
        skip_heading_only: Skip chunks with only headings (no body content).
        min_words: Minimum whitespace-separated word count for content.
        allow_code_blocks: Allow short code blocks that would otherwise be
            filtered.
        allow_tables: Allow short tables that would otherwise be filtered.
    """

    # Length threshold applied to the chunk text after heading lines are removed.
    min_content_chars: int = 50
    # Fraction of characters for which ``str.isalnum()`` must be true.
    min_alphanumeric_ratio: float = 0.3
    # Reject chunks whose non-heading content is empty or whitespace only.
    skip_heading_only: bool = True
    # Word threshold, counted with ``str.split()``.
    min_words: int = 5
    # When true, "code" chunks use the lenient code-block check.
    allow_code_blocks: bool = True
    # When true, "table" chunks use the lenient table check.
    allow_tables: bool = True
class ChunkQualityFilter:
    """Filter for identifying and removing low-quality chunks.

    This filter helps ensure that only meaningful content is indexed
    for RAG retrieval, reducing noise and improving retrieval quality.

    The decision logic lives in :meth:`get_rejection_reason`;
    :meth:`is_valid` and :meth:`filter_chunks` are thin wrappers around it so
    the boolean answer and the explanation can never disagree.
    """

    # A markdown heading line: one or more '#' at the start of the line,
    # followed by whitespace. Compiled once rather than per scanned line.
    _HEADING_LINE = re.compile(r"^#+\s+")

    def __init__(self, config: ChunkQualityConfig | None = None):
        """Initialize the quality filter.

        Args:
            config: Quality configuration, uses defaults if not provided
        """
        self.config = config or ChunkQualityConfig()

    def is_valid(self, chunk: Chunk) -> bool:
        """Check if a chunk meets quality thresholds.

        Args:
            chunk: The chunk to evaluate

        Returns:
            True if chunk should be kept, False if it should be filtered
        """
        # Single source of truth: a chunk is valid exactly when there is no
        # reason to reject it.
        return self.get_rejection_reason(chunk) is None

    def _extract_content_text(self, text: str) -> str:
        """Extract content text, removing markdown heading lines.

        Args:
            text: Raw chunk text

        Returns:
            The text with every heading line dropped; remaining lines are
            re-joined with newlines and otherwise left untouched.
        """
        return "\n".join(
            line for line in text.split("\n") if not self._HEADING_LINE.match(line)
        )

    def _meets_alphanumeric_threshold(self, text: str) -> bool:
        """Check if text meets minimum alphanumeric ratio.

        Args:
            text: Text to check

        Returns:
            True if the share of alphanumeric characters (over all
            characters, whitespace included) reaches the configured minimum.
            Empty text never meets the threshold.
        """
        if not text:
            # Also guards the division below against a zero length.
            return False

        alphanumeric_count = sum(1 for char in text if char.isalnum())
        return alphanumeric_count / len(text) >= self.config.min_alphanumeric_ratio

    def _meets_word_count(self, text: str) -> bool:
        """Check if text meets minimum word count.

        Args:
            text: Text to check

        Returns:
            True if the number of whitespace-separated words reaches the
            configured minimum
        """
        return len(text.split()) >= self.config.min_words

    def _is_valid_code_block(self, chunk: Chunk) -> bool:
        """Check if a code block chunk is valid.

        Code blocks are given more lenient filtering since they may be
        short but still valuable (e.g., single function definitions).

        Args:
            chunk: Code block chunk

        Returns:
            True if code block should be kept
        """
        # Non-empty text after stripping implies at least one non-blank line,
        # which is the only requirement for a code block.
        return bool(chunk.text.strip())

    def _is_valid_table(self, chunk: Chunk) -> bool:
        """Check if a table chunk is valid.

        Tables are given more lenient filtering since they may be
        compact but information-rich.

        Args:
            chunk: Table chunk

        Returns:
            True if table should be kept
        """
        # Tables should have at least a header row and one more row; blank
        # lines do not count. Empty text yields zero rows and is rejected.
        rows = [line for line in chunk.text.strip().split("\n") if line.strip()]
        return len(rows) >= 2

    def filter_chunks(self, chunks: list[Chunk]) -> list[Chunk]:
        """Filter a list of chunks, keeping only valid ones.

        Args:
            chunks: List of chunks to filter

        Returns:
            New list, in the original order, of the chunks that pass the
            quality thresholds
        """
        return [chunk for chunk in chunks if self.is_valid(chunk)]

    def get_rejection_reason(self, chunk: Chunk) -> str | None:
        """Get the reason a chunk would be rejected.

        Useful for debugging and understanding filtering behavior. Checks
        run in a fixed order and the first failing one is reported.

        Args:
            chunk: The chunk to evaluate

        Returns:
            Rejection reason string, or None if chunk is valid
        """
        config = self.config

        # Node type comes from custom metadata; chunks without one are
        # treated as ordinary body text.
        node_type = chunk.metadata.custom.get("node_type", "body")

        # Code blocks and tables get lenient, type-specific checks and skip
        # the text thresholds entirely when the config allows them.
        if node_type == "code" and config.allow_code_blocks:
            return None if self._is_valid_code_block(chunk) else "Empty code block"

        if node_type == "table" and config.allow_tables:
            return None if self._is_valid_table(chunk) else "Empty or single-row table"

        # All remaining checks look at the text with heading lines removed.
        content = self._extract_content_text(chunk.text)

        if config.skip_heading_only and not content.strip():
            return "Heading-only chunk (no body content)"

        if len(content) < config.min_content_chars:
            return f"Content too short ({len(content)} < {config.min_content_chars} chars)"

        if not self._meets_alphanumeric_threshold(content):
            return f"Alphanumeric ratio below threshold ({config.min_alphanumeric_ratio})"

        if not self._meets_word_count(content):
            words = len(content.split())
            return f"Word count too low ({words} < {config.min_words} words)"

        return None