Coverage for mcp_bridge/tools/query_classifier.py: 96%
84 statements
« prev ^ index » next coverage.py v7.10.1, created at 2026-01-10 00:20 -0500
"""Query classifier for intelligent search routing.

This module provides a fast, regex-based system that categorizes search queries
into four types: PATTERN (exact text matching), STRUCTURAL (AST-aware code structure),
SEMANTIC (conceptual/behavioral), and HYBRID (multi-modal).

It enables intelligent routing to the optimal search tool without LLM overhead.

Design Goals:
- Fast: <10ms classification per query
- No LLM calls: Pure regex-based detection (no API overhead)
- Confidence scoring: Return a confidence score (0.0-1.0) for the chosen category
- Fallback safe: Default to HYBRID when ambiguous
- Extensible: Easy to add new patterns/indicators
"""
import logging
import re
from dataclasses import dataclass
from enum import Enum
from typing import Literal
# Module-level logger, named after this module so it can be configured
# through the standard logging hierarchy.
logger = logging.getLogger(__name__)
class QueryCategory(Enum):
    """Query classification categories.

    Each member's value is the lowercase string form of its name.
    """

    SEMANTIC = "semantic"  # Conceptual, "what it does" queries
    PATTERN = "pattern"  # Exact text/regex matching
    STRUCTURAL = "structural"  # AST-aware code structure queries
    HYBRID = "hybrid"  # Multi-modal search recommended
@dataclass
class QueryClassification:
    """Result of query classification.

    Attributes:
        category: The classified query category (SEMANTIC, PATTERN, STRUCTURAL, HYBRID)
        confidence: Confidence score from 0.0 (low) to 1.0 (high)
        indicators: List of matched patterns/reasons that led to this classification
        suggested_tool: The recommended search tool to use
            - "grep_search" for PATTERN queries
            - "ast_grep_search" for STRUCTURAL queries
            - "semantic_search" for SEMANTIC queries
            - "enhanced_search" for HYBRID queries
        reasoning: Human-readable explanation of the classification
    """

    category: QueryCategory
    confidence: float  # 0.0-1.0
    indicators: list[str]  # Matched patterns/reasons
    suggested_tool: Literal[
        "semantic_search", "grep_search", "ast_grep_search", "enhanced_search"
    ]
    reasoning: str  # Human-readable explanation
# Phase 1: Exact Pattern Detection (High Confidence)
# Triggered when query contains quoted strings, exact identifiers with code syntax,
# file paths, regular expressions, or known constant patterns.
PATTERN_INDICATORS = [
    # Quoted identifiers like "authenticate" or 'API_KEY'. Only word characters
    # are allowed between the quotes, so a quoted "authenticate()" does not match.
    r'["\'][\w_]+["\']',
    r'\b\w+\(\)',  # Function calls with () like authenticate()
    r'[\w_]+\.[\w_]+',  # Dot notation (Class.method) like database.query
    r'[\w/]+\.\w{2,4}$',  # File paths with extension (anchored to the end of the query)
    r'/.*?/',  # Regex patterns written between slashes
    # CONSTANT_NAMES (4+ uppercase chars or underscores). Case-sensitive: it can
    # only match text whose original letter case has been preserved.
    r'\b[A-Z_]{4,}\b',
]
# Phase 2: Structural Detection (High Confidence)
# Triggered when query contains AST keywords, structural relationships,
# or code structure terms.
# NOTE: the \b anchors require whole words, so inflected forms such as
# "inheriting" or "definitions" do not match these patterns.
STRUCTURAL_INDICATORS = [
    r'\b(class|function|method|async|interface)\b',  # AST keywords
    r'\b(inherits?|extends?|implements?|overrides?)\b',  # Structural relationships
    r'\b(decorated?)\s+(with|by)\b',  # Decorator patterns
    r'\@\w+',  # Decorator syntax
    r'\b(definition|declaration|signature)\b',  # Code structure terms
]
# Phase 3: Conceptual Detection (Medium-High Confidence)
# Triggered when query contains intent verbs, how/why/where questions,
# design patterns, conceptual nouns, or cross-cutting concerns.
SEMANTIC_INDICATORS = [
    r'\b(how|why|where)\s+(does|is|are)',  # How/why/where questions
    r'\b(handles?|manages?|processes?|validates?|transforms?)\b',  # Intent verbs
    r'\b(logic|mechanism|strategy|approach|workflow|implementation)\b',  # Conceptual nouns
    r'\b(pattern|anti-pattern)\b',  # Design patterns
    r'\b(authentication|authorization|caching|logging|error handling)\b',  # Cross-cutting
    r'\bfind\s+(all\s+)?(code|places|instances|implementations)\s+that\b',  # Find code pattern
]
# Phase 4: Hybrid Detection (Medium Confidence)
# Triggered when query contains multiple concepts, both exact + conceptual,
# broad scopes, or vague qualifiers.
HYBRID_INDICATORS = [
    r'\s+(and|then|also|plus|with)\s+',  # Conjunctions (must be surrounded by whitespace)
    r'\b(across|throughout|in all|system-wide)\b',  # Broad scopes
    r'\b(similar|related|like|kind of|type of)\b',  # Vague qualifiers
    r'\b(all|every|any)\s+\w+\s+(that|which|where)\b',  # Broad quantifiers
]
# Tool routing based on category: maps each QueryCategory to the name of the
# search tool recommended for queries of that category.
TOOL_ROUTING = {
    QueryCategory.PATTERN: "grep_search",
    QueryCategory.STRUCTURAL: "ast_grep_search",
    QueryCategory.SEMANTIC: "semantic_search",
    QueryCategory.HYBRID: "enhanced_search",
}
# Human-readable explanation reported for each winning category.
_CATEGORY_REASONING = {
    QueryCategory.PATTERN: "Query contains exact identifiers or code syntax",
    QueryCategory.STRUCTURAL: "Query requires AST-level understanding of code structure",
    QueryCategory.SEMANTIC: "Query asks about conceptual logic or behavior",
    QueryCategory.HYBRID: "Query combines multiple search approaches or is ambiguous",
}

# Scores are sums of 0.15 / 0.20 / 0.10 steps, so genuinely different scores
# are at least ~0.05 apart; anything closer than this is a floating-point
# artefact (e.g. 4 * 0.15 vs 3 * 0.20) and must count as a tie.
_SCORE_TIE_TOLERANCE = 1e-9


def _match_indicators(patterns: list[str], text: str) -> list[str]:
    """Return the regexes from *patterns* that match anywhere in *text*."""
    return [pattern for pattern in patterns if re.search(pattern, text)]


def _hybrid_default(indicators: list[str], reasoning: str) -> QueryClassification:
    """Build the safe HYBRID / enhanced_search result with 0.5 confidence."""
    return QueryClassification(
        category=QueryCategory.HYBRID,
        confidence=0.5,
        indicators=indicators,
        suggested_tool="enhanced_search",
        reasoning=reasoning,
    )


def _log_classification(query_text: str, result: QueryClassification) -> None:
    """Emit the debug log line describing a finished classification."""
    logger.debug(
        "QUERY-CLASSIFY: query='%s...' category=%s confidence=%.2f tool=%s",
        query_text[:50],
        result.category.value,
        result.confidence,
        result.suggested_tool,
    )


def classify_query(query: str) -> QueryClassification:
    """Classify a search query into one of four categories.

    This function analyzes a search query using regex-based pattern matching
    to determine its type (PATTERN, STRUCTURAL, SEMANTIC, or HYBRID) and
    recommends the most appropriate search tool.

    The classification process has four detection phases followed by scoring:
        1. Pattern Detection: Looks for exact identifiers, quoted strings, file
           paths and CONSTANT_NAMES. Runs on the case-preserved query because
           the constant indicator is case-sensitive.
        2. Structural Detection: Looks for AST keywords (class, function, etc.)
        3. Conceptual Detection: Looks for intent verbs and semantic concepts
        4. Hybrid Detection: Looks for conjunctions and broad scopes

    Phases 2-4 run on the lowercased query. Each match adds to its category's
    score (pattern 0.15, structural 0.20, semantic 0.15, hybrid 0.10). The
    highest-scoring category wins; a tie for the highest score yields HYBRID,
    and a query with no matches at all falls back to HYBRID with 0.5 confidence.

    Args:
        query: Natural language search query (e.g., "Find authenticate()" or
            "Where is authentication handled?")

    Returns:
        QueryClassification object containing:
            - category: One of SEMANTIC, PATTERN, STRUCTURAL, HYBRID
            - confidence: The winning score, capped at 0.95 (never 1.0); 0.5
              for every fallback result
            - indicators: Names of the indicator groups that matched
              ("pattern_match", "structural_match", "semantic_match",
              "hybrid_match"), or a single fallback marker ("invalid_input",
              "too_short", "error")
            - suggested_tool: Recommended tool (grep_search, ast_grep_search,
              semantic_search, or enhanced_search)
            - reasoning: Human-readable explanation

        The function does not raise: any unexpected exception is logged and
        converted into the HYBRID fallback result.

    Examples:
        >>> result = classify_query("Find all calls to authenticate()")
        >>> result.category
        <QueryCategory.PATTERN: 'pattern'>
        >>> result.confidence
        0.15
        >>> result.suggested_tool
        'grep_search'

        >>> result = classify_query("Where is authentication handled?")
        >>> result.category
        <QueryCategory.SEMANTIC: 'semantic'>
        >>> result.confidence
        0.3
        >>> result.suggested_tool
        'semantic_search'

        >>> result = classify_query("Find class definitions inheriting from Base")
        >>> result.category
        <QueryCategory.STRUCTURAL: 'structural'>
        >>> result.confidence
        0.2
        >>> result.suggested_tool
        'ast_grep_search'

    Performance:
        - Target: <10ms per classification
        - Uses only pure Python stdlib (re module)
        - No external dependencies or API calls
    """
    try:
        # Input validation. The type check comes first so that objects with
        # unusual truthiness are rejected here instead of raising.
        if not isinstance(query, str) or not query:
            return _hybrid_default(
                ["invalid_input"], "Invalid or empty query, using safe default"
            )

        # Normalize query
        query_normalized = query.strip()
        if len(query_normalized) < 3:
            return _hybrid_default(
                ["too_short"], "Query too short for accurate classification"
            )

        query_lower = query_normalized.lower()

        # Phase 1 uses the case-preserved text: the CONSTANT_NAMES indicator
        # only matches uppercase letters and could never fire on lowercased
        # input. The other pattern indicators are case-agnostic.
        pattern_matches = _match_indicators(PATTERN_INDICATORS, query_normalized)
        # Phases 2-4 use lowercase keywords, so they run on the lowercased text.
        structural_matches = _match_indicators(STRUCTURAL_INDICATORS, query_lower)
        semantic_matches = _match_indicators(SEMANTIC_INDICATORS, query_lower)
        hybrid_matches = _match_indicators(HYBRID_INDICATORS, query_lower)

        # Confidence scoring: each match contributes its category's weight.
        scores = {
            QueryCategory.PATTERN: len(pattern_matches) * 0.15,
            QueryCategory.STRUCTURAL: len(structural_matches) * 0.20,
            QueryCategory.SEMANTIC: len(semantic_matches) * 0.15,
            QueryCategory.HYBRID: len(hybrid_matches) * 0.10,
        }
        max_score = max(scores.values())

        # Fallback to HYBRID if nothing matched at all.
        if max_score == 0:
            result = _hybrid_default(
                [], "No clear indicators found, using multi-modal search"
            )
            _log_classification(query_normalized, result)
            return result

        # Categories sharing the top score. Compared with a tolerance because
        # equal scores built from different weights can differ in the last bit.
        winners = [
            category
            for category, score in scores.items()
            if abs(score - max_score) < _SCORE_TIE_TOLERANCE
        ]
        # A tie means the query is ambiguous, so route it to HYBRID.
        category = winners[0] if len(winners) == 1 else QueryCategory.HYBRID
        confidence = min(max_score, 0.95)

        # Report which indicator groups fired, independent of the winner.
        matches_by_label = (
            ("pattern_match", pattern_matches),
            ("structural_match", structural_matches),
            ("semantic_match", semantic_matches),
            ("hybrid_match", hybrid_matches),
        )
        all_indicators = [label for label, matches in matches_by_label if matches]

        result = QueryClassification(
            category=category,
            confidence=confidence,
            indicators=all_indicators,
            suggested_tool=TOOL_ROUTING[category],
            reasoning=_CATEGORY_REASONING[category],
        )

        # Log classification for analytics
        _log_classification(query_normalized, result)
        return result

    except Exception as e:
        # Deliberate catch-all: classification is best-effort routing, so any
        # failure is logged with its traceback and degrades to the safe default.
        logger.exception("Error classifying query: %s", e)
        return _hybrid_default(
            ["error"], f"Classification error: {str(e)}, using safe default"
        )