Coverage for mcp_bridge/tools/query_classifier.py: 96%

84 statements  

« prev     ^ index     » next       coverage.py v7.10.1, created at 2026-01-10 00:20 -0500

1"""Query classifier for intelligent search routing. 

2 

3This module provides a fast, regex-based system that categorizes search queries 

4into four types: PATTERN (exact text matching), STRUCTURAL (AST-aware code structure), 

5SEMANTIC (conceptual/behavioral), and HYBRID (multi-modal). 

6 

7It enables intelligent routing to the optimal search tool without LLM overhead. 

8 

9Design Goals: 

10- Fast: <10ms classification per query 

11- No LLM calls: Pure regex-based detection (no API overhead) 

12- Confidence scoring: Return a confidence score (0.0-1.0) for the selected category 

13- Fallback safe: Default to HYBRID when ambiguous 

14- Extensible: Easy to add new patterns/indicators 

15""" 

16 

17import logging 

18import re 

19from dataclasses import dataclass 

20from enum import Enum 

21from typing import Literal 

22 

23# Module-level logger 

24logger = logging.getLogger(__name__) 

25 

26 

class QueryCategory(Enum):
    """Kinds of search query the classifier can distinguish."""

    # Conceptual queries about what the code does.
    SEMANTIC = "semantic"

    # Exact text or regex lookups.
    PATTERN = "pattern"

    # Queries about code structure that need AST awareness.
    STRUCTURAL = "structural"

    # Queries for which a multi-modal search is recommended.
    HYBRID = "hybrid"

34 

35 

@dataclass
class QueryClassification:
    """Outcome of classifying a single search query.

    Bundles the chosen category with the evidence behind the choice and the
    search tool that should handle the query.

    Attributes:
        category: The category assigned to the query (SEMANTIC, PATTERN,
            STRUCTURAL or HYBRID).
        confidence: Score between 0.0 (low) and 1.0 (high).
        indicators: Matched patterns/reasons that produced the result.
        suggested_tool: Name of the search tool to route to; one of
            "grep_search" (PATTERN), "ast_grep_search" (STRUCTURAL),
            "semantic_search" (SEMANTIC) or "enhanced_search" (HYBRID).
        reasoning: Short explanation of the decision, meant for humans.
    """

    category: QueryCategory
    confidence: float
    indicators: list[str]
    suggested_tool: Literal[
        "semantic_search",
        "grep_search",
        "ast_grep_search",
        "enhanced_search",
    ]
    reasoning: str

59 

60 

# Phase 1 -- exact pattern detection (high confidence).
# These regexes fire on literal-looking text: quoted words, call syntax,
# dotted names, file paths, slash-delimited regexes and upper-case constants.
PATTERN_INDICATORS = [
    # A quoted word such as "authenticate" or 'API_KEY'
    r'["\'][\w_]+["\']',
    # A name directly followed by empty parentheses, e.g. authenticate()
    r'\b\w+\(\)',
    # Dot notation, e.g. Class.method or database.query
    r'[\w_]+\.[\w_]+',
    # A file path ending in a 2-4 character extension at the end of the text
    r'[\w/]+\.\w{2,4}$',
    # Text enclosed in a pair of slashes (regex literal)
    r'/.*?/',
    # CONSTANT_NAMES: four or more upper-case letters/underscores
    r'\b[A-Z_]{4,}\b',
]

72 

# Phase 2 -- structural detection (high confidence).
# These regexes fire on AST keywords, structural relationships between
# definitions, decorators and code-structure vocabulary.
STRUCTURAL_INDICATORS = [
    # AST keywords
    r'\b(class|function|method|async|interface)\b',
    # Structural relationships (inherit/extend/implement/override)
    r'\b(inherits?|extends?|implements?|overrides?)\b',
    # "decorate(d) with" / "decorate(d) by"
    r'\b(decorated?)\s+(with|by)\b',
    # Decorator syntax such as @property
    r'\@\w+',
    # Code structure terms
    r'\b(definition|declaration|signature)\b',
]

83 

# Phase 3 -- conceptual detection (medium-high confidence).
# These regexes fire on how/why/where questions, intent verbs, conceptual
# nouns, design-pattern wording and cross-cutting concerns.
SEMANTIC_INDICATORS = [
    # How/why/where questions ("how does", "where is", ...)
    r'\b(how|why|where)\s+(does|is|are)',
    # Intent verbs
    r'\b(handles?|manages?|processes?|validates?|transforms?)\b',
    # Conceptual nouns
    r'\b(logic|mechanism|strategy|approach|workflow|implementation)\b',
    # Design patterns
    r'\b(pattern|anti-pattern)\b',
    # Cross-cutting concerns
    r'\b(authentication|authorization|caching|logging|error handling)\b',
    # "find (all) code/places/instances/implementations that ..."
    r'\bfind\s+(all\s+)?(code|places|instances|implementations)\s+that\b',
]

95 

# Phase 4 -- hybrid detection (medium confidence).
# These regexes fire on queries that join several concepts, span a broad
# scope, or use vague qualifiers.
HYBRID_INDICATORS = [
    # Conjunctions surrounded by whitespace
    r'\s+(and|then|also|plus|with)\s+',
    # Broad scopes
    r'\b(across|throughout|in all|system-wide)\b',
    # Vague qualifiers
    r'\b(similar|related|like|kind of|type of)\b',
    # Broad quantifiers ("all X that", "every X which", ...)
    r'\b(all|every|any)\s+\w+\s+(that|which|where)\b',
]

105 

# Maps each query category to the name of the search tool that serves it.
TOOL_ROUTING = {
    # Exact text -> grep
    QueryCategory.PATTERN: "grep_search",
    # Code structure -> AST-aware grep
    QueryCategory.STRUCTURAL: "ast_grep_search",
    # Concepts/behavior -> semantic search
    QueryCategory.SEMANTIC: "semantic_search",
    # Mixed or ambiguous -> multi-modal search
    QueryCategory.HYBRID: "enhanced_search",
}

113 

114 

def classify_query(query: str) -> QueryClassification:
    """Classify a search query and recommend a search tool for it.

    The query is scanned with four groups of regex indicators, each group
    voting for one category:

    1. Pattern detection: exact identifiers, quoted strings, file paths,
       upper-case constants. Scanned against the case-preserved query so
       that case-sensitive indicators (e.g. ``MAX_RETRIES``) can match.
    2. Structural detection: AST keywords (class, function, etc.).
    3. Conceptual detection: intent verbs and semantic concepts.
    4. Hybrid detection: conjunctions and broad scopes.

    Groups 2-4 are scanned against the lowercased query. Every indicator
    that matches adds points to its category (pattern 0.15, structural
    0.20, semantic 0.15, hybrid 0.10). The highest-scoring category wins;
    a tie between categories resolves to HYBRID. If nothing matches, the
    result is HYBRID with confidence 0.5.

    Args:
        query: Natural language search query (e.g., "Find authenticate()" or
            "Where is authentication handled?").

    Returns:
        QueryClassification containing:
            - category: One of SEMANTIC, PATTERN, STRUCTURAL, HYBRID
            - confidence: The winning score, capped at 0.95; 0.5 for the
              fallback results (invalid/short input, no match, error)
            - indicators: Labels of the indicator groups that matched
              ("pattern_match", "structural_match", "semantic_match",
              "hybrid_match"), or a single label describing a fallback
            - suggested_tool: Recommended tool (grep_search, ast_grep_search,
              semantic_search, or enhanced_search)
            - reasoning: Human-readable explanation

        The function does not raise: any exception is logged and turned into
        a HYBRID result whose indicators are ``["error"]``.

    Examples:
        >>> result = classify_query("Find all calls to authenticate()")
        >>> result.category
        <QueryCategory.PATTERN: 'pattern'>
        >>> result.confidence
        0.15
        >>> result.suggested_tool
        'grep_search'

        >>> result = classify_query("Where is authentication handled?")
        >>> result.category
        <QueryCategory.SEMANTIC: 'semantic'>
        >>> result.confidence
        0.3
        >>> result.suggested_tool
        'semantic_search'

        >>> result = classify_query("Find class definitions inheriting from Base")
        >>> result.category
        <QueryCategory.STRUCTURAL: 'structural'>
        >>> result.confidence
        0.2
        >>> result.suggested_tool
        'ast_grep_search'

    Performance:
        - Target: <10ms per classification
        - Uses only pure Python stdlib (re module)
        - No external dependencies or API calls
    """
    try:
        # Check the type first so truthiness is never evaluated on an
        # arbitrary non-string object.
        if not isinstance(query, str) or not query:
            return QueryClassification(
                category=QueryCategory.HYBRID,
                confidence=0.5,
                indicators=["invalid_input"],
                suggested_tool="enhanced_search",
                reasoning="Invalid or empty query, using safe default",
            )

        query_normalized = query.strip()
        if len(query_normalized) < 3:
            return QueryClassification(
                category=QueryCategory.HYBRID,
                confidence=0.5,
                indicators=["too_short"],
                suggested_tool="enhanced_search",
                reasoning="Query too short for accurate classification",
            )

        query_lower = query_normalized.lower()

        # (category, report label, indicator regexes, text to scan, points per hit)
        # Points are hundredths of confidence. Keeping them as integers makes
        # tie detection exact; with floats 2 * 0.15 != 3 * 0.10.
        # Pattern indicators see the original casing: lowercasing the text
        # would make the upper-case constant indicator unmatchable.
        phases = (
            (QueryCategory.PATTERN, "pattern_match",
             PATTERN_INDICATORS, query_normalized, 15),
            (QueryCategory.STRUCTURAL, "structural_match",
             STRUCTURAL_INDICATORS, query_lower, 20),
            (QueryCategory.SEMANTIC, "semantic_match",
             SEMANTIC_INDICATORS, query_lower, 15),
            (QueryCategory.HYBRID, "hybrid_match",
             HYBRID_INDICATORS, query_lower, 10),
        )

        points = {}
        all_indicators = []
        for phase_category, label, patterns, text, weight in phases:
            hits = sum(1 for pattern in patterns if re.search(pattern, text))
            points[phase_category] = hits * weight
            if hits:
                all_indicators.append(label)

        max_points = max(points.values())

        if max_points == 0:
            # Nothing matched at all: fall back to multi-modal search.
            result = QueryClassification(
                category=QueryCategory.HYBRID,
                confidence=0.5,
                indicators=[],
                suggested_tool="enhanced_search",
                reasoning="No clear indicators found, using multi-modal search",
            )
        else:
            winners = [cat for cat, pts in points.items() if pts == max_points]
            # Several categories sharing the top score means the query is
            # ambiguous, so route it to HYBRID.
            category = winners[0] if len(winners) == 1 else QueryCategory.HYBRID

            reasoning_by_category = {
                QueryCategory.PATTERN:
                    "Query contains exact identifiers or code syntax",
                QueryCategory.STRUCTURAL:
                    "Query requires AST-level understanding of code structure",
                QueryCategory.SEMANTIC:
                    "Query asks about conceptual logic or behavior",
                QueryCategory.HYBRID:
                    "Query combines multiple search approaches or is ambiguous",
            }

            result = QueryClassification(
                category=category,
                confidence=min(max_points / 100, 0.95),
                indicators=all_indicators,
                suggested_tool=TOOL_ROUTING[category],
                reasoning=reasoning_by_category[category],
            )

        # Log classification for analytics (lazy %-formatting: the message is
        # only built when DEBUG logging is enabled).
        logger.debug(
            "QUERY-CLASSIFY: query='%s...' category=%s confidence=%.2f tool=%s",
            query_normalized[:50],
            result.category.value,
            result.confidence,
            result.suggested_tool,
        )
        return result

    except Exception as e:
        # Deliberate best-effort boundary: classification must never break
        # the caller, so log the failure and return the safe default.
        logger.exception("Error classifying query: %s", e)
        return QueryClassification(
            category=QueryCategory.HYBRID,
            confidence=0.5,
            indicators=["error"],
            suggested_tool="enhanced_search",
            reasoning=f"Classification error: {e}, using safe default",
        )