Coverage for src\llm_code_lens\analyzer\sql.py: 75%

177 statements  

coverage.py v7.6.1, created at 2025-01-12 10:23 +0200

import re
import os
import pyodbc
from pathlib import Path
from typing import Dict, List, Optional


class SQLServerAnalyzer:
    """SQL Server code analyzer for stored procedures, views, and functions."""

    def __init__(self):
        self.conn = None
        self.cursor = None

    def connect(self, connection_string: Optional[str] = None) -> None:
        """
        Connect to SQL Server using either a provided connection string or environment variables.

        Args:
            connection_string: Optional connection string. If not provided, uses environment variables.
        """
        try:
            if connection_string:
                self.conn = pyodbc.connect(connection_string)
            else:
                # Use environment variables
                server = os.getenv('MSSQL_SERVER')
                username = os.getenv('MSSQL_USERNAME')
                password = os.getenv('MSSQL_PASSWORD')

                if not server:
                    raise ValueError("No server specified. Provide connection string or set MSSQL_SERVER environment variable")

                # Build connection string
                conn_str = f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={server}'
                if username and password:
                    conn_str += f';UID={username};PWD={password}'
                else:
                    conn_str += ';Trusted_Connection=yes'

                self.conn = pyodbc.connect(conn_str)

            self.cursor = self.conn.cursor()

        except Exception as e:
            raise ConnectionError(f"Failed to connect to SQL Server: {str(e)}")
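
    # The connection strings built above take one of these two shapes (server name,
    # user, and password below are illustrative placeholders, not module defaults):
    #   SQL auth:     DRIVER={ODBC Driver 17 for SQL Server};SERVER=myhost;UID=user;PWD=secret
    #   Windows auth: DRIVER={ODBC Driver 17 for SQL Server};SERVER=myhost;Trusted_Connection=yes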

    def list_databases(self) -> List[str]:
        """List all accessible databases."""
        if not self.cursor:
            raise ConnectionError("Not connected to SQL Server")

        self.cursor.execute("SELECT name FROM sys.databases WHERE database_id > 4")  # Skip system DBs
        return [row.name for row in self.cursor.fetchall()]

    def analyze_database(self, database: str) -> Dict:
        """
        Analyze a specific database.

        Args:
            database: Name of the database to analyze

        Returns:
            Dict containing analysis of stored procedures, views, and functions
        """
        if not self.cursor:
            raise ConnectionError("Not connected to SQL Server")

        # Switch to specified database
        self.cursor.execute(f"USE [{database}]")

        return {
            'stored_procedures': self._analyze_stored_procedures(),
            'views': self._analyze_views(),
            'functions': self._analyze_functions()
        }
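
    # Shape of the dict returned above (a sketch; each list is built by the
    # corresponding _analyze_* helper below):
    #   {
    #       'stored_procedures': [ {...}, ... ],
    #       'views':             [ {...}, ... ],
    #       'functions':         [ {...}, ... ]
    #   }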

    def _analyze_stored_procedures(self) -> List[Dict]:
        """Analyze stored procedures in current database."""
        self.cursor.execute("""
            SELECT
                OBJECT_SCHEMA_NAME(p.object_id) as schema_name,
                p.name,
                m.definition,
                p.create_date,
                p.modify_date
            FROM sys.procedures p
            INNER JOIN sys.sql_modules m ON p.object_id = m.object_id
            ORDER BY schema_name, p.name
        """)

        procedures = []
        for row in self.cursor.fetchall():
            proc_def = row.definition

            # Analyze the procedure
            proc_analysis = {
                'schema': row.schema_name,
                'name': row.name,
                'definition': proc_def,
                'metrics': {
                    'lines': len(proc_def.splitlines()),
                    'complexity': self._estimate_complexity(proc_def)
                },
                'parameters': self._extract_parameters(proc_def),
                'dependencies': self._extract_dependencies(proc_def),
                'todos': [],
                'comments': []
            }

            # Extract comments and TODOs
            comments, todos = self._extract_comments_and_todos(proc_def)
            proc_analysis['comments'] = comments
            proc_analysis['todos'] = todos

            procedures.append(proc_analysis)

        return procedures
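
    # Each element appended above looks roughly like this (illustrative values):
    #   {
    #       'schema': 'dbo', 'name': 'usp_Example', 'definition': '<full T-SQL source>',
    #       'metrics': {'lines': 120, 'complexity': 18},
    #       'parameters': [...], 'dependencies': [...], 'todos': [...], 'comments': [...]
    #   }
    # _analyze_views() and _analyze_functions() below build the same structure
    # (views omit the 'parameters' key).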

    def _analyze_views(self) -> List[Dict]:
        """Analyze views in current database."""
        self.cursor.execute("""
            SELECT
                OBJECT_SCHEMA_NAME(v.object_id) as schema_name,
                v.name,
                m.definition,
                v.create_date,
                v.modify_date
            FROM sys.views v
            INNER JOIN sys.sql_modules m ON v.object_id = m.object_id
            ORDER BY schema_name, v.name
        """)

        views = []
        for row in self.cursor.fetchall():
            view_def = row.definition

            # Analyze the view
            view_analysis = {
                'schema': row.schema_name,
                'name': row.name,
                'definition': view_def,
                'metrics': {
                    'lines': len(view_def.splitlines()),
                    'complexity': self._estimate_complexity(view_def)
                },
                'dependencies': self._extract_dependencies(view_def),
                'todos': [],
                'comments': []
            }

            # Extract comments and TODOs
            comments, todos = self._extract_comments_and_todos(view_def)
            view_analysis['comments'] = comments
            view_analysis['todos'] = todos

            views.append(view_analysis)

        return views

    def _analyze_functions(self) -> List[Dict]:
        """Analyze functions in current database."""
        self.cursor.execute("""
            SELECT
                OBJECT_SCHEMA_NAME(f.object_id) as schema_name,
                f.name,
                m.definition,
                f.create_date,
                f.modify_date,
                f.type
            FROM sys.objects f
            INNER JOIN sys.sql_modules m ON f.object_id = m.object_id
            WHERE f.type IN ('FN', 'IF', 'TF') -- Scalar, Inline Table, Table-valued
            ORDER BY schema_name, f.name
        """)

        functions = []
        for row in self.cursor.fetchall():
            func_def = row.definition

            # Analyze the function
            func_analysis = {
                'schema': row.schema_name,
                'name': row.name,
                'definition': func_def,
                'metrics': {
                    'lines': len(func_def.splitlines()),
                    'complexity': self._estimate_complexity(func_def)
                },
                'parameters': self._extract_parameters(func_def),
                'dependencies': self._extract_dependencies(func_def),
                'todos': [],
                'comments': []
            }

            # Extract comments and TODOs
            comments, todos = self._extract_comments_and_todos(func_def)
            func_analysis['comments'] = comments
            func_analysis['todos'] = todos

            functions.append(func_analysis)

        return functions

    def analyze_file(self, file_path: Path) -> dict:
        """Analyze a SQL file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        analysis = {
            'type': 'sql',
            'metrics': {
                'loc': len(content.splitlines()),
                'complexity': self._estimate_complexity(content)
            },
            'objects': [],
            'parameters': [],
            'comments': [],
            'todos': [],
            'dependencies': self._extract_dependencies(content)
        }

        # Extract SQL objects
        objects = self._extract_sql_objects(content)
        if objects:
            analysis['objects'] = objects

        # Extract and update parameters with comments
        params = self._extract_parameters(content)
        self._update_params_with_comments(params, content)
        if params:
            analysis['parameters'] = params

        # Extract comments and TODOs
        comments, todos = self._extract_comments_and_todos(content)
        analysis['comments'] = comments
        analysis['todos'] = todos

        return analysis
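
    # analyze_file() returns a flat summary for a standalone .sql script, e.g. (sketch):
    #   {'type': 'sql', 'metrics': {'loc': 42, 'complexity': 9}, 'objects': [...],
    #    'parameters': [...], 'comments': [...], 'todos': [...], 'dependencies': [...]}
    # Unlike the database methods above, it works purely on text, so no connection is needed.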

    def __del__(self):
        """Clean up database connections."""
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()

    def _extract_sql_objects(self, content: str) -> List[dict]:
        """Extract SQL objects like procedures, functions, and views."""
        objects = []

        # Match CREATE/ALTER statements
        patterns = {
            'procedure': r'CREATE\s+(?:OR\s+ALTER\s+)?PROCEDURE\s+([^\s]+)',
            'function': r'CREATE\s+(?:OR\s+ALTER\s+)?FUNCTION\s+([^\s]+)',
            'view': r'CREATE\s+(?:OR\s+ALTER\s+)?VIEW\s+([^\s]+)'
        }

        for obj_type, pattern in patterns.items():
            matches = re.finditer(pattern, content, re.IGNORECASE)
            for match in matches:
                name = match.group(1)
                # Find the object's body
                start_pos = match.start()
                # Look for GO or end of file
                end_match = re.search(r'\bGO\b', content[start_pos:], re.IGNORECASE)
                if end_match:
                    end_pos = start_pos + end_match.start()
                    definition = content[start_pos:end_pos].strip()
                else:
                    definition = content[start_pos:].strip()

                objects.append({
                    'type': obj_type,
                    'name': name,
                    'definition': definition,
                    'loc': len(definition.splitlines()),
                    'complexity': self._estimate_complexity(definition)
                })

        return objects
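
    # Example (illustrative): given the script
    #   "CREATE PROCEDURE dbo.usp_GetUsers AS SELECT * FROM dbo.Users\nGO"
    # this yields one entry with type='procedure', name='dbo.usp_GetUsers', and the text
    # up to (but not including) the GO batch separator as its definition.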

    def _extract_parameters(self, content: str) -> List[dict]:
        """Extract parameters from procedure or function definitions."""
        params = []
        # Find the procedure or function declaration
        proc_match = re.search(
            r'CREATE\s+(?:OR\s+ALTER\s+)?(?:PROCEDURE|FUNCTION)\s+([^\s]+)([\s\S]+?)AS\b',
            content,
            re.IGNORECASE
        )

        if proc_match:
            param_section = proc_match.group(2)
            # Extract each parameter line, handling multiline declarations
            param_lines = re.findall(
                r'@\w+\s+[^,@]+(?:\s*=\s*[^,]+)?(?=\s*,|\s*AS\b|\s*$)',
                param_section,
                re.IGNORECASE | re.DOTALL
            )

            for param_line in param_lines:
                # Extract individual parameter components
                param_match = re.match(
                    r'@(\w+)\s+([^=\s]+(?:\([^)]*\))?)\s*(?:=\s*([^,\s][^,]*)?)?',
                    param_line.strip()
                )

                if param_match:
                    name, data_type, default = param_match.groups()
                    param_info = {
                        'name': name,
                        'data_type': data_type.strip()
                    }

                    if default:
                        param_info['default'] = default.strip()

                    # Look for inline comment on the same line
                    comment_match = re.search(r'--\s*(.*?)(?:\r?\n|$)', param_line)
                    if comment_match:
                        param_info['description'] = comment_match.group(1).strip()

                    params.append(param_info)

        return params
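
    # Example (illustrative): for the header
    #   "CREATE PROCEDURE dbo.usp_GetUser @UserId INT, @Active BIT = 1 AS ..."
    # the result is roughly:
    #   [{'name': 'UserId', 'data_type': 'INT'},
    #    {'name': 'Active', 'data_type': 'BIT', 'default': '1'}]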

    def _update_params_with_comments(self, params: List[dict], content: str) -> None:
        """Update parameter documentation from nearby comments."""
        lines = content.splitlines()
        for i, line in enumerate(lines):
            if '--' in line and any(param['name'] in line for param in params):
                comment = line[line.index('--') + 2:].strip()
                param_name = next(
                    (param['name'] for param in params if param['name'] in line),
                    None
                )
                if param_name:
                    param = next(p for p in params if p['name'] == param_name)
                    if 'description' not in param:
                        param['description'] = comment

    def _extract_dependencies(self, content: str) -> List[str]:
        """Extract table and view dependencies."""
        deps = set()

        # Define patterns for table references
        patterns = [
            # FROM, JOIN, UPDATE, etc. followed by table name
            r'(?:FROM|JOIN|INTO|UPDATE)\s+([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)?)\b(?!\s*[=@])',
            # INSERT INTO pattern
            r'INSERT\s+INTO\s+([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)?)\b',
            # REFERENCES in constraints
            r'REFERENCES\s+([a-zA-Z_]\w*(?:\.[a-zA-Z_]\w*)?)\b'
        ]

        # Define words that should not be treated as table names
        excluded_words = {
            'null', 'select', 'where', 'group', 'order', 'having',
            'exists', 'between', 'like', 'in', 'is', 'not', 'and', 'or',
            'operation', 'existing'  # Add common variables
        }

        for pattern in patterns:
            for match in re.finditer(pattern, content, re.IGNORECASE):
                table_name = match.group(1).strip()
                if table_name.lower() not in excluded_words:
                    deps.add(table_name)

        return sorted(list(deps))
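
    # Example (illustrative):
    #   _extract_dependencies("SELECT * FROM dbo.Users u JOIN dbo.Orders o ON u.Id = o.UserId")
    #   -> ['dbo.Orders', 'dbo.Users']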

    def _extract_comments_and_todos(self, content: str) -> tuple:
        """Extract comments and TODOs from SQL code."""
        comments = []
        todos = []

        # Match inline comments and block comments
        patterns = [
            (r'--([^\n]+)', False),    # Inline comments
            (r'/\*[\s\S]*?\*/', True)  # Block comments
        ]

        for pattern, is_multiline in patterns:
            for match in re.finditer(pattern, content):
                comment = match.group()
                if is_multiline:
                    comment = comment.strip('/*').strip('*/')
                else:
                    comment = comment.strip('--')
                comment = comment.strip()

                # Skip empty comments and parameter comments
                if not comment or comment.startswith('@'):
                    continue

                line_num = content[:match.start()].count('\n') + 1

                if any(marker in comment.upper()
                       for marker in ['TODO', 'FIXME', 'XXX']):
                    todos.append({
                        'text': comment,
                        'line': line_num
                    })
                else:
                    comments.append({
                        'text': comment,
                        'line': line_num
                    })

        return comments, todos
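
    # Example (illustrative): "-- TODO: add index on Users.Email" on line 3 becomes
    #   {'text': 'TODO: add index on Users.Email', 'line': 3} in the todos list,
    # while ordinary '--' and '/* ... */' comments land in the comments list.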

    def _estimate_complexity(self, content: str) -> int:
        """Estimate SQL complexity based on various factors."""
        complexity = 0
        content_lower = content.lower()

        # Control flow complexity
        complexity += content_lower.count('if ') * 2
        complexity += content_lower.count('else ') * 2
        complexity += content_lower.count('case ') * 2
        complexity += content_lower.count('while ') * 3
        complexity += content_lower.count('cursor') * 4

        # Query complexity
        complexity += content_lower.count('join ') * 2
        complexity += content_lower.count('where ') * 2
        complexity += content_lower.count('group by ') * 2
        complexity += content_lower.count('having ') * 3
        complexity += content_lower.count('union ') * 3

        # Transaction complexity
        complexity += content_lower.count('transaction') * 2
        complexity += content_lower.count('try') * 2
        complexity += content_lower.count('catch') * 2

        return complexity
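

# Minimal usage sketch (illustrative; the script path below is a placeholder, and
# connect() without arguments assumes MSSQL_SERVER / MSSQL_USERNAME / MSSQL_PASSWORD are set):
#
#   analyzer = SQLServerAnalyzer()
#   analyzer.connect()
#   for db in analyzer.list_databases():
#       result = analyzer.analyze_database(db)
#       print(db, len(result['stored_procedures']), 'stored procedures')
#
#   # Or analyze a standalone script without connecting:
#   file_report = analyzer.analyze_file(Path('schema/usp_example.sql'))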