Coverage for session_buddy / health_checks.py: 95.27%

131 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-04 00:43 -0800

1"""Health check implementations for session-mgmt-mcp server. 

2 

3Provides component-level health checks for database connectivity, 

4file system access, and optional dependencies. 

5 

6Phase 10.1: Production Hardening - Session Management Health Checks 

7""" 

8 

9from __future__ import annotations 

10 

11import importlib.util 

12import sys 

13import time 

14import typing as t 

15 

16# Health status types (mcp_common.health doesn't exist in 2.0.0) 

17from dataclasses import dataclass, field 

18from enum import StrEnum 

19from pathlib import Path 

20from typing import TYPE_CHECKING 

21 

22if TYPE_CHECKING: 

23 from session_buddy.reflection_tools import ( 

24 get_initialized_reflection_database, 

25 get_reflection_database, 

26 ) 

27 

28 

29class HealthStatus(StrEnum): 

30 """Health status levels.""" 

31 

32 HEALTHY = "healthy" 

33 DEGRADED = "degraded" 

34 UNHEALTHY = "unhealthy" 

35 

36 

@dataclass
class ComponentHealth:
    """Result produced by a single component health check."""

    # Identifier of the component that was checked (e.g. "database").
    name: str
    # Status level reported by the check.
    status: HealthStatus
    # Human-readable summary of the outcome.
    message: str
    # Duration of the check in milliseconds; None when it was not measured.
    latency_ms: float | None = None
    # Extra check-specific details; every instance gets its own fresh dict.
    metadata: dict[str, t.Any] = field(default_factory=dict)

46 

47 

# The reflection tools are an optional dependency. REFLECTION_AVAILABLE
# records whether their import succeeded so callers can guard on it.
try:
    from session_buddy.reflection_tools import (
        get_initialized_reflection_database,
        get_reflection_database,
    )
except ImportError:
    REFLECTION_AVAILABLE = False
else:
    REFLECTION_AVAILABLE = True

58 

59 

async def check_database_health() -> ComponentHealth:
    """Probe the DuckDB reflection database and report its health.

    Returns:
        ComponentHealth named "database" with one of:
        - DEGRADED: the reflection tools are unavailable, the database is
          not initialized, or the probe took longer than 500 ms
        - HEALTHY: the stats query completed within 500 ms
        - UNHEALTHY: an exception was raised while probing

    Checks:
        - Database handle can be obtained
        - Basic query execution (``get_stats``)
        - Response latency

    """
    if not REFLECTION_AVAILABLE:
        return ComponentHealth(
            name="database",
            status=HealthStatus.DEGRADED,
            message="Reflection database not available (optional feature)",
        )

    started = time.perf_counter()

    def _elapsed_ms() -> float:
        """Milliseconds elapsed since the probe started."""
        return (time.perf_counter() - started) * 1000

    try:
        # Availability was verified above, so the accessor can be called directly.
        database = get_initialized_reflection_database()

        # Test hook: when tests replace get_reflection_database with a mock,
        # use it; production code never initializes the database from here.
        if database is None:
            if getattr(get_reflection_database, "__module__", "") == "unittest.mock":
                database = await get_reflection_database()

        if database is None:
            return ComponentHealth(
                name="database",
                status=HealthStatus.DEGRADED,
                message="Reflection database not initialized",
                latency_ms=_elapsed_ms(),
                metadata={"initialized": False},
            )

        # A basic stats query doubles as the connectivity probe.
        stats = await database.get_stats()
        elapsed = _elapsed_ms()

        # More than 500 ms for a trivial query is treated as degraded.
        if elapsed > 500:
            status = HealthStatus.DEGRADED
            message = f"High database latency: {elapsed:.1f}ms"
        else:
            status = HealthStatus.HEALTHY
            message = "Database operational"

        return ComponentHealth(
            name="database",
            status=status,
            message=message,
            latency_ms=elapsed,
            metadata={"conversations": stats.get("conversations_count", 0)},
        )

    except Exception as exc:
        return ComponentHealth(
            name="database",
            status=HealthStatus.UNHEALTHY,
            message=f"Database error: {str(exc)[:100]}",
            latency_ms=_elapsed_ms(),
        )

129 

130 

async def check_file_system_health() -> ComponentHealth:
    """Check file system access for critical directories.

    Returns:
        ComponentHealth named "file_system": UNHEALTHY when ~/.claude is
        missing, not writable, or an unexpected error occurs; DEGRADED when
        an expected subdirectory is missing; HEALTHY otherwise. latency_ms
        is populated on every path.

    Checks:
        - ~/.claude directory exists
        - ~/.claude is writable (a probe file is created and removed)
        - "logs" and "data" subdirectories exist

    Note:
        Disk space is not checked.

    """
    start_time = time.perf_counter()

    def _result(status: HealthStatus, message: str) -> ComponentHealth:
        """Build a file_system result stamped with the elapsed time."""
        return ComponentHealth(
            name="file_system",
            status=status,
            message=message,
            latency_ms=(time.perf_counter() - start_time) * 1000,
        )

    try:
        claude_dir = Path.home() / ".claude"

        if not claude_dir.exists():
            return _result(
                HealthStatus.UNHEALTHY,
                "~/.claude directory does not exist",
            )

        # Verify write permission by creating and removing a probe file.
        test_file = claude_dir / ".health_check"
        try:
            test_file.write_text("health_check")
            # The probe name is shared, so a concurrent health check may have
            # removed it already; that must not be reported as "not writable".
            test_file.unlink(missing_ok=True)
        except OSError as e:  # PermissionError is a subclass of OSError
            return _result(
                HealthStatus.UNHEALTHY,
                f"~/.claude not writable: {e}",
            )

        # Missing subdirectories only degrade the component.
        missing_dirs = [
            subdir
            for subdir in ("logs", "data")
            if not (claude_dir / subdir).exists()
        ]
        if missing_dirs:
            return _result(
                HealthStatus.DEGRADED,
                f"Missing directories: {', '.join(missing_dirs)}",
            )

        return _result(HealthStatus.HEALTHY, "File system accessible")

    except Exception as e:
        # Anything unexpected (e.g. home directory resolution failing).
        return _result(
            HealthStatus.UNHEALTHY,
            f"File system error: {str(e)[:100]}",
        )

203 

204 

async def check_dependencies_health() -> ComponentHealth:
    """Report which optional features are installed.

    Returns:
        ComponentHealth named "dependencies": HEALTHY when every optional
        feature is present, DEGRADED when some or all are missing. Missing
        optional features never produce UNHEALTHY.

    Checks:
        - Crackerjack integration availability
        - ONNX runtime for embeddings
        - Multi-project coordinator module

    """
    started = time.perf_counter()

    def _has_module(module_name: str) -> bool:
        """Look a module up by spec, without importing the module itself."""
        try:
            spec = importlib.util.find_spec(module_name)
        except ValueError:
            # find_spec raised for this name; fall back to the loaded-module table.
            return module_name in sys.modules
        return spec is not None

    # Crackerjack: prefer the flag from an already-loaded quality_utils
    # module, otherwise fall back to a spec lookup (no heavy import).
    quality_utils = sys.modules.get("session_buddy.utils.quality_utils_v2")
    if quality_utils is None:
        has_crackerjack = _has_module("crackerjack")
    else:
        has_crackerjack = bool(
            getattr(quality_utils, "CRACKERJACK_AVAILABLE", False)
        )

    # ONNX/embeddings, also without importing.
    has_onnx = _has_module("onnxruntime")

    # Multi-project features: any failure during lookup counts as unavailable.
    try:
        has_multi_project = (
            importlib.util.find_spec("session_buddy.multi_project_coordinator")
            is not None
        )
    except Exception:
        has_multi_project = False

    feature_flags = (
        ("crackerjack", has_crackerjack),
        ("onnx", has_onnx),
        ("multi_project", has_multi_project),
    )
    available = [feature for feature, present in feature_flags if present]
    unavailable = [feature for feature, present in feature_flags if not present]

    latency_ms = (time.perf_counter() - started) * 1000

    # Nothing optional installed is degraded, not unhealthy.
    if not available:
        return ComponentHealth(
            name="dependencies",
            status=HealthStatus.DEGRADED,
            message="No optional features available",
            latency_ms=latency_ms,
            metadata={"unavailable": unavailable},
        )

    message = f"{len(available)} features available"
    if unavailable:
        status = HealthStatus.DEGRADED
        message += f", {len(unavailable)} unavailable"
    else:
        status = HealthStatus.HEALTHY

    return ComponentHealth(
        name="dependencies",
        status=status,
        message=message,
        latency_ms=latency_ms,
        metadata={"available": available, "unavailable": unavailable},
    )

285 

286 

async def check_python_environment_health() -> ComponentHealth:
    """Check Python environment health and configuration.

    Returns:
        ComponentHealth named "python_env": UNHEALTHY when the interpreter
        is older than 3.13, a critical module cannot be imported, or the
        check itself fails; HEALTHY otherwise, with the interpreter version
        and platform in metadata. latency_ms is populated on every path.

    Checks:
        - Python version compatibility (3.13+)
        - Critical standard-library imports available

    """
    start_time = time.perf_counter()

    def _elapsed_ms() -> float:
        """Milliseconds elapsed since the check started."""
        return (time.perf_counter() - start_time) * 1000

    try:
        # Check Python version (3.13+ required)
        version_info = sys.version_info
        if version_info < (3, 13):
            return ComponentHealth(
                name="python_env",
                status=HealthStatus.UNHEALTHY,
                message=f"Python 3.13+ required, got {version_info.major}.{version_info.minor}",
                latency_ms=_elapsed_ms(),
            )

        # Check critical imports
        missing_imports = []
        for module_name in ("asyncio", "pathlib", "dataclasses", "enum"):
            try:
                importlib.import_module(module_name)
            except ImportError:
                missing_imports.append(module_name)

        if missing_imports:
            return ComponentHealth(
                name="python_env",
                status=HealthStatus.UNHEALTHY,
                message=f"Missing critical imports: {', '.join(missing_imports)}",
                latency_ms=_elapsed_ms(),
            )

        python_version = (
            f"{version_info.major}.{version_info.minor}.{version_info.micro}"
        )
        return ComponentHealth(
            name="python_env",
            status=HealthStatus.HEALTHY,
            message=f"Python {python_version}",
            latency_ms=_elapsed_ms(),
            metadata={
                "python_version": python_version,
                "platform": sys.platform,
            },
        )

    except Exception as e:
        return ComponentHealth(
            name="python_env",
            status=HealthStatus.UNHEALTHY,
            message=f"Environment check failed: {str(e)[:100]}",
            latency_ms=_elapsed_ms(),
        )

351 

352 

async def get_all_health_checks() -> list[ComponentHealth]:
    """Run all health checks and return results.

    Returns:
        List of ComponentHealth results, one per check, in the order
        python_env, file_system, database, dependencies. A check that
        raises is reported as an UNHEALTHY component instead of propagating.

    This is the main entry point for the health endpoint.

    """
    import asyncio

    # Each name sits next to its check so crash reports cannot drift out of
    # sync with the order of the coroutines.
    checks = (
        ("python_env", check_python_environment_health),
        ("file_system", check_file_system_health),
        ("database", check_database_health),
        ("dependencies", check_dependencies_health),
    )

    # Run all checks concurrently
    results = await asyncio.gather(
        *(check() for _, check in checks),
        return_exceptions=True,
    )

    components: list[ComponentHealth] = []
    for (name, _), result in zip(checks, results, strict=True):
        # gather(return_exceptions=True) can hand back any BaseException
        # (e.g. CancelledError), not only Exception subclasses.
        if isinstance(result, BaseException):
            # Fall back to the exception type when it carries no message.
            detail = str(result) or type(result).__name__
            components.append(
                ComponentHealth(
                    name=name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Health check crashed: {detail[:100]}",
                ),
            )
        else:
            components.append(result)

    return components

390 

391 

# Public API: the health check coroutines plus the result types they return.
__all__ = [
    "ComponentHealth",
    "HealthStatus",
    "check_database_health",
    "check_dependencies_health",
    "check_file_system_health",
    "check_python_environment_health",
    "get_all_health_checks",
]