Coverage for session_buddy / health_checks.py: 95.27%
131 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-04 00:43 -0800
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-04 00:43 -0800
1"""Health check implementations for session-mgmt-mcp server.
3Provides component-level health checks for database connectivity,
4file system access, and optional dependencies.
6Phase 10.1: Production Hardening - Session Management Health Checks
7"""
9from __future__ import annotations
11import importlib.util
12import sys
13import time
14import typing as t
16# Health status types (mcp_common.health doesn't exist in 2.0.0)
17from dataclasses import dataclass, field
18from enum import StrEnum
19from pathlib import Path
20from typing import TYPE_CHECKING
22if TYPE_CHECKING:
23 from session_buddy.reflection_tools import (
24 get_initialized_reflection_database,
25 get_reflection_database,
26 )
29class HealthStatus(StrEnum):
30 """Health status levels."""
32 HEALTHY = "healthy"
33 DEGRADED = "degraded"
34 UNHEALTHY = "unhealthy"
@dataclass
class ComponentHealth:
    """Result record produced by a single component health check.

    Attributes:
        name: Identifier of the checked component (e.g. ``"database"``).
        status: Health level reported for the component.
        message: Human-readable summary of the outcome.
        latency_ms: Duration of the check in milliseconds, or ``None`` when
            it was not measured.
        metadata: Extra details about the check; each instance gets its own
            fresh, empty dict by default.
    """

    # Required fields.
    name: str
    status: HealthStatus
    message: str

    # Optional fields.
    latency_ms: float | None = None
    metadata: dict[str, t.Any] = field(default_factory=dict)
# Optional dependency: the reflection database helpers. When the import
# fails the two helper names stay undefined, so callers must consult
# REFLECTION_AVAILABLE before using them.
try:
    from session_buddy.reflection_tools import (
        get_initialized_reflection_database,
        get_reflection_database,
    )
except ImportError:
    REFLECTION_AVAILABLE = False
else:
    REFLECTION_AVAILABLE = True
async def check_database_health(
    *,
    latency_threshold_ms: float = 500.0,
) -> ComponentHealth:
    """Check DuckDB reflection database connectivity and health.

    Args:
        latency_threshold_ms: Latency in milliseconds above which an
            otherwise working database is reported as DEGRADED. Defaults
            to 500.

    Returns:
        ComponentHealth named ``"database"``:

        - DEGRADED when the reflection tools could not be imported, when no
          database instance is available, or when the stats query took
          longer than ``latency_threshold_ms``.
        - HEALTHY when the stats query succeeded within the threshold.
        - UNHEALTHY when any exception was raised while probing; the
          message carries the first 100 characters of the error text.

    Checks:
        - Database connection
        - Basic query execution
        - Response latency

    """
    if not REFLECTION_AVAILABLE:
        return ComponentHealth(
            name="database",
            status=HealthStatus.DEGRADED,
            message="Reflection database not available (optional feature)",
        )

    start_time = time.perf_counter()

    def _elapsed_ms() -> float:
        """Return milliseconds elapsed since the check started."""
        return (time.perf_counter() - start_time) * 1000

    try:
        # REFLECTION_AVAILABLE is known to be true here, so the helper can be
        # called unconditionally.
        db = get_initialized_reflection_database()
        # Allow tests to patch get_reflection_database without initializing
        # the database in production: the awaitable getter is only used when
        # it has been replaced by a unittest.mock object.
        if (
            db is None
            and getattr(get_reflection_database, "__module__", "") == "unittest.mock"
        ):
            db = await get_reflection_database()
        if db is None:
            return ComponentHealth(
                name="database",
                status=HealthStatus.DEGRADED,
                message="Reflection database not initialized",
                latency_ms=_elapsed_ms(),
                metadata={"initialized": False},
            )

        # Test basic query execution; latency covers lookup plus query.
        stats = await db.get_stats()
        latency_ms = _elapsed_ms()
        metadata = {"conversations": stats.get("conversations_count", 0)}

        # A slow but working database is degraded rather than unhealthy.
        if latency_ms > latency_threshold_ms:
            return ComponentHealth(
                name="database",
                status=HealthStatus.DEGRADED,
                message=f"High database latency: {latency_ms:.1f}ms",
                latency_ms=latency_ms,
                metadata=metadata,
            )

        return ComponentHealth(
            name="database",
            status=HealthStatus.HEALTHY,
            message="Database operational",
            latency_ms=latency_ms,
            metadata=metadata,
        )

    except Exception as e:
        # Health checks must report failures instead of raising them.
        return ComponentHealth(
            name="database",
            status=HealthStatus.UNHEALTHY,
            message=f"Database error: {str(e)[:100]}",
            latency_ms=_elapsed_ms(),
        )
async def check_file_system_health() -> ComponentHealth:
    """Check file system access for the ``~/.claude`` directory tree.

    Returns:
        ComponentHealth named ``"file_system"``. Every result carries the
        elapsed time of the check in ``latency_ms``.

    Checks:
        - ``~/.claude`` exists (UNHEALTHY otherwise)
        - ``~/.claude`` is writable, probed by creating and removing a small
          ``.health_check`` file (UNHEALTHY otherwise)
        - ``logs`` and ``data`` subdirectories exist (DEGRADED otherwise)

    Note:
        The probe uses synchronous file I/O inside this coroutine. Free disk
        space is not inspected.

    """
    start_time = time.perf_counter()

    def _elapsed_ms() -> float:
        """Return milliseconds elapsed since the check started."""
        return (time.perf_counter() - start_time) * 1000

    try:
        claude_dir = Path.home() / ".claude"

        # Check if directory exists
        if not claude_dir.exists():
            return ComponentHealth(
                name="file_system",
                status=HealthStatus.UNHEALTHY,
                message="~/.claude directory does not exist",
                latency_ms=_elapsed_ms(),
            )

        # Check write permissions by creating/removing test file
        test_file = claude_dir / ".health_check"
        try:
            test_file.write_text("health_check")
            # The probe file name is shared, so a concurrent health check may
            # already have removed it; that must not be reported as a
            # permission problem.
            test_file.unlink(missing_ok=True)
        except OSError as e:  # PermissionError is a subclass of OSError
            return ComponentHealth(
                name="file_system",
                status=HealthStatus.UNHEALTHY,
                message=f"~/.claude not writable: {e}",
                latency_ms=_elapsed_ms(),
            )

        # Check critical subdirectories
        missing_dirs = [
            subdir for subdir in ("logs", "data") if not (claude_dir / subdir).exists()
        ]

        latency_ms = _elapsed_ms()

        if missing_dirs:
            return ComponentHealth(
                name="file_system",
                status=HealthStatus.DEGRADED,
                message=f"Missing directories: {', '.join(missing_dirs)}",
                latency_ms=latency_ms,
            )

        return ComponentHealth(
            name="file_system",
            status=HealthStatus.HEALTHY,
            message="File system accessible",
            latency_ms=latency_ms,
        )

    except Exception as e:
        # Health checks must report failures instead of raising them.
        return ComponentHealth(
            name="file_system",
            status=HealthStatus.UNHEALTHY,
            message=f"File system error: {str(e)[:100]}",
            latency_ms=_elapsed_ms(),
        )
async def check_dependencies_health() -> ComponentHealth:
    """Report which optional features can be located.

    Returns:
        ComponentHealth named ``"dependencies"``: HEALTHY when every probed
        feature is present, DEGRADED when at least one is missing (including
        when all are missing). Missing optional features never yield
        UNHEALTHY.

    Checks:
        - ``crackerjack`` integration
        - ``onnxruntime`` (reported as ``"onnx"``)
        - ``session_buddy.multi_project_coordinator`` (reported as
          ``"multi_project"``)

    """
    started = time.perf_counter()

    def _has_module(module_name: str) -> bool:
        # Locate the module without importing it. find_spec raises ValueError
        # in some cases; then fall back to whether it is already loaded.
        try:
            spec = importlib.util.find_spec(module_name)
        except ValueError:
            return module_name in sys.modules
        return spec is not None

    # Crackerjack: prefer the flag on the already-loaded quality utils module
    # so that no heavy module has to be imported here.
    loaded_quality_utils = sys.modules.get("session_buddy.utils.quality_utils_v2")
    if loaded_quality_utils is None:
        has_crackerjack = _has_module("crackerjack")
    else:
        has_crackerjack = bool(
            getattr(loaded_quality_utils, "CRACKERJACK_AVAILABLE", False)
        )

    # ONNX/embeddings, again without importing.
    has_onnx = _has_module("onnxruntime")

    # Multi-project features: look the module up directly without triggering
    # server init; any failure during lookup counts as "not available".
    try:
        has_multi_project = (
            importlib.util.find_spec("session_buddy.multi_project_coordinator")
            is not None
        )
    except Exception:
        has_multi_project = False

    probes = (
        ("crackerjack", has_crackerjack),
        ("onnx", has_onnx),
        ("multi_project", has_multi_project),
    )
    available = [feature for feature, present in probes if present]
    unavailable = [feature for feature, present in probes if not present]

    latency_ms = (time.perf_counter() - started) * 1000

    # All optional dependencies missing is degraded, not unhealthy
    if not available:
        return ComponentHealth(
            name="dependencies",
            status=HealthStatus.DEGRADED,
            message="No optional features available",
            latency_ms=latency_ms,
            metadata={"unavailable": unavailable},
        )

    # Some dependencies available
    message = f"{len(available)} features available"
    if unavailable:
        status = HealthStatus.DEGRADED
        message += f", {len(unavailable)} unavailable"
    else:
        status = HealthStatus.HEALTHY

    return ComponentHealth(
        name="dependencies",
        status=status,
        message=message,
        latency_ms=latency_ms,
        metadata={"available": available, "unavailable": unavailable},
    )
async def check_python_environment_health() -> ComponentHealth:
    """Check Python environment health and configuration.

    Returns:
        ComponentHealth named ``"python_env"``. Every result carries the
        elapsed time of the check in ``latency_ms``; the HEALTHY result also
        reports ``python_version`` and ``platform`` in ``metadata``.

    Checks:
        - Python version is 3.13 or newer (UNHEALTHY otherwise)
        - Critical standard-library modules can be imported (UNHEALTHY
          otherwise)

    Note:
        Memory usage is not inspected.

    """
    start_time = time.perf_counter()

    def _elapsed_ms() -> float:
        """Return milliseconds elapsed since the check started."""
        return (time.perf_counter() - start_time) * 1000

    try:
        # Check Python version (3.13+ required)
        version_info = sys.version_info
        if version_info < (3, 13):
            return ComponentHealth(
                name="python_env",
                status=HealthStatus.UNHEALTHY,
                message=f"Python 3.13+ required, got {version_info.major}.{version_info.minor}",
                latency_ms=_elapsed_ms(),
            )

        # Check critical imports
        critical_imports = ["asyncio", "pathlib", "dataclasses", "enum"]
        missing_imports = []

        for module_name in critical_imports:
            try:
                __import__(module_name)
            except ImportError:
                missing_imports.append(module_name)

        if missing_imports:
            return ComponentHealth(
                name="python_env",
                status=HealthStatus.UNHEALTHY,
                message=f"Missing critical imports: {', '.join(missing_imports)}",
                latency_ms=_elapsed_ms(),
            )

        python_version = (
            f"{version_info.major}.{version_info.minor}.{version_info.micro}"
        )

        return ComponentHealth(
            name="python_env",
            status=HealthStatus.HEALTHY,
            message=f"Python {python_version}",
            latency_ms=_elapsed_ms(),
            metadata={
                "python_version": python_version,
                "platform": sys.platform,
            },
        )

    except Exception as e:
        # Health checks must report failures instead of raising them.
        return ComponentHealth(
            name="python_env",
            status=HealthStatus.UNHEALTHY,
            message=f"Environment check failed: {str(e)[:100]}",
            latency_ms=_elapsed_ms(),
        )
async def get_all_health_checks() -> list[ComponentHealth]:
    """Run all health checks and return results.

    Returns:
        List of ComponentHealth results, one per check, in the order
        ``python_env``, ``file_system``, ``database``, ``dependencies``.
        A check that raises is represented by an UNHEALTHY entry whose
        message contains up to 100 characters of the error text (or the
        exception type name when the error has no text).

    This is the main entry point for the health endpoint.

    """
    import asyncio

    # Each component name sits next to the coroutine function producing it,
    # so names and results cannot drift apart if a check is added or moved.
    checks = (
        ("python_env", check_python_environment_health),
        ("file_system", check_file_system_health),
        ("database", check_database_health),
        ("dependencies", check_dependencies_health),
    )

    # Run all checks concurrently
    results = await asyncio.gather(
        *(check() for _, check in checks),
        return_exceptions=True,
    )

    # Convert any exceptions to unhealthy components
    components: list[ComponentHealth] = []
    for (check_name, _), result in zip(checks, results, strict=True):
        # BaseException rather than Exception: asyncio.CancelledError derives
        # from BaseException and would otherwise be appended to the list as
        # if it were a ComponentHealth.
        if isinstance(result, BaseException):
            detail = str(result) or type(result).__name__
            components.append(
                ComponentHealth(
                    name=check_name,
                    status=HealthStatus.UNHEALTHY,
                    message=f"Health check crashed: {detail[:100]}",
                ),
            )
        else:
            components.append(result)

    return components
# Public API of this module. The result types are exported alongside the
# check functions because every check returns a ComponentHealth carrying a
# HealthStatus.
__all__ = [
    "ComponentHealth",
    "HealthStatus",
    "check_database_health",
    "check_dependencies_health",
    "check_file_system_health",
    "check_python_environment_health",
    "get_all_health_checks",
]