streamlit_healthcheck.healthcheck
import functools
import json
import logging
import os
import threading
import time
import traceback
from datetime import datetime
from typing import Any, Callable, Dict, List, Optional

import numpy as np
import pandas as pd
import psutil
import requests
import streamlit as st

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class StreamlitPageMonitor:
    """
    Singleton that records errors and exceptions raised by Streamlit pages.

    The monitor replaces `st.error` with a wrapper that stores every error
    message under the page that is currently active, and offers a decorator
    (`monitor_page`) that records uncaught exceptions of a page function.
    All records live in a class-level dictionary so they can be read from
    any thread (for example by `HealthCheckService`).

    Attributes:
        _instance (StreamlitPageMonitor): The singleton instance.
        _errors (Dict[str, List[Dict[str, Any]]]): Page name -> error records.
        _st_error (Callable): The original `st.error`, captured at class creation.
        _current_page (Optional[str]): Page set by `set_page_context`.
        _lock (threading.Lock): Guards every read/write of `_errors`.
        _patched (bool): True once `st.error` has been replaced.
    """

    _instance = None
    _errors: Dict[str, List[Dict[str, Any]]] = {}
    _st_error = st.error
    _current_page = None
    # `_errors` is written by Streamlit script threads and read by the
    # HealthCheckService background thread, so all access goes through a lock.
    _lock = threading.Lock()
    _patched = False

    def __new__(cls):
        """Return the singleton instance, installing the `st.error` patch on first use."""
        if cls._instance is None:
            cls._instance = super(StreamlitPageMonitor, cls).__new__(cls)
            cls._install_st_error_patch()
        return cls._instance

    @classmethod
    def _install_st_error_patch(cls):
        """Replace `st.error` with a recording wrapper (idempotent)."""
        if cls._patched:
            return

        def patched_error(*args, **kwargs):
            cls._record_error(cls._current_page, {
                'error': " ".join(str(arg) for arg in args),
                'traceback': traceback.format_stack(),
                'timestamp': datetime.now().isoformat(),
                'status': 'critical',
                'type': 'streamlit_error',
            })
            # Always forward to the original so the UI still shows the message.
            return cls._st_error(*args, **kwargs)

        st.error = patched_error
        cls._patched = True

    @classmethod
    def _record_error(cls, page_name: Optional[str], error_info: Dict[str, Any]):
        """
        Append `error_info` to the records of `page_name`.

        Args:
            page_name: Page the error belongs to; None is stored as "unknown_page".
            error_info: The error record. Its 'page' key is set to the
                normalised page name so the record and its bucket agree.
        """
        page_key = page_name if page_name is not None else "unknown_page"
        error_info['page'] = page_key
        with cls._lock:
            cls._errors.setdefault(page_key, []).append(error_info)

    @classmethod
    def _handle_st_error(cls, error_message: str):
        """
        Record a Streamlit error message for the current page.

        Args:
            error_message (str): The error message to be recorded.

        The stored record contains: 'error' (message prefixed with
        "Streamlit Error: "), 'traceback' (call stack), 'timestamp'
        (ISO format), 'status' ('critical'), 'type' ('streamlit_error')
        and 'page'.
        """
        # The page context is kept on the class by set_page_context();
        # Streamlit itself has no `_current_page` attribute.
        cls._record_error(cls._current_page, {
            'error': f"Streamlit Error: {error_message}",
            'traceback': traceback.format_stack(),
            'timestamp': datetime.now().isoformat(),
            'status': 'critical',
            'type': 'streamlit_error',
        })

    @classmethod
    def set_page_context(cls, page_name: str):
        """Set the current page context"""
        cls._current_page = page_name

    @classmethod
    def monitor_page(cls, page_name: str):
        """
        Decorator factory that monitors one Streamlit page.

        Args:
            page_name (str): The name of the page to monitor.

        Returns:
            Callable: A decorator. Each call of the decorated function
            - makes sure `st.error` is patched,
            - sets the page context to `page_name`,
            - drops previously recorded exceptions of the page while keeping
              its 'streamlit_error' records,
            - runs the function and, if it raises, records the exception
              (message, traceback, timestamp, status, type, page) and
              re-raises it.
        """

        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                # The decorator can be used without ever instantiating the
                # class, so the patch must not depend on __new__ having run.
                cls._install_st_error_patch()
                cls.set_page_context(page_name)

                # Clear previous exception errors but keep st.error calls
                with cls._lock:
                    if page_name in cls._errors:
                        cls._errors[page_name] = [
                            e for e in cls._errors[page_name]
                            if e.get('type') == 'streamlit_error'
                        ]

                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    cls._record_error(page_name, {
                        'error': str(e),
                        'traceback': traceback.format_exc(),
                        'timestamp': datetime.now().isoformat(),
                        'status': 'critical',
                        'type': 'exception',
                    })
                    raise
            return wrapper
        return decorator

    @classmethod
    def get_page_errors(cls):
        """
        Return the recorded errors of every page that has at least one.

        Returns:
            dict: Page name -> list of error details. Each detail has the keys
                - 'error' (str): The message, or 'Unknown error' if absent.
                - 'traceback' (list | str): Traceback data, or [] if absent.
                - 'timestamp' (str): ISO timestamp, or '' if absent.
                - 'type' (str): Error type, or 'unknown' if absent.
        """
        # Snapshot under the lock; build the result outside of it.
        with cls._lock:
            snapshot = {
                page: list(errors)
                for page, errors in cls._errors.items()
                if errors  # Only include pages with errors
            }

        return {
            page: [
                {
                    'error': err.get('error', 'Unknown error'),
                    'traceback': err.get('traceback', []),
                    'timestamp': err.get('timestamp', ''),
                    'type': err.get('type', 'unknown')
                }
                for err in errors
            ]
            for page, errors in snapshot.items()
        }

    @classmethod
    def clear_errors(cls, page_name: Optional[str] = None):
        """Clear errors for a specific page or all pages"""
        with cls._lock:
            if page_name:
                cls._errors.pop(page_name, None)
            else:
                cls._errors = {}


class HealthCheckService:
    """
    Periodic health monitoring for a Streamlit application.

    The service checks system resources (CPU, memory, disk), configured
    dependencies (API endpoints, databases), registered custom checks, the
    Streamlit server's health endpoint and the errors collected by
    `StreamlitPageMonitor`, and folds them into one overall status.

    Attributes:
        logger (logging.Logger): Logger for health check events.
        config_path (str): Path to the health check configuration file.
        health_data (Dict[str, Any]): Latest health check results.
        config (Dict): Loaded (or default) configuration.
        check_interval (int): Seconds between two background runs.
        _running (bool): True while the background loop should run.
        _thread (Optional[threading.Thread]): The background thread.
        _stop_event (threading.Event): Wakes and stops the current background loop.
        streamlit_url (str): URL of the Streamlit server.
        streamlit_port (int): Port of the Streamlit server.
    """

    def __init__(self, config_path: str = "health_check_config.json"):
        """
        Initialize the service from `config_path`.

        Args:
            config_path (str): Path to the health check configuration file.
                Defaults to "health_check_config.json". When the file is
                missing or unreadable the default configuration is used.
        """
        self.logger = logging.getLogger(f"{__name__}.HealthCheckService")
        self.logger.info("Initializing HealthCheckService")
        self.config_path = config_path
        self.health_data: Dict[str, Any] = {
            "last_updated": None,
            "system": {},
            "dependencies": {},
            "custom_checks": {},
            "overall_status": "unknown"
        }
        self.config = self._load_config()
        self.check_interval = self.config.get("check_interval", 60)  # Default: 60 seconds
        self._running = False
        self._thread = None
        self._stop_event = threading.Event()
        self.streamlit_url = self.config.get("streamlit_url", "http://localhost")
        self.streamlit_port = self.config.get("streamlit_port", 8501)  # Default: 8501

    def _load_config(self) -> Dict:
        """
        Load health check configuration from file.

        Returns:
            Dict: The parsed JSON object, or the default configuration when
            the file does not exist, cannot be read/parsed, or does not
            contain a JSON object.
        """
        if not os.path.exists(self.config_path):
            return self._get_default_config()

        try:
            with open(self.config_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception as e:
            # Deliberately best-effort: a broken config must not prevent the
            # dashboard from starting.
            self.logger.warning("Error loading health check config", exc_info=True)
            st.error(f"Error loading health check config: {str(e)}")
            return self._get_default_config()

        if not isinstance(loaded, dict):
            # Every consumer calls .get() on the config, so anything but a
            # JSON object is unusable.
            st.error("Error loading health check config: top-level JSON value must be an object")
            return self._get_default_config()
        return loaded

    def _get_default_config(self) -> Dict:
        """Return default health check configuration."""
        return {
            "check_interval": 60,
            "streamlit_url": "http://localhost",
            "streamlit_port": 8501,
            "system_checks": {
                "cpu": True,
                "memory": True,
                "disk": True
            },
            "dependencies": {
                "api_endpoints": [
                    # Example API endpoint to check
                    {"name": "example_api", "url": "https://httpbin.org/get", "timeout": 5}
                ],
                "databases": [
                    # Example database connection to check
                    {"name": "main_db", "type": "postgres", "connection_string": "..."}
                ]
            },
            "thresholds": {
                "cpu_warning": 70,
                "cpu_critical": 90,
                "memory_warning": 70,
                "memory_critical": 90,
                "disk_warning": 70,
                "disk_critical": 90
            }
        }

    def _config_section(self, name: str) -> Dict:
        """Return config section `name`, or {} when it is missing or not a dict."""
        section = self.config.get(name)
        return section if isinstance(section, dict) else {}

    def _usage_status(self, metric: str, usage_percent: float) -> str:
        """
        Classify a usage percentage against the configured thresholds.

        Args:
            metric: Threshold prefix: "cpu", "memory" or "disk".
            usage_percent: Measured usage in percent.

        Returns:
            "critical", "warning" or "healthy". Missing thresholds default to
            90 (critical) and 70 (warning).
        """
        thresholds = self._config_section("thresholds")
        if usage_percent >= thresholds.get(f"{metric}_critical", 90):
            return "critical"
        if usage_percent >= thresholds.get(f"{metric}_warning", 70):
            return "warning"
        return "healthy"

    def start(self):
        """Start the health check service in a background thread."""
        if self._running:
            return

        self._running = True
        # A fresh event per run: a loop that outlived stop()'s join timeout
        # keeps its own (already set) event and therefore still terminates
        # instead of running alongside the new thread.
        self._stop_event = threading.Event()
        self._thread = threading.Thread(
            target=self._run_checks_periodically,
            args=(self._stop_event,),
            daemon=True,
        )
        self._thread.start()

    def stop(self):
        """Stop the health check service."""
        self._running = False
        self._stop_event.set()  # interrupts the wait between two runs
        if self._thread:
            self._thread.join(timeout=1)

    def _run_checks_periodically(self, stop_event: Optional[threading.Event] = None):
        """
        Run health checks periodically based on check interval.

        Args:
            stop_event: Event that ends this loop when set. Defaults to the
                service's current stop event.
        """
        event = stop_event if stop_event is not None else self._stop_event
        while self._running and not event.is_set():
            try:
                self.run_all_checks()
            except Exception:
                # One failing run must not kill the monitoring thread.
                self.logger.exception("Health check run failed")
            # Interruptible replacement for time.sleep(): returns early on stop().
            event.wait(self.check_interval)

    def run_all_checks(self):
        """Run all configured health checks and update health data."""
        # Update timestamp
        self.health_data["last_updated"] = datetime.now().isoformat()

        # Check Streamlit server
        self.health_data["streamlit_server"] = self.check_streamlit_server()

        # System checks (each one is enabled unless explicitly switched off)
        system_checks = self._config_section("system_checks")
        if system_checks.get("cpu", True):
            self.check_cpu()
        if system_checks.get("memory", True):
            self.check_memory()
        if system_checks.get("disk", True):
            self.check_disk()

        self.check_dependencies()
        self.run_custom_checks()
        self.check_streamlit_pages()
        # Must run last: it aggregates everything recorded above.
        self._update_overall_status()

    def check_cpu(self):
        """
        Measure CPU usage and store it in `health_data["system"]["cpu"]`.

        The measurement blocks for one second (`psutil.cpu_percent(interval=1)`).
        The status is derived from the `cpu_warning` / `cpu_critical` thresholds.
        """
        cpu_percent = psutil.cpu_percent(interval=1)
        self.health_data["system"]["cpu"] = {
            "usage_percent": cpu_percent,
            "status": self._usage_status("cpu", cpu_percent)
        }

    def check_memory(self):
        """
        Read virtual memory statistics and store them in `health_data["system"]["memory"]`.

        Stores total and available memory in GB, the usage percentage and a
        status derived from the `memory_warning` / `memory_critical` thresholds.
        """
        memory = psutil.virtual_memory()
        self.health_data["system"]["memory"] = {
            "total_gb": round(memory.total / (1024**3), 2),
            "available_gb": round(memory.available / (1024**3), 2),
            "usage_percent": memory.percent,
            "status": self._usage_status("memory", memory.percent)
        }

    def check_disk(self):
        """
        Read disk usage of '/' and store it in `health_data["system"]["disk"]`.

        Stores total and free space in GB, the usage percentage and a status
        derived from the `disk_warning` / `disk_critical` thresholds.
        """
        disk = psutil.disk_usage('/')
        self.health_data["system"]["disk"] = {
            "total_gb": round(disk.total / (1024**3), 2),
            "free_gb": round(disk.free / (1024**3), 2),
            "usage_percent": disk.percent,
            "status": self._usage_status("disk", disk.percent)
        }

    def check_dependencies(self):
        """
        Check every configured API endpoint and database.

        Results are written to `health_data["dependencies"]` by the
        per-dependency helpers; a failing endpoint is recorded as
        "critical" rather than raised.
        """
        dependencies = self._config_section("dependencies")

        # Check API endpoints
        for endpoint in dependencies.get("api_endpoints", []):
            self._check_api_endpoint(endpoint)

        # Check database connections
        for db in dependencies.get("databases", []):
            self._check_database(db)

    def _check_api_endpoint(self, endpoint: Dict):
        """
        Check if an API endpoint is accessible.

        Args:
            endpoint: Dictionary with the keys "name", "url" and optionally
                "timeout" (seconds, default 5). Entries without a URL are skipped.
        """
        name = endpoint.get("name", "unknown_api")
        url = endpoint.get("url", "")
        timeout = endpoint.get("timeout", 5)

        if not url:
            return

        try:
            # perf_counter is monotonic, so the latency cannot go negative
            # when the wall clock is adjusted.
            started = time.perf_counter()
            response = requests.get(url, timeout=timeout)
            elapsed = time.perf_counter() - started
        except Exception as e:
            # Any failure to reach the endpoint counts as an unhealthy dependency.
            self.health_data["dependencies"][name] = {
                "type": "api",
                "url": url,
                "status": "critical",
                "error": str(e)
            }
            return

        self.health_data["dependencies"][name] = {
            "type": "api",
            "url": url,
            "status": "healthy" if response.status_code < 400 else "critical",
            "response_time_ms": round(elapsed * 1000, 2),
            "status_code": response.status_code
        }

    def _check_database(self, db_config: Dict):
        """
        Check database connection.

        Note: This is a placeholder that always reports status "unknown".
        Implement the actual connection check for your database.

        Args:
            db_config: Dictionary with database configuration
        """
        name = db_config.get("name", "unknown_db")
        self.health_data["dependencies"][name] = {
            "type": "database",
            "db_type": db_config.get("type", ""),
            "status": "unknown",
            "message": "Database check not implemented"
        }

    def register_custom_check(self, name: str, check_func: Callable[[], Dict[str, Any]]):
        """
        Register a custom health check function.

        Args:
            name: Name of the custom check
            check_func: Function that performs the check and returns a
                dictionary with results (should contain a "status" key).
        """
        custom_checks = self.health_data.setdefault("custom_checks", {})
        custom_checks[name] = {
            "status": "unknown",
            "check_func": check_func
        }

    def run_custom_checks(self):
        """
        Run all registered custom health checks.

        Each entry of `health_data["custom_checks"]` is replaced by a copy of
        the check's result with the function reference re-attached. A check
        that raises, or that does not return a dict, is recorded as "critical".
        """
        custom_checks = self.health_data.get("custom_checks")
        if not custom_checks:
            return

        for name, check_info in list(custom_checks.items()):
            if not isinstance(check_info, dict):
                continue
            func = check_info.get("check_func")
            if not callable(func):
                continue

            try:
                result = func()
                if not isinstance(result, dict):
                    raise TypeError(
                        f"custom check {name!r} must return a dict, got {type(result).__name__}"
                    )
                # Copy so the caller's dict is not modified when the function
                # reference is attached below.
                entry = dict(result)
            except Exception as e:
                entry = {"status": "critical", "error": str(e)}

            # Keep the function so the check can be run again next time.
            entry["check_func"] = func
            custom_checks[name] = entry

    def _update_overall_status(self):
        """
        Derive `health_data["overall_status"]` from all component statuses.

        Components: Streamlit server, system checks, dependencies, custom
        checks and Streamlit pages. Priority:
            1. "critical" if any component is critical
            2. "warning" if any component is warning
            3. "healthy" if at least one component is healthy
            4. "unknown" otherwise (only unknown statuses, or none at all)
        """
        statuses = set()

        def collect(component):
            # Components that are not dicts carry no status.
            if isinstance(component, dict):
                statuses.add(component.get("status"))

        collect(self.health_data.get("streamlit_server", {}))
        for system_check in self.health_data.get("system", {}).values():
            collect(system_check)
        for dep_check in self.health_data.get("dependencies", {}).values():
            collect(dep_check)
        # Custom check entries always carry their "check_func" reference, so
        # they must not be filtered on that key or they would never count.
        for custom_check in self.health_data.get("custom_checks", {}).values():
            collect(custom_check)
        collect(self.health_data.get("streamlit_pages", {}))

        if "critical" in statuses:
            overall = "critical"
        elif "warning" in statuses:
            overall = "warning"
        elif "healthy" in statuses:
            overall = "healthy"
        else:
            overall = "unknown"
        self.health_data["overall_status"] = overall

    def get_health_data(self) -> Dict:
        """
        Get the latest health check data.

        Returns:
            Dict: A shallow copy of `health_data` in which the custom check
            entries are copied without their "check_func" reference.
        """
        result: Dict[str, Any] = {}
        for key, value in self.health_data.items():
            if key == "custom_checks":
                result[key] = {
                    check_name: {k: v for k, v in check_data.items() if k != "check_func"}
                    for check_name, check_data in value.items()
                    if isinstance(check_data, dict)
                }
            else:
                result[key] = value
        return result

    def save_config(self):
        """
        Save `self.config` as JSON to `self.config_path`.

        Shows a success message in the Streamlit app when the file was
        written. Failures are not raised; they are shown with `st.error`
        (missing directory, missing permission, non-serialisable config,
        anything else).
        """
        try:
            # Serialise first: opening with "w" truncates the file, so a
            # failing dump would otherwise destroy the existing config.
            payload = json.dumps(self.config, indent=2)
            with open(self.config_path, "w", encoding="utf-8") as f:
                f.write(payload)
            st.success(f"Health check config saved successfully to {self.config_path}")
        except FileNotFoundError:
            st.error(f"Configuration file not found: {self.config_path}")
        except PermissionError:
            st.error(f"Permission denied: Unable to write to {self.config_path}")
        except (TypeError, ValueError) as e:
            st.error(f"Error encoding health check config as JSON: {str(e)}")
        except Exception as e:
            st.error(f"Error saving health check config: {str(e)}")

    def check_streamlit_pages(self):
        """
        Store the errors collected by `StreamlitPageMonitor` in `health_data["streamlit_pages"]`.

        With errors present the status is "critical" and the overall status
        is set to "critical" as well; otherwise the status is "healthy".
        Note that "error_count" is the number of pages that have errors.
        """
        page_errors = StreamlitPageMonitor.get_page_errors()

        if page_errors:
            self.health_data["streamlit_pages"] = {
                "status": "critical",
                "error_count": len(page_errors),
                "errors": page_errors,
                "details": "Errors detected in Streamlit pages"
            }
            # This affects overall status
            self.health_data["overall_status"] = "critical"
        else:
            self.health_data["streamlit_pages"] = {
                "status": "healthy",
                "error_count": 0,
                "errors": {},
                "details": "All pages functioning normally"
            }

    def check_streamlit_server(self) -> Dict[str, Any]:
        """
        Check the Streamlit server by requesting its /healthz endpoint.

        Returns:
            Dict[str, Any]: For HTTP 200: status "healthy" with
            "response_code", "latency_ms", "message" and "url". Otherwise
            status "critical" with "error", "message" and "url" (plus
            "response_code" when a response was received). "url" is None if
            the URL could not be built.
        """
        # Bound up front so the error handlers can always report it.
        url = None
        try:
            host = self.streamlit_url.rstrip('/')
            if not host.startswith(('http://', 'https://')):
                host = f"http://{host}"

            # NOTE(review): newer Streamlit releases appear to serve the
            # health endpoint at /_stcore/health - confirm for your version.
            url = f"{host}:{self.streamlit_port}/healthz"
            self.logger.info("Checking Streamlit server health at: %s", url)

            started = time.perf_counter()
            response = requests.get(url, timeout=3)
            total_time = (time.perf_counter() - started) * 1000
            self.logger.info("%s - %s", response.status_code, response.text)

            if response.status_code == 200:
                self.logger.info(
                    "Streamlit server healthy - Response time: %sms", round(total_time, 2)
                )
                return {
                    "status": "healthy",
                    "response_code": response.status_code,
                    "latency_ms": round(total_time, 2),
                    "message": "Streamlit server is running",
                    "url": url
                }

            self.logger.warning("Unhealthy response from server: %s", response.status_code)
            return {
                "status": "critical",
                "response_code": response.status_code,
                "error": f"Unhealthy response from server: {response.status_code}",
                "message": "Streamlit server is not healthy",
                "url": url
            }

        # Timeout is handled first: requests' ConnectTimeout derives from both
        # ConnectionError and Timeout and should be reported as a timeout.
        except requests.exceptions.Timeout as e:
            self.logger.error("Timeout while checking Streamlit server: %s", e)
            return {
                "status": "critical",
                "error": f"Timeout error: {str(e)}",
                "message": "Streamlit server is not responding",
                "url": url
            }
        except requests.exceptions.ConnectionError as e:
            self.logger.error("Connection error while checking Streamlit server: %s", e)
            return {
                "status": "critical",
                "error": f"Connection error: {str(e)}",
                "message": "Cannot connect to Streamlit server",
                "url": url
            }
        except Exception as e:
            self.logger.error("Unexpected error while checking Streamlit server: %s", e)
            return {
                "status": "critical",
                "error": f"Unknown error: {str(e)}",
                "message": "Failed to check Streamlit server",
                "url": url
            }

health_check(config_path:str = "health_check_config.json"): 821 """ 822 Displays an interactive Streamlit dashboard for monitoring application health. 823 This function initializes and manages a health check service, presenting real-time system metrics, 824 dependency statuses, custom checks, and Streamlit page health in a user-friendly dashboard. 825 Users can manually refresh health checks, view detailed error information, and adjust configuration 826 thresholds and intervals directly from the UI. 827 Args: 828 config_path (str, optional): Path to the health check configuration JSON file. 829 Defaults to "health_check_config.json". 830 Features: 831 - Displays overall health status with color-coded indicators. 832 - Shows last updated timestamp for health data. 833 - Monitors Streamlit server status, latency, and errors. 834 - Provides tabs for: 835 * System Resources (CPU, Memory, Disk usage and status) 836 * Dependencies (external services and their health) 837 * Custom Checks (user-defined health checks) 838 * Streamlit Pages (page-specific errors and status) 839 - Allows configuration of system thresholds, check intervals, and Streamlit server settings. 840 - Supports manual refresh and saving configuration changes. 841 Raises: 842 Displays error messages in the UI for any exceptions encountered during health data retrieval or processing. 843 Returns: 844 None. The dashboard is rendered in the Streamlit app. 
845 """ 846 847 logger = logging.getLogger(f"{__name__}.health_check") 848 logger.info("Starting health check dashboard") 849 st.title("Application Health Dashboard") 850 851 # Initialize or get the health check service 852 if "health_service" not in st.session_state: 853 logger.info("Initializing new health check service") 854 st.session_state.health_service = HealthCheckService(config_path = config_path) 855 st.session_state.health_service.start() 856 857 health_service = st.session_state.health_service 858 859 # Add controls for manual refresh and configuration 860 col1, col2 = st.columns([3, 1]) 861 with col1: 862 st.subheader("System Health Status") 863 with col2: 864 if st.button("Refresh Now"): 865 health_service.run_all_checks() 866 867 # Get the latest health data 868 health_data = health_service.get_health_data() 869 870 # Display overall status with appropriate color 871 overall_status = health_data.get("overall_status", "unknown") 872 status_color = { 873 "healthy": "green", 874 "warning": "orange", 875 "critical": "red", 876 "unknown": "gray" 877 }.get(overall_status, "gray") 878 879 st.markdown( 880 f"<h3 style='color: {status_color};'>Overall Status: {overall_status.upper()}</h3>", 881 unsafe_allow_html=True 882 ) 883 884 # Display last updated time 885 if health_data.get("last_updated"): 886 try: 887 last_updated = datetime.fromisoformat(health_data["last_updated"]) 888 st.text(f"Last updated: {last_updated.strftime('%Y-%m-%d %H:%M:%S')}") 889 except Exception as e: 890 st.error(f"Last updated: {health_data['last_updated']}") 891 st.exception(e) 892 893 server_health = health_data.get("streamlit_server", {}) 894 server_status = server_health.get("status", "unknown") 895 server_color = { 896 "healthy": "green", 897 "critical": "red", 898 "unknown": "gray" 899 }.get(server_status, "gray") 900 901 st.markdown( 902 f"### Streamlit Server Status: <span style='color: {server_color}'>{server_status.upper()}</span>", 903 unsafe_allow_html=True 904 ) 905 906 
if server_status != "healthy": 907 st.error(server_health.get("message", "Server status unknown")) 908 if "error" in server_health: 909 st.code(server_health["error"]) 910 else: 911 st.success(server_health.get("message", "Server is running")) 912 if "latency_ms" in server_health: 913 latency = server_health["latency_ms"] 914 # Define color based on latency thresholds 915 if latency <= 50: 916 latency_color = "green" 917 performance = "Excellent" 918 elif latency <= 100: 919 latency_color = "blue" 920 performance = "Good" 921 elif latency <= 200: 922 latency_color = "orange" 923 performance = "Fair" 924 else: 925 latency_color = "red" 926 performance = "Poor" 927 928 st.markdown( 929 f""" 930 <div style='display: flex; align-items: center; gap: 10px;'> 931 <div>Server Response Time:</div> 932 <div style='color: {latency_color}; font-weight: bold;'> 933 {latency} ms 934 </div> 935 <div style='color: {latency_color};'> 936 ({performance}) 937 </div> 938 </div> 939 """, 940 unsafe_allow_html=True 941 ) 942 943 # Create tabs for different categories of health checks 944 tab1, tab2, tab3, tab4 = st.tabs(["System Resources", "Dependencies", "Custom Checks", "Streamlit Pages"]) 945 946 with tab1: 947 # Display system health checks 948 system_data = health_data.get("system", {}) 949 950 # CPU 951 if "cpu" in system_data: 952 cpu_data = system_data["cpu"] 953 cpu_status = cpu_data.get("status", "unknown") 954 cpu_color = {"healthy": "green", "warning": "orange", "critical": "red"}.get(cpu_status, "gray") 955 956 st.markdown(f"### CPU Status: <span style='color:{cpu_color}'>{cpu_status.upper()}</span>", unsafe_allow_html=True) 957 st.progress(cpu_data.get("usage_percent", 0) / 100) 958 st.text(f"CPU Usage: {cpu_data.get('usage_percent', 0)}%") 959 960 # Memory 961 if "memory" in system_data: 962 memory_data = system_data["memory"] 963 memory_status = memory_data.get("status", "unknown") 964 memory_color = {"healthy": "green", "warning": "orange", "critical": 
"red"}.get(memory_status, "gray") 965 966 st.markdown(f"### Memory Status: <span style='color:{memory_color}'>{memory_status.upper()}</span>", unsafe_allow_html=True) 967 st.progress(memory_data.get("usage_percent", 0) / 100) 968 st.text(f"Memory Usage: {memory_data.get('usage_percent', 0)}%") 969 st.text(f"Total Memory: {memory_data.get('total_gb', 0)} GB") 970 st.text(f"Available Memory: {memory_data.get('available_gb', 0)} GB") 971 972 # Disk 973 if "disk" in system_data: 974 disk_data = system_data["disk"] 975 disk_status = disk_data.get("status", "unknown") 976 disk_color = {"healthy": "green", "warning": "orange", "critical": "red"}.get(disk_status, "gray") 977 978 st.markdown(f"### Disk Status: <span style='color:{disk_color}'>{disk_status.upper()}</span>", unsafe_allow_html=True) 979 st.progress(disk_data.get("usage_percent", 0) / 100) 980 st.text(f"Disk Usage: {disk_data.get('usage_percent', 0)}%") 981 st.text(f"Total Disk Space: {disk_data.get('total_gb', 0)} GB") 982 st.text(f"Free Disk Space: {disk_data.get('free_gb', 0)} GB") 983 984 with tab2: 985 # Display dependency health checks 986 dependencies = health_data.get("dependencies", {}) 987 if dependencies: 988 # Create a dataframe for all dependencies 989 dep_data = [] 990 for name, dep_info in dependencies.items(): 991 dep_data.append({ 992 "Name": name, 993 "Type": dep_info.get("type", "unknown"), 994 "Status": dep_info.get("status", "unknown"), 995 "Details": ", ".join([f"{k}: {v}" for k, v in dep_info.items() 996 if k not in ["name", "type", "status", "error"] and not isinstance(v, dict)]) 997 }) 998 999 if dep_data: 1000 df = pd.DataFrame(dep_data) 1001 1002 # Apply color formatting to status column 1003 def color_status(val): 1004 colors = { 1005 "healthy": "background-color: #c6efce; color: #006100", 1006 "warning": "background-color: #ffeb9c; color: #9c5700", 1007 "critical": "background-color: #ffc7ce; color: #9c0006", 1008 "unknown": "background-color: #eeeeee; color: #7f7f7f" 1009 } 1010 
return colors.get(val.lower(), "") 1011 1012 st.dataframe(df.style.map(color_status, subset=["Status"])) 1013 else: 1014 st.info("No dependencies configured") 1015 else: 1016 st.info("No dependencies configured") 1017 1018 with tab3: 1019 # Display custom checks 1020 custom_checks = health_data.get("custom_checks", {}) 1021 if custom_checks: 1022 # Create a dataframe for all custom checks 1023 check_data = [] 1024 for name, check_info in custom_checks.items(): 1025 if isinstance(check_info, dict) and "check_func" not in check_info: 1026 check_data.append({ 1027 "Name": name, 1028 "Status": check_info.get("status", "unknown"), 1029 "Details": ", ".join([f"{k}: {v}" for k, v in check_info.items() 1030 if k not in ["name", "status", "check_func", "error"] and not isinstance(v, dict)]), 1031 "Error": check_info.get("error", "") 1032 }) 1033 1034 if check_data: 1035 df = pd.DataFrame(check_data) 1036 1037 # Apply color formatting to status column 1038 def color_status(val): 1039 colors = { 1040 "healthy": "background-color: #c6efce; color: #006100", 1041 "warning": "background-color: #ffeb9c; color: #9c5700", 1042 "critical": "background-color: #ffc7ce; color: #9c0006", 1043 "unknown": "background-color: #eeeeee; color: #7f7f7f" 1044 } 1045 return colors.get(val.lower(), "") 1046 1047 st.dataframe(df.style.map(color_status, subset=["Status"])) 1048 else: 1049 st.info("No custom checks configured") 1050 else: 1051 st.info("No custom checks configured") 1052 with tab4: 1053 page_health = health_data.get("streamlit_pages", {}) 1054 status = page_health.get("status", "unknown") 1055 error_count = page_health.get("error_count", 0) 1056 status_color = { 1057 "healthy": "green", 1058 "critical": "red", 1059 "unknown": "gray" 1060 }.get(status, "gray") 1061 1062 st.markdown(f"### Page Status: <span style='color:{status_color}'>{status.upper()}</span>", unsafe_allow_html=True) 1063 st.metric("Error Count", error_count) 1064 if error_count > 0: 1065 st.error("Pages with errors:") 
1066 errors_dict = page_health.get("errors", {}) 1067 1068 if not isinstance(errors_dict, dict): 1069 st.error("Invalid error data format") 1070 return 1071 1072 for page_name, page_errors in errors_dict.items(): 1073 # Create a meaningful page name for display 1074 display_name = page_name.split("/")[-1] if "/" in page_name else page_name 1075 1076 for error_info in page_errors: 1077 if isinstance(error_info, dict): 1078 with st.expander(f"Error in {display_name}"): 1079 # Display error message without the "Streamlit Error:" prefix 1080 st.error(error_info.get('error', 'Unknown error')) 1081 1082 # Show additional error details 1083 if error_info.get('type') == 'streamlit_error': 1084 st.text("Type: Streamlit Error") 1085 else: 1086 st.text("Type: Exception") 1087 1088 st.text("Traceback:") 1089 st.code("".join(error_info.get('traceback', ['No traceback available']))) 1090 st.text(f"Timestamp: {error_info.get('timestamp', 'No timestamp')}") 1091 1092 # Configuration section 1093 with st.expander("Health Check Configuration"): 1094 st.subheader("System Check Thresholds") 1095 1096 col1, col2 = st.columns(2) 1097 with col1: 1098 cpu_warning = st.slider("CPU Warning Threshold (%)", 1099 min_value=10, max_value=90, 1100 value=health_service.config["thresholds"].get("cpu_warning", 70), 1101 step=5) 1102 memory_warning = st.slider("Memory Warning Threshold (%)", 1103 min_value=10, max_value=90, 1104 value=health_service.config["thresholds"].get("memory_warning", 70), 1105 step=5) 1106 disk_warning = st.slider("Disk Warning Threshold (%)", 1107 min_value=10, max_value=90, 1108 value=health_service.config["thresholds"].get("disk_warning", 70), 1109 step=5) 1110 streamlit_url_update = st.text_input( 1111 "Streamlit Server URL", 1112 value=health_service.config.get("streamlit_url", "http://localhost") 1113 ) 1114 1115 with col2: 1116 cpu_critical = st.slider("CPU Critical Threshold (%)", 1117 min_value=20, max_value=95, 1118 
value=health_service.config["thresholds"].get("cpu_critical", 90), 1119 step=5) 1120 memory_critical = st.slider("Memory Critical Threshold (%)", 1121 min_value=20, max_value=95, 1122 value=health_service.config["thresholds"].get("memory_critical", 90), 1123 step=5) 1124 disk_critical = st.slider("Disk Critical Threshold (%)", 1125 min_value=20, max_value=95, 1126 value=health_service.config["thresholds"].get("disk_critical", 90), 1127 step=5) 1128 1129 check_interval = st.slider("Check Interval (seconds)", 1130 min_value=10, max_value=300, 1131 value=health_service.config.get("check_interval", 60), 1132 step=10) 1133 streamlit_port_update = st.number_input( 1134 "Streamlit Server Port", 1135 value=health_service.config.get("streamlit_port", 8501), 1136 step=1 1137 ) 1138 1139 if st.button("Save Configuration"): 1140 # Update configuration 1141 health_service.config["thresholds"]["cpu_warning"] = cpu_warning 1142 health_service.config["thresholds"]["cpu_critical"] = cpu_critical 1143 health_service.config["thresholds"]["memory_warning"] = memory_warning 1144 health_service.config["thresholds"]["memory_critical"] = memory_critical 1145 health_service.config["thresholds"]["disk_warning"] = disk_warning 1146 health_service.config["thresholds"]["disk_critical"] = disk_critical 1147 health_service.config["check_interval"] = check_interval 1148 health_service.config["streamlit_url"] = streamlit_url_update 1149 health_service.config["streamlit_port"] = streamlit_port_update 1150 1151 # Save to file 1152 health_service.save_config() 1153 st.success("Configuration saved successfully") 1154 1155 # Restart the service if interval changed 1156 health_service.stop() 1157 health_service.start()
class StreamlitPageMonitor:
    """
    Singleton class to monitor and record errors and exceptions occurring in Streamlit pages.
    This class monkey-patches `st.error` to capture error messages and provides decorators and methods
    to track exceptions and errors per Streamlit page. Errors are stored in a class-level dictionary
    and can be retrieved or cleared as needed.
    Attributes:
        _instance (StreamlitPageMonitor): Singleton instance of the monitor.
        _errors (Dict[str, List[Dict[str, Any]]]): Dictionary mapping page names to lists of error records.
        _st_error (Callable): Original `st.error` function before monkey-patching.
        _current_page (Optional[str]): Name of the current page being monitored, or None
            when no page context has been set yet.
    Methods:
        __new__(cls):
            Ensures singleton behavior and monkey-patches `st.error` to record error messages.
        _handle_st_error(cls, error_message: str):
            Records a Streamlit error message for the current page.
        set_page_context(cls, page_name: str):
            Sets the current page context for error recording.
        monitor_page(cls, page_name: str):
            Decorator to monitor a Streamlit page for exceptions and `st.error` calls.
            Records exceptions and errors under the specified page name.
        get_page_errors(cls):
            Retrieves all recorded errors for all pages, grouped by page name.
        clear_errors(cls, page_name: Optional[str] = None):
            Clears recorded errors for a specific page or all pages.
    """

    _instance = None
    _errors: Dict[str, List[Dict[str, Any]]] = {}
    # Captured at class-definition time so the original function stays reachable
    # after `st.error` has been replaced by the recording wrapper.
    _st_error = st.error
    _current_page = None

    def __new__(cls):
        """
        Return the singleton instance, creating it on first use.
        The first call also replaces `st.error` with a wrapper that records every
        message under the current page before delegating to the original function.
        Returns:
            StreamlitPageMonitor: The shared monitor instance.
        """
        if cls._instance is None:
            cls._instance = super(StreamlitPageMonitor, cls).__new__(cls)

            # Monkey patch st.error to capture error messages
            def patched_error(*args, **kwargs):
                error_message = " ".join(str(arg) for arg in args)

                # Normalise the page first so the stored 'page' field always
                # matches the key the record is filed under (never None).
                current_page = cls._current_page
                if current_page is None:
                    current_page = "unknown_page"

                error_info = {
                    'error': error_message,
                    'traceback': traceback.format_stack(),
                    'timestamp': datetime.now().isoformat(),
                    'status': 'critical',
                    'type': 'streamlit_error',
                    'page': current_page
                }
                cls._errors.setdefault(current_page, []).append(error_info)

                # Call original st.error
                return cls._st_error(*args, **kwargs)

            st.error = patched_error
        return cls._instance

    @classmethod
    def _handle_st_error(cls, error_message: str):
        """
        Handles Streamlit-specific errors by recording error details for the current page.
        Args:
            error_message (str): The error message to be logged.
        Side Effects:
            Updates the class-level _errors dictionary with error information for the current Streamlit page.
        Error Information Stored:
            - error: Formatted error message (prefixed with "Streamlit Error: ").
            - traceback: Stack trace at the point of error.
            - timestamp: Time when the error occurred (ISO format).
            - status: Error severity ('critical').
            - type: Error type ('streamlit_error').
        """

        # The page context is kept on this class by `set_page_context`. The
        # lookup on the `st` module is retained only as a backward-compatible
        # fallback; on its own it always resolved to 'unknown_page' unless
        # something outside this class set that attribute.
        current_page = (
            cls._current_page
            or getattr(st, '_current_page', None)
            or 'unknown_page'
        )

        error_info = {
            'error': f"Streamlit Error: {error_message}",
            'traceback': traceback.format_stack(),
            'timestamp': datetime.now().isoformat(),
            'status': 'critical',
            'type': 'streamlit_error'
        }

        # Create the page's list on first use, then add the new error
        cls._errors.setdefault(current_page, []).append(error_info)

    @classmethod
    def set_page_context(cls, page_name: str):
        """Set the current page context"""
        cls._current_page = page_name

    @classmethod
    def monitor_page(cls, page_name: str):
        """
        Decorator to monitor and log exceptions for a specific Streamlit page.
        Args:
            page_name (str): The name of the page to monitor.
        Returns:
            Callable: A decorator that wraps the target function, sets the page context,
            clears previous non-Streamlit errors, and logs any exceptions that occur during execution.
        The decorator performs the following actions:
            - Sets the current page context using `cls.set_page_context`.
            - Clears previous exception errors for the page, retaining only those marked as 'streamlit_error'.
            - Executes the wrapped function.
            - If an exception occurs, logs detailed error information (error message, traceback, timestamp, status, type, and page)
              to `cls._errors` under the given page name, then re-raises the exception.
        """

        def decorator(func):
            """
            Wrap `func` so that it runs with `page_name` as the active page context.
            Args:
                func (Callable): The function to be decorated.
            Returns:
                Callable: The wrapped function with error handling and context management.
            """

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                # Set the current page context
                cls.set_page_context(page_name)
                try:
                    # Clear previous exception errors but keep st.error calls
                    if page_name in cls._errors:
                        cls._errors[page_name] = [
                            e for e in cls._errors[page_name]
                            if e.get('type') == 'streamlit_error'
                        ]
                    return func(*args, **kwargs)
                except Exception as e:
                    error_info = {
                        'error': str(e),
                        'traceback': traceback.format_exc(),
                        'timestamp': datetime.now().isoformat(),
                        'status': 'critical',
                        'type': 'exception',
                        'page': page_name
                    }
                    cls._errors.setdefault(page_name, []).append(error_info)
                    # Recording must not hide the failure from the caller.
                    raise
            return wrapper
        return decorator

    @classmethod
    def get_page_errors(cls):
        """
        Collects and returns errors for each page that has recorded errors.
        Iterates through the internal `_errors` dictionary, and for each page with errors,
        constructs a list of error details including the error message, traceback, timestamp,
        and error type.
        Returns:
            dict: A dictionary where keys are page names and values are lists of error details.
            Each error detail is a dictionary with the following keys:
                - 'error' (str): The error message or 'Unknown error' if not present.
                - 'traceback' (list or str): The traceback information (a list of stack
                  entries for `st.error` records, a formatted string for exception
                  records) or empty list if not present.
                - 'timestamp' (str): The timestamp of the error or empty string if not present.
                - 'type' (str): The type of error or 'unknown' if not present.
        """

        result = {}
        for page, errors in cls._errors.items():
            if errors:  # Only include pages with errors
                result[page] = [
                    {
                        'error': err.get('error', 'Unknown error'),
                        'traceback': err.get('traceback', []),
                        'timestamp': err.get('timestamp', ''),
                        'type': err.get('type', 'unknown')
                    }
                    for err in errors
                ]
        return result

    @classmethod
    def clear_errors(cls, page_name: Optional[str] = None):
        """
        Clear errors for a specific page or all pages.
        Args:
            page_name (Optional[str]): Page whose errors should be dropped. When omitted
                (or otherwise falsy) the records of every page are discarded.
        """
        if page_name:
            if page_name in cls._errors:
                del cls._errors[page_name]
        else:
            cls._errors = {}
Singleton class to monitor and record errors and exceptions occurring in Streamlit pages.
This class monkey-patches st.error to capture error messages and provides decorators and methods
to track exceptions and errors per Streamlit page. Errors are stored in a class-level dictionary
and can be retrieved or cleared as needed.
Attributes:
_instance (StreamlitPageMonitor): Singleton instance of the monitor.
_errors (Dict[str, List[Dict[str, Any]]]): Dictionary mapping page names to lists of error records.
_st_error (Callable): Original st.error function before monkey-patching.
_current_page (str): Name of the current page being monitored.
Methods:
__new__(cls):
Ensures singleton behavior and monkey-patches st.error to record error messages.
_handle_st_error(cls, error_message: str):
Handles calls to st.error and records error information for the current page.
set_page_context(cls, page_name: str):
Sets the current page context for error recording.
monitor_page(cls, page_name: str):
Decorator to monitor a Streamlit page for exceptions and st.error calls.
Records exceptions and errors under the specified page name.
get_page_errors(cls):
Retrieves all recorded errors for all pages, grouped by page name.
clear_errors(cls, page_name: Optional[str] = None):
Clears recorded errors for a specific page or all pages.
@classmethod
def set_page_context(cls, page_name: str):
    """
    Remember which page is currently being rendered.
    Args:
        page_name (str): Name stored in the class-level `_current_page` attribute.
    """
    cls._current_page = page_name
Set the current page context
@classmethod
def monitor_page(cls, page_name: str):
    """
    Build a decorator that tracks exceptions raised while rendering one page.
    Args:
        page_name (str): Key under which this page's errors are stored in `cls._errors`.
    Returns:
        Callable: A decorator. Each call of the decorated function:
            - sets the page context through `cls.set_page_context`;
            - drops earlier records for the page unless their type is 'streamlit_error';
            - runs the wrapped function and returns its result;
            - on an exception, appends a record (error, traceback, timestamp, status,
              type, page) to `cls._errors[page_name]` and re-raises.
    """

    def decorator(func):
        """
        Wrap `func` with page-context setup and exception recording.
        Args:
            func (Callable): The page function to monitor.
        Returns:
            Callable: The monitored function; `functools.wraps` keeps the metadata of `func`.
        """

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            cls.set_page_context(page_name)
            try:
                # Stale exception records go; recorded st.error calls stay.
                if page_name in cls._errors:
                    kept = []
                    for record in cls._errors[page_name]:
                        if record.get('type') == 'streamlit_error':
                            kept.append(record)
                    cls._errors[page_name] = kept
                return func(*args, **kwargs)
            except Exception as exc:
                record = {
                    'error': str(exc),
                    'traceback': traceback.format_exc(),
                    'timestamp': datetime.now().isoformat(),
                    'status': 'critical',
                    'type': 'exception',
                    'page': page_name
                }
                cls._errors.setdefault(page_name, []).append(record)
                raise

        return wrapper

    return decorator
Decorator to monitor and log exceptions for a specific Streamlit page.
Args:
page_name (str): The name of the page to monitor.
Returns:
Callable: A decorator that wraps the target function, sets the page context,
clears previous non-Streamlit errors, and logs any exceptions that occur during execution.
The decorator performs the following actions:
- Sets the current page context using cls.set_page_context.
- Clears previous exception errors for the page, retaining only those marked as 'streamlit_error'.
- Executes the wrapped function.
- If an exception occurs, logs detailed error information (error message, traceback, timestamp, status, type, and page)
to cls._errors under the given page name, then re-raises the exception.
@classmethod
def get_page_errors(cls):
    """
    Return the recorded errors grouped by page, skipping pages with none.
    Returns:
        dict: Maps each page name to a list of new dictionaries, one per record, with keys:
            - 'error' (str): The error message, or 'Unknown error' if missing.
            - 'traceback' (list or str): The stored traceback (a list for `st.error`
              records, a formatted string for exception records), or an empty list if missing.
            - 'timestamp' (str): The stored timestamp, or an empty string if missing.
            - 'type' (str): The stored error type, or 'unknown' if missing.
    """

    def summarise(record):
        # Built per record so every missing traceback gets its own empty list.
        return {
            'error': record.get('error', 'Unknown error'),
            'traceback': record.get('traceback', []),
            'timestamp': record.get('timestamp', ''),
            'type': record.get('type', 'unknown')
        }

    return {
        page: [summarise(record) for record in records]
        for page, records in cls._errors.items()
        if records
    }
Collects and returns errors for each page that has recorded errors.
Iterates through the internal _errors dictionary, and for each page with errors,
constructs a list of error details including the error message, traceback, timestamp,
and error type.
Returns:
dict: A dictionary where keys are page names and values are lists of error details.
Each error detail is a dictionary with the following keys:
- 'error' (str): The error message or 'Unknown error' if not present.
- 'traceback' (list or str): The traceback information (a list of stack entries for st.error records, a formatted string for exception records) or empty list if not present.
- 'timestamp' (str): The timestamp of the error or empty string if not present.
- 'type' (str): The type of error or 'unknown' if not present.
@classmethod
def clear_errors(cls, page_name: Optional[str] = None):
    """
    Discard recorded errors for one page, or for every page.
    Args:
        page_name (Optional[str]): Page whose records are removed. Any falsy value
            (the default None included) resets the whole error store.
    """
    if not page_name:
        # Rebind rather than mutate: the store is replaced with a fresh dict.
        cls._errors = {}
        return
    cls._errors.pop(page_name, None)
Clear errors for a specific page or all pages
230class HealthCheckService: 231 """ 232 HealthCheckService provides a comprehensive health monitoring solution for Streamlit applications. 233 It periodically checks system resources, external dependencies, custom health checks, and Streamlit server/page status, 234 updating and reporting the overall health status. 235 Attributes: 236 logger (logging.Logger): Logger for health check events. 237 config_path (str): Path to the health check configuration file. 238 health_data (Dict[str, Any]): Stores the latest health check results. 239 config (Dict): Loaded health check configuration. 240 check_interval (int): Interval (in seconds) between health checks. 241 _running (bool): Indicates if the health check service is running. 242 _thread (threading.Thread): Background thread for periodic checks. 243 streamlit_url (str): URL of the Streamlit server. 244 streamlit_port (int): Port of the Streamlit server. 245 Methods: 246 __init__(config_path: str = "health_check_config.json"): 247 Initializes the HealthCheckService with configuration and default health data. 248 _load_config() -> Dict: 249 Loads health check configuration from file or returns default configuration. 250 _get_default_config() -> Dict: 251 Returns the default health check configuration. 252 start(): 253 Starts the health check service in a background thread. 254 stop(): 255 Stops the health check service. 256 _run_checks_periodically(): 257 Runs health checks periodically based on the configured interval. 258 run_all_checks(): 259 Executes all configured health checks and updates health data. 260 check_cpu(): 261 Checks CPU usage and updates health data. 262 check_memory(): 263 Checks memory usage and updates health data. 264 check_disk(): 265 Checks disk usage and updates health data. 266 check_dependencies(): 267 Checks external dependencies such as APIs and databases. 268 _check_api_endpoint(endpoint: Dict): 269 Checks if an API endpoint is accessible and updates health data. 
270 _check_database(db_config: Dict): 271 Checks database connection (placeholder for actual implementation). 272 register_custom_check(name: str, check_func: Callable[[], Dict[str, Any]]): 273 Registers a custom health check function. 274 run_custom_checks(): 275 Executes all registered custom health checks. 276 _update_overall_status(): 277 Updates the overall health status based on individual checks. 278 get_health_data() -> Dict: 279 Returns the latest health check data, excluding function references. 280 save_config(): 281 Saves the current configuration to file. 282 check_streamlit_pages(): 283 Checks for errors in Streamlit pages and updates health data. 284 check_streamlit_server() -> Dict[str, Any]: 285 Checks if the Streamlit server is running and responding. 286 """ 287 def __init__(self, config_path: str = "health_check_config.json"): 288 """ 289 Initializes the HealthCheckService instance. 290 Args: 291 config_path (str): Path to the health check configuration file. Defaults to "health_check_config.json". 292 Attributes: 293 logger (logging.Logger): Logger for the HealthCheckService. 294 config_path (str): Path to the configuration file. 295 health_data (Dict[str, Any]): Dictionary storing health check data. 296 config (dict): Loaded configuration from the config file. 297 check_interval (int): Interval in seconds between health checks. Defaults to 60. 298 _running (bool): Indicates if the health check service is running. 299 _thread (threading.Thread or None): Thread running the health check loop. 300 streamlit_url (str): URL of the Streamlit service. Defaults to "http://localhost". 301 streamlit_port (int): Port of the Streamlit service. Defaults to 8501. 
302 """ 303 self.logger = logging.getLogger(f"{__name__}.HealthCheckService") 304 self.logger.info("Initializing HealthCheckService") 305 self.config_path = config_path 306 self.health_data: Dict[str, Any] = { 307 "last_updated": None, 308 "system": {}, 309 "dependencies": {}, 310 "custom_checks": {}, 311 "overall_status": "unknown" 312 } 313 self.config = self._load_config() 314 self.check_interval = self.config.get("check_interval", 60) # Default: 60 seconds 315 self._running = False 316 self._thread = None 317 self.streamlit_url = self.config.get("streamlit_url", "http://localhost") 318 self.streamlit_port = self.config.get("streamlit_port", 8501) # Default: 8501 319 def _load_config(self) -> Dict: 320 """Load health check configuration from file.""" 321 if os.path.exists(self.config_path): 322 try: 323 with open(self.config_path, "r") as f: 324 return json.load(f) 325 except Exception as e: 326 st.error(f"Error loading health check config: {str(e)}") 327 return self._get_default_config() 328 else: 329 return self._get_default_config() 330 331 def _get_default_config(self) -> Dict: 332 """Return default health check configuration.""" 333 return { 334 "check_interval": 60, 335 "streamlit_url": "http://localhost", 336 "streamlit_port": 8501, 337 "system_checks": { 338 "cpu": True, 339 "memory": True, 340 "disk": True 341 }, 342 "dependencies": { 343 "api_endpoints": [ 344 # Example API endpoint to check 345 {"name": "example_api", "url": "https://httpbin.org/get", "timeout": 5} 346 ], 347 "databases": [ 348 # Example database connection to check 349 {"name": "main_db", "type": "postgres", "connection_string": "..."} 350 ] 351 }, 352 "thresholds": { 353 "cpu_warning": 70, 354 "cpu_critical": 90, 355 "memory_warning": 70, 356 "memory_critical": 90, 357 "disk_warning": 70, 358 "disk_critical": 90 359 } 360 } 361 362 def start(self): 363 """Start the health check service in a background thread.""" 364 if self._running: 365 return 366 367 self._running = True 368 
self._thread = threading.Thread(target=self._run_checks_periodically, daemon=True) 369 self._thread.start() 370 371 def stop(self): 372 """Stop the health check service.""" 373 self._running = False 374 if self._thread: 375 self._thread.join(timeout=1) 376 377 def _run_checks_periodically(self): 378 """Run health checks periodically based on check interval.""" 379 while self._running: 380 self.run_all_checks() 381 time.sleep(self.check_interval) 382 383 def run_all_checks(self): 384 """Run all configured health checks and update health data.""" 385 # Update timestamp 386 self.health_data["last_updated"] = datetime.now().isoformat() 387 388 # Check Streamlit server 389 self.health_data["streamlit_server"] = self.check_streamlit_server() 390 391 # System checks 392 if self.config["system_checks"].get("cpu", True): 393 self.check_cpu() 394 if self.config["system_checks"].get("memory", True): 395 self.check_memory() 396 if self.config["system_checks"].get("disk", True): 397 self.check_disk() 398 399 # Rest of the existing checks... 400 self.check_dependencies() 401 self.run_custom_checks() 402 self.check_streamlit_pages() 403 self._update_overall_status() 404 405 def check_cpu(self): 406 """ 407 Checks the current CPU usage and updates the health status based on configured thresholds. 408 Measures the CPU usage percentage over a 1-second interval using psutil. Compares the result 409 against warning and critical thresholds defined in the configuration. Sets the status to 410 'healthy', 'warning', or 'critical' accordingly, and updates the health data dictionary. 
411 Returns: 412 None 413 """ 414 415 cpu_percent = psutil.cpu_percent(interval=1) 416 warning_threshold = self.config["thresholds"].get("cpu_warning", 70) 417 critical_threshold = self.config["thresholds"].get("cpu_critical", 90) 418 419 status = "healthy" 420 if cpu_percent >= critical_threshold: 421 status = "critical" 422 elif cpu_percent >= warning_threshold: 423 status = "warning" 424 425 self.health_data["system"]["cpu"] = { 426 "usage_percent": cpu_percent, 427 "status": status 428 } 429 430 def check_memory(self): 431 """ 432 Checks the system's memory usage and updates the health status accordingly. 433 Retrieves the current memory usage statistics using psutil, compares the usage percentage 434 against configured warning and critical thresholds, and sets the memory status to 'healthy', 435 'warning', or 'critical'. Updates the health_data dictionary with total memory, available memory, 436 usage percentage, and status. 437 Returns: 438 None 439 """ 440 441 memory = psutil.virtual_memory() 442 memory_percent = memory.percent 443 warning_threshold = self.config["thresholds"].get("memory_warning", 70) 444 critical_threshold = self.config["thresholds"].get("memory_critical", 90) 445 446 status = "healthy" 447 if memory_percent >= critical_threshold: 448 status = "critical" 449 elif memory_percent >= warning_threshold: 450 status = "warning" 451 452 self.health_data["system"]["memory"] = { 453 "total_gb": round(memory.total / (1024**3), 2), 454 "available_gb": round(memory.available / (1024**3), 2), 455 "usage_percent": memory_percent, 456 "status": status 457 } 458 459 def check_disk(self): 460 """ 461 Checks the disk usage of the root filesystem and updates the health status. 462 Retrieves disk usage statistics using psutil, compares the usage percentage 463 against configured warning and critical thresholds, and sets the disk status 464 accordingly ("healthy", "warning", or "critical"). 
Updates the health_data 465 dictionary with total disk size, free space, usage percentage, and status. 466 Returns: 467 None 468 """ 469 470 disk = psutil.disk_usage('/') 471 disk_percent = disk.percent 472 warning_threshold = self.config["thresholds"].get("disk_warning", 70) 473 critical_threshold = self.config["thresholds"].get("disk_critical", 90) 474 475 status = "healthy" 476 if disk_percent >= critical_threshold: 477 status = "critical" 478 elif disk_percent >= warning_threshold: 479 status = "warning" 480 481 self.health_data["system"]["disk"] = { 482 "total_gb": round(disk.total / (1024**3), 2), 483 "free_gb": round(disk.free / (1024**3), 2), 484 "usage_percent": disk_percent, 485 "status": status 486 } 487 488 def check_dependencies(self): 489 """ 490 Checks the health of configured dependencies, including API endpoints and databases. 491 Iterates through the list of API endpoints and databases specified in the configuration, 492 and performs health checks on each by invoking the corresponding internal methods. 493 Raises: 494 Exception: If any dependency check fails. 495 """ 496 497 # Check API endpoints 498 for endpoint in self.config["dependencies"].get("api_endpoints", []): 499 self._check_api_endpoint(endpoint) 500 501 # Check database connections 502 for db in self.config["dependencies"].get("databases", []): 503 self._check_database(db) 504 505 def _check_api_endpoint(self, endpoint: Dict): 506 """ 507 Check if an API endpoint is accessible. 
508 509 Args: 510 endpoint: Dictionary with endpoint configuration 511 """ 512 name = endpoint.get("name", "unknown_api") 513 url = endpoint.get("url", "") 514 timeout = endpoint.get("timeout", 5) 515 516 if not url: 517 return 518 519 try: 520 start_time = time.time() 521 response = requests.get(url, timeout=timeout) 522 response_time = time.time() - start_time 523 524 status = "healthy" if response.status_code < 400 else "critical" 525 526 self.health_data["dependencies"][name] = { 527 "type": "api", 528 "url": url, 529 "status": status, 530 "response_time_ms": round(response_time * 1000, 2), 531 "status_code": response.status_code 532 } 533 except Exception as e: 534 self.health_data["dependencies"][name] = { 535 "type": "api", 536 "url": url, 537 "status": "critical", 538 "error": str(e) 539 } 540 541 def _check_database(self, db_config: Dict): 542 """ 543 Check database connection. 544 Note: This is a placeholder. You'll need to implement specific database checks 545 based on your application's needs. 546 547 Args: 548 db_config: Dictionary with database configuration 549 """ 550 name = db_config.get("name", "unknown_db") 551 db_type = db_config.get("type", "") 552 553 # Placeholder for database connection check 554 # In a real implementation, you would check the specific database connection 555 self.health_data["dependencies"][name] = { 556 "type": "database", 557 "db_type": db_type, 558 "status": "unknown", 559 "message": "Database check not implemented" 560 } 561 562 def register_custom_check(self, name: str, check_func: Callable[[], Dict[str, Any]]): 563 """ 564 Register a custom health check function. 
565 566 Args: 567 name: Name of the custom check 568 check_func: Function that performs the check and returns a dictionary with results 569 """ 570 if "custom_checks" not in self.health_data: 571 self.health_data["custom_checks"] = {} 572 573 self.health_data["custom_checks"][name] = { 574 "status": "unknown", 575 "check_func": check_func 576 } 577 578 def run_custom_checks(self): 579 """Run all registered custom health checks.""" 580 if "custom_checks" not in self.health_data: 581 return 582 583 for name, check_info in list(self.health_data["custom_checks"].items()): 584 if "check_func" in check_info and callable(check_info["check_func"]): 585 try: 586 result = check_info["check_func"]() 587 # Remove the function reference from the result 588 func = check_info["check_func"] 589 self.health_data["custom_checks"][name] = result 590 # Add the function back 591 self.health_data["custom_checks"][name]["check_func"] = func 592 except Exception as e: 593 self.health_data["custom_checks"][name] = { 594 "status": "critical", 595 "error": str(e), 596 "check_func": check_info["check_func"] 597 } 598 599 def _update_overall_status(self): 600 """ 601 Updates the overall health status of the application based on the statuses of various components. 602 The method checks the health status of the following components: 603 - Streamlit server 604 - System checks 605 - Dependencies 606 - Custom checks (excluding those with a 'check_func' key) 607 - Streamlit pages 608 The overall status is determined using the following priority order: 609 1. "critical" if any component is critical 610 2. "warning" if any component is warning and none are critical 611 3. "unknown" if any component is unknown and none are critical or warning, and no healthy components exist 612 4. "healthy" if any component is healthy and none are critical, warning, or unknown 613 5. "unknown" if no statuses are found 614 The result is stored in `self.health_data["overall_status"]`. 
615 """ 616 617 has_critical = False 618 has_warning = False 619 has_healthy = False 620 has_unknown = False 621 622 # Helper function to check status 623 def check_component_status(status): 624 nonlocal has_critical, has_warning, has_healthy, has_unknown 625 if status == "critical": 626 has_critical = True 627 elif status == "warning": 628 has_warning = True 629 elif status == "healthy": 630 has_healthy = True 631 elif status == "unknown": 632 has_unknown = True 633 634 # Check Streamlit server status 635 server_status = self.health_data.get("streamlit_server", {}).get("status") 636 check_component_status(server_status) 637 638 # Check system status 639 for system_check in self.health_data.get("system", {}).values(): 640 check_component_status(system_check.get("status")) 641 642 # Check dependencies status 643 for dep_check in self.health_data.get("dependencies", {}).values(): 644 check_component_status(dep_check.get("status")) 645 646 # Check custom checks status 647 for custom_check in self.health_data.get("custom_checks", {}).values(): 648 if isinstance(custom_check, dict) and "check_func" not in custom_check: 649 check_component_status(custom_check.get("status")) 650 651 # Check Streamlit pages status 652 pages_status = self.health_data.get("streamlit_pages", {}).get("status") 653 check_component_status(pages_status) 654 655 # Determine overall status with priority: 656 # critical > warning > unknown > healthy 657 if has_critical: 658 self.health_data["overall_status"] = "critical" 659 elif has_warning: 660 self.health_data["overall_status"] = "warning" 661 elif has_unknown and not has_healthy: 662 self.health_data["overall_status"] = "unknown" 663 elif has_healthy: 664 self.health_data["overall_status"] = "healthy" 665 else: 666 self.health_data["overall_status"] = "unknown" 667 668 def get_health_data(self) -> Dict: 669 """Get the latest health check data.""" 670 # Create a copy without the function references 671 result: Dict[str, Any] = {} 672 for key, 
value in self.health_data.items(): 673 if key == "custom_checks": 674 result[key] = {} 675 for check_name, check_data in value.items(): 676 if isinstance(check_data, dict): 677 check_copy = check_data.copy() 678 if "check_func" in check_copy: 679 del check_copy["check_func"] 680 result[key][check_name] = check_copy 681 else: 682 result[key] = value 683 return result 684 685 def save_config(self): 686 """ 687 Saves the current health check configuration to a JSON file. 688 Attempts to write the configuration stored in `self.config` to the file specified by `self.config_path`. 689 Displays a success message in the Streamlit app upon successful save. 690 Handles and displays appropriate error messages for file not found, permission issues, JSON decoding errors, and other exceptions. 691 Raises: 692 FileNotFoundError: If the configuration file path does not exist. 693 PermissionError: If there are insufficient permissions to write to the file. 694 json.JSONDecodeError: If there is an error decoding the JSON data. 695 Exception: For any other exceptions encountered during the save process. 696 """ 697 698 try: 699 with open(self.config_path, "w") as f: 700 json.dump(self.config, f, indent=2) 701 st.success(f"Health check config saved successfully to {self.config_path}") 702 except FileNotFoundError: 703 st.error(f"Configuration file not found: {self.config_path}") 704 except PermissionError: 705 st.error(f"Permission denied: Unable to write to {self.config_path}") 706 except json.JSONDecodeError: 707 st.error(f"Error decoding JSON in config file: {self.config_path}") 708 except Exception as e: 709 st.error(f"Error saving health check config: {str(e)}") 710 def check_streamlit_pages(self): 711 """ 712 Checks for errors in Streamlit pages and updates the health data accordingly. 713 This method retrieves page errors using StreamlitPageMonitor.get_page_errors(). 
714 If errors are found, it sets the 'streamlit_pages' status to 'critical' and updates 715 the overall health status to 'critical'. If no errors are found, it marks the 716 'streamlit_pages' status as 'healthy'. 717 Updates: 718 self.health_data["streamlit_pages"]: Dict containing status, error count, errors, and details. 719 self.health_data["overall_status"]: Set to 'critical' if errors are detected. 720 Returns: 721 None 722 """ 723 724 page_errors = StreamlitPageMonitor.get_page_errors() 725 726 if "streamlit_pages" not in self.health_data: 727 self.health_data["streamlit_pages"] = {} 728 729 if page_errors: 730 self.health_data["streamlit_pages"] = { 731 "status": "critical", 732 "error_count": len(page_errors), 733 "errors": page_errors, 734 "details": "Errors detected in Streamlit pages" 735 } 736 # This affects overall status 737 self.health_data["overall_status"] = "critical" 738 else: 739 self.health_data["streamlit_pages"] = { 740 "status": "healthy", 741 "error_count": 0, 742 "errors": {}, 743 "details": "All pages functioning normally" 744 } 745 746 def check_streamlit_server(self) -> Dict[str, Any]: 747 """ 748 Checks the health status of the Streamlit server by sending a GET request to the /healthz endpoint. 749 Returns: 750 Dict[str, Any]: A dictionary containing the health status, response code, latency in milliseconds, 751 message, and the URL checked. If the server is healthy (HTTP 200), status is "healthy". 752 Otherwise, status is "critical" with error details. 753 Handles: 754 - Connection errors: Returns critical status with connection error details. 755 - Timeout errors: Returns critical status with timeout error details. 756 - Other exceptions: Returns critical status with unknown error details. 757 Logs: 758 - The URL being checked. 759 - The response status code and text. 760 - Health status and response time if healthy. 761 - Warnings and errors for unhealthy or failed checks. 
762 """ 763 764 try: 765 host = self.streamlit_url.rstrip('/') 766 if not host.startswith(('http://', 'https://')): 767 host = f"http://{host}" 768 769 url = f"{host}:{self.streamlit_port}/healthz" 770 self.logger.info(f"Checking Streamlit server health at: {url}") 771 772 start_time = time.time() 773 response = requests.get(url, timeout=3) 774 total_time = (time.time() - start_time) * 1000 775 self.logger.info(f"{response.status_code} - {response.text}") 776 # Check if the response is healthy 777 if response.status_code == 200: 778 self.logger.info(f"Streamlit server healthy - Response time: {round(total_time, 2)}ms") 779 return { 780 "status": "healthy", 781 "response_code": response.status_code, 782 "latency_ms": round(total_time, 2), 783 "message": "Streamlit server is running", 784 "url": url 785 } 786 else: 787 self.logger.warning(f"Unhealthy response from server: {response.status_code}") 788 return { 789 "status": "critical", 790 "response_code": response.status_code, 791 "error": f"Unhealthy response from server: {response.status_code}", 792 "message": "Streamlit server is not healthy", 793 "url": url 794 } 795 796 except requests.exceptions.ConnectionError as e: 797 self.logger.error(f"Connection error while checking Streamlit server: {str(e)}") 798 return { 799 "status": "critical", 800 "error": f"Connection error: {str(e)}", 801 "message": "Cannot connect to Streamlit server", 802 "url": url 803 } 804 except requests.exceptions.Timeout as e: 805 self.logger.error(f"Timeout while checking Streamlit server: {str(e)}") 806 return { 807 "status": "critical", 808 "error": f"Timeout error: {str(e)}", 809 "message": "Streamlit server is not responding", 810 "url": url 811 } 812 except Exception as e: 813 self.logger.error(f"Unexpected error while checking Streamlit server: {str(e)}") 814 return { 815 "status": "critical", 816 "error": f"Unknown error: {str(e)}", 817 "message": "Failed to check Streamlit server", 818 "url": url 819 }
HealthCheckService provides a comprehensive health monitoring solution for Streamlit applications. It periodically checks system resources, external dependencies, custom health checks, and Streamlit server/page status, updating and reporting the overall health status. Attributes: logger (logging.Logger): Logger for health check events. config_path (str): Path to the health check configuration file. health_data (Dict[str, Any]): Stores the latest health check results. config (Dict): Loaded health check configuration. check_interval (int): Interval (in seconds) between health checks. _running (bool): Indicates if the health check service is running. _thread (threading.Thread): Background thread for periodic checks. streamlit_url (str): URL of the Streamlit server. streamlit_port (int): Port of the Streamlit server. Methods: __init__(config_path: str = "health_check_config.json"): Initializes the HealthCheckService with configuration and default health data. _load_config() -> Dict: Loads health check configuration from file or returns default configuration. _get_default_config() -> Dict: Returns the default health check configuration. start(): Starts the health check service in a background thread. stop(): Stops the health check service. _run_checks_periodically(): Runs health checks periodically based on the configured interval. run_all_checks(): Executes all configured health checks and updates health data. check_cpu(): Checks CPU usage and updates health data. check_memory(): Checks memory usage and updates health data. check_disk(): Checks disk usage and updates health data. check_dependencies(): Checks external dependencies such as APIs and databases. _check_api_endpoint(endpoint: Dict): Checks if an API endpoint is accessible and updates health data. _check_database(db_config: Dict): Checks database connection (placeholder for actual implementation). 
register_custom_check(name: str, check_func: Callable[[], Dict[str, Any]]): Registers a custom health check function. run_custom_checks(): Executes all registered custom health checks. _update_overall_status(): Updates the overall health status based on individual checks. get_health_data() -> Dict: Returns the latest health check data, excluding function references. save_config(): Saves the current configuration to file. check_streamlit_pages(): Checks for errors in Streamlit pages and updates health data. check_streamlit_server() -> Dict[str, Any]: Checks if the Streamlit server is running and responding.
def __init__(self, config_path: str = "health_check_config.json"):
    """
    Prepare a HealthCheckService: logger, empty result store and configuration.

    Args:
        config_path (str): Path to the health check configuration file.
            Defaults to "health_check_config.json".

    Attributes set:
        logger: Child logger named "<module>.HealthCheckService".
        config_path: The path given by the caller.
        health_data: Result store, pre-seeded with empty "system",
            "dependencies" and "custom_checks" sections and an "unknown"
            overall status.
        config: Whatever ``self._load_config()`` returns.
        check_interval: ``config["check_interval"]`` or 60 when absent.
        _running / _thread: Background-loop state; initially stopped.
        streamlit_url: ``config["streamlit_url"]`` or "http://localhost".
        streamlit_port: ``config["streamlit_port"]`` or 8501.
    """
    service_log = logging.getLogger(f"{__name__}.HealthCheckService")
    self.logger = service_log
    service_log.info("Initializing HealthCheckService")

    self.config_path = config_path

    # Build the result store section by section; key order is kept the same
    # as it has always been because consumers may iterate over it.
    snapshot: Dict[str, Any] = {"last_updated": None}
    for section in ("system", "dependencies", "custom_checks"):
        snapshot[section] = {}
    snapshot["overall_status"] = "unknown"
    self.health_data = snapshot

    # The config is loaded only after config_path is set, since the loader
    # is an instance method and may read it.
    loaded = self._load_config()
    self.config = loaded
    self.check_interval = loaded.get("check_interval", 60)  # seconds

    # No background thread exists until start() is called.
    self._running = False
    self._thread = None

    self.streamlit_url = loaded.get("streamlit_url", "http://localhost")
    self.streamlit_port = loaded.get("streamlit_port", 8501)
Initializes the HealthCheckService instance. Args: config_path (str): Path to the health check configuration file. Defaults to "health_check_config.json". Attributes: logger (logging.Logger): Logger for the HealthCheckService. config_path (str): Path to the configuration file. health_data (Dict[str, Any]): Dictionary storing health check data. config (dict): Loaded configuration from the config file. check_interval (int): Interval in seconds between health checks. Defaults to 60. _running (bool): Indicates if the health check service is running. _thread (threading.Thread or None): Thread running the health check loop. streamlit_url (str): URL of the Streamlit service. Defaults to "http://localhost". streamlit_port (int): Port of the Streamlit service. Defaults to 8501.
def start(self):
    """
    Launch the periodic health-check loop on a background daemon thread.

    Calling this while the service is already flagged as running is a no-op,
    so at most one thread is created per start/stop cycle.

    Returns:
        None
    """
    if not self._running:
        # The flag is raised before the thread starts so the loop's
        # ``while self._running`` condition is already true on first entry.
        self._running = True
        worker = threading.Thread(target=self._run_checks_periodically, daemon=True)
        self._thread = worker
        worker.start()
Start the health check service in a background thread.
def stop(self):
    """
    Ask the background health-check loop to finish and wait briefly for it.

    Clears the running flag, then joins the worker thread (if one was ever
    created) for at most one second, so this call never blocks for long.

    Returns:
        None
    """
    self._running = False

    worker = self._thread
    if not worker:
        # start() was never called; nothing to wait for.
        return
    worker.join(timeout=1)
Stop the health check service.
def run_all_checks(self):
    """
    Run every configured health check once and refresh ``self.health_data``.

    Order of work:
        1. Stamp ``health_data["last_updated"]`` with the current local time
           (ISO 8601 string).
        2. Store the result of ``check_streamlit_server()``.
        3. Run the CPU, memory and disk checks, each of which can be switched
           off with a ``false`` flag under ``config["system_checks"]``.
        4. Run dependency, custom and Streamlit-page checks.
        5. Recompute the overall status.

    A configuration without a "system_checks" section is treated as "all
    system checks enabled", matching the per-flag default of ``True``, instead
    of failing with ``KeyError``.

    Returns:
        None
    """
    self.health_data["last_updated"] = datetime.now().isoformat()
    self.health_data["streamlit_server"] = self.check_streamlit_server()

    # ``or {}`` also covers an explicit ``"system_checks": null`` in the JSON.
    system_checks = self.config.get("system_checks") or {}
    toggles = (
        ("cpu", self.check_cpu),
        ("memory", self.check_memory),
        ("disk", self.check_disk),
    )
    for flag, run_check in toggles:
        if system_checks.get(flag, True):
            run_check()

    self.check_dependencies()
    self.run_custom_checks()
    self.check_streamlit_pages()
    # Must run last: it aggregates the statuses written by the checks above.
    self._update_overall_status()
Run all configured health checks and update health data.
def check_cpu(self):
    """
    Sample CPU usage and record it, with a status, in the health data.

    Usage is measured with ``psutil.cpu_percent(interval=1)``, so this call
    blocks for about one second. The reading is compared against the
    "cpu_warning" (default 70) and "cpu_critical" (default 90) entries of
    ``config["thresholds"]``; a configuration without a "thresholds" section
    falls back to those defaults instead of raising ``KeyError``.

    Updates:
        self.health_data["system"]["cpu"]: ``{"usage_percent", "status"}``
        where status is "healthy", "warning" or "critical".

    Returns:
        None
    """
    cpu_percent = psutil.cpu_percent(interval=1)

    # ``or {}`` also covers an explicit ``"thresholds": null`` in the JSON.
    thresholds = self.config.get("thresholds") or {}
    warning_threshold = thresholds.get("cpu_warning", 70)
    critical_threshold = thresholds.get("cpu_critical", 90)

    # Critical is tested first so it wins when both thresholds are exceeded.
    if cpu_percent >= critical_threshold:
        status = "critical"
    elif cpu_percent >= warning_threshold:
        status = "warning"
    else:
        status = "healthy"

    self.health_data.setdefault("system", {})["cpu"] = {
        "usage_percent": cpu_percent,
        "status": status,
    }
Checks the current CPU usage and updates the health status based on configured thresholds. Measures the CPU usage percentage over a 1-second interval using psutil. Compares the result against warning and critical thresholds defined in the configuration. Sets the status to 'healthy', 'warning', or 'critical' accordingly, and updates the health data dictionary. Returns: None
def check_memory(self):
    """
    Read system memory usage and record it, with a status, in the health data.

    Statistics come from ``psutil.virtual_memory()``. The usage percentage is
    compared against the "memory_warning" (default 70) and "memory_critical"
    (default 90) entries of ``config["thresholds"]``; a configuration without
    a "thresholds" section falls back to those defaults instead of raising
    ``KeyError``.

    Updates:
        self.health_data["system"]["memory"]: ``{"total_gb", "available_gb",
        "usage_percent", "status"}`` where sizes are GiB rounded to two
        decimals and status is "healthy", "warning" or "critical".

    Returns:
        None
    """
    memory = psutil.virtual_memory()
    memory_percent = memory.percent

    # ``or {}`` also covers an explicit ``"thresholds": null`` in the JSON.
    thresholds = self.config.get("thresholds") or {}
    warning_threshold = thresholds.get("memory_warning", 70)
    critical_threshold = thresholds.get("memory_critical", 90)

    # Critical is tested first so it wins when both thresholds are exceeded.
    if memory_percent >= critical_threshold:
        status = "critical"
    elif memory_percent >= warning_threshold:
        status = "warning"
    else:
        status = "healthy"

    bytes_per_gib = 1024 ** 3
    self.health_data.setdefault("system", {})["memory"] = {
        "total_gb": round(memory.total / bytes_per_gib, 2),
        "available_gb": round(memory.available / bytes_per_gib, 2),
        "usage_percent": memory_percent,
        "status": status,
    }
Checks the system's memory usage and updates the health status accordingly. Retrieves the current memory usage statistics using psutil, compares the usage percentage against configured warning and critical thresholds, and sets the memory status to 'healthy', 'warning', or 'critical'. Updates the health_data dictionary with total memory, available memory, usage percentage, and status. Returns: None
def check_disk(self):
    """
    Read disk usage for ``/`` and record it, with a status, in the health data.

    Statistics come from ``psutil.disk_usage('/')``. The usage percentage is
    compared against the "disk_warning" (default 70) and "disk_critical"
    (default 90) entries of ``config["thresholds"]``; a configuration without
    a "thresholds" section falls back to those defaults instead of raising
    ``KeyError``.

    Updates:
        self.health_data["system"]["disk"]: ``{"total_gb", "free_gb",
        "usage_percent", "status"}`` where sizes are GiB rounded to two
        decimals and status is "healthy", "warning" or "critical".

    Returns:
        None
    """
    disk = psutil.disk_usage('/')
    disk_percent = disk.percent

    # ``or {}`` also covers an explicit ``"thresholds": null`` in the JSON.
    thresholds = self.config.get("thresholds") or {}
    warning_threshold = thresholds.get("disk_warning", 70)
    critical_threshold = thresholds.get("disk_critical", 90)

    # Critical is tested first so it wins when both thresholds are exceeded.
    if disk_percent >= critical_threshold:
        status = "critical"
    elif disk_percent >= warning_threshold:
        status = "warning"
    else:
        status = "healthy"

    bytes_per_gib = 1024 ** 3
    self.health_data.setdefault("system", {})["disk"] = {
        "total_gb": round(disk.total / bytes_per_gib, 2),
        "free_gb": round(disk.free / bytes_per_gib, 2),
        "usage_percent": disk_percent,
        "status": status,
    }
Checks the disk usage of the root filesystem and updates the health status. Retrieves disk usage statistics using psutil, compares the usage percentage against configured warning and critical thresholds, and sets the disk status accordingly ("healthy", "warning", or "critical"). Updates the health_data dictionary with total disk size, free space, usage percentage, and status. Returns: None
def check_dependencies(self):
    """
    Run the health check for every configured external dependency.

    Each entry of ``config["dependencies"]["api_endpoints"]`` is passed to
    ``_check_api_endpoint`` and each entry of
    ``config["dependencies"]["databases"]`` to ``_check_database``; those
    helpers record their outcome in ``self.health_data["dependencies"]``.

    A missing (or null) "dependencies" section, or a missing list inside it,
    simply means there is nothing to check; it no longer raises ``KeyError``.

    Returns:
        None
    """
    # ``or`` fallbacks also cover explicit nulls in the JSON configuration.
    dependencies = self.config.get("dependencies") or {}

    for endpoint in dependencies.get("api_endpoints") or []:
        self._check_api_endpoint(endpoint)

    for db in dependencies.get("databases") or []:
        self._check_database(db)
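Checks the health of configured dependencies, including API endpoints and databases. Iterates through the list of API endpoints and databases specified in the configuration, and performs health checks on each by invoking the corresponding internal methods. Failures are recorded rather than raised: an API endpoint that cannot be reached is stored with status "critical" in health_data["dependencies"], and database checks are currently a placeholder that stores status "unknown".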
def register_custom_check(self, name: str, check_func: Callable[[], Dict[str, Any]]):
    """
    Add (or replace) a user-defined health check.

    The check is stored under ``self.health_data["custom_checks"][name]`` with
    an initial status of "unknown" until it is first executed.

    Args:
        name: Identifier of the custom check; re-using a name overwrites the
            previous registration.
        check_func: Zero-argument callable that performs the check and
            returns a dictionary with its results.

    Returns:
        None
    """
    # Create the section lazily in case it was removed from health_data.
    registry = self.health_data.setdefault("custom_checks", {})
    registry[name] = dict(status="unknown", check_func=check_func)
Register a custom health check function.
Args: name: Name of the custom check check_func: Function that performs the check and returns a dictionary with results
def run_custom_checks(self):
    """
    Execute every registered custom check and store its latest result.

    For each entry of ``self.health_data["custom_checks"]`` that carries a
    callable under "check_func", the callable is invoked and the entry is
    replaced by a copy of the returned dictionary with the callable
    re-attached (so it can be run again on the next cycle).

    Failure handling (nothing is raised to the caller):
        - If the callable raises, the entry becomes
          ``{"status": "critical", "error": <message>, "check_func": ...}``.
        - If the callable returns something other than a dict, the entry is
          likewise marked "critical" with an explanatory error message.

    The dictionary returned by the user's callable is never modified.

    Returns:
        None
    """
    custom_checks = self.health_data.get("custom_checks")
    if not custom_checks:
        return

    # Iterate over a snapshot because entries are replaced inside the loop.
    for name, check_info in list(custom_checks.items()):
        func = check_info.get("check_func") if isinstance(check_info, dict) else None
        if not callable(func):
            continue

        try:
            result = func()
            if not isinstance(result, dict):
                raise TypeError(
                    f"custom check {name!r} returned "
                    f"{type(result).__name__}, expected dict"
                )
            # Copy so that attaching the callable below does not leak into a
            # dict the check function may be sharing or re-using.
            entry = dict(result)
        except Exception as e:
            entry = {"status": "critical", "error": str(e)}

        # NOTE(review): _update_overall_status only counts custom-check
        # entries that do NOT contain "check_func"; because the callable is
        # always re-attached here, executed custom checks appear not to
        # influence the overall status. Confirm whether that is intended.
        entry["check_func"] = func
        self.health_data["custom_checks"][name] = entry
Run all registered custom health checks.
def get_health_data(self) -> Dict:
    """
    Return the latest health data with custom-check callables stripped out.

    Every top-level section is passed through as-is (the same object, not a
    copy) except "custom_checks": each of its dictionary entries is shallow
    copied and its "check_func" key removed; entries that are not
    dictionaries are omitted.

    Returns:
        Dict: Top-level keys of ``self.health_data`` in their original order.
    """
    snapshot: Dict[str, Any] = {}
    for section, payload in self.health_data.items():
        if section != "custom_checks":
            snapshot[section] = payload
            continue

        exported = {}
        for check_name, check_data in payload.items():
            if not isinstance(check_data, dict):
                continue
            trimmed = check_data.copy()
            # The callable is internal bookkeeping, not part of the report.
            trimmed.pop("check_func", None)
            exported[check_name] = trimmed
        snapshot[section] = exported
    return snapshot
Get the latest health check data.
def save_config(self):
    """
    Write ``self.config`` as indented JSON to ``self.config_path``.

    The configuration is serialised to a string *before* the target file is
    opened. Opening in "w" mode truncates the file, so serialising first
    guarantees that a configuration which cannot be encoded as JSON leaves
    the existing file untouched instead of empty or half-written.

    Outcome is reported through Streamlit rather than by raising:
        - ``st.success`` once the file has been written.
        - ``st.error`` for a missing path (FileNotFoundError), insufficient
          permissions (PermissionError), or any other exception, including
          serialisation failures.

    Returns:
        None
    """
    try:
        payload = json.dumps(self.config, indent=2)
        with open(self.config_path, "w", encoding="utf-8") as f:
            f.write(payload)
        st.success(f"Health check config saved successfully to {self.config_path}")
    except FileNotFoundError:
        st.error(f"Configuration file not found: {self.config_path}")
    except PermissionError:
        st.error(f"Permission denied: Unable to write to {self.config_path}")
    except json.JSONDecodeError:
        # Kept for backward compatibility; encoding with json.dumps is not
        # expected to raise a *decode* error.
        st.error(f"Error decoding JSON in config file: {self.config_path}")
    except Exception as e:
        st.error(f"Error saving health check config: {str(e)}")
Saves the current health check configuration to a JSON file.
Attempts to write the configuration stored in self.config to the file specified by self.config_path.
Displays a success message in the Streamlit app upon successful save.
Handles and displays appropriate error messages for file not found, permission issues, JSON decoding errors, and other exceptions.
Handled errors (each is caught and reported with st.error rather than re-raised):
FileNotFoundError: If the configuration file path does not exist.
PermissionError: If there are insufficient permissions to write to the file.
json.JSONDecodeError: Caught defensively; writing the configuration is not expected to produce a JSON decoding error.
Exception: Any other exception encountered during the save process.
def check_streamlit_pages(self):
    """
    Record whether any Streamlit page has reported errors.

    Page errors are fetched from ``StreamlitPageMonitor.get_page_errors()``.
    A non-empty result marks the "streamlit_pages" section as "critical" and
    also forces the overall status to "critical"; an empty result marks the
    section "healthy".

    Updates:
        self.health_data["streamlit_pages"]: ``{"status", "error_count",
        "errors", "details"}``.
        self.health_data["overall_status"]: set to "critical" only when
        errors are present.

    Returns:
        None
    """
    page_errors = StreamlitPageMonitor.get_page_errors()

    if not page_errors:
        self.health_data["streamlit_pages"] = {
            "status": "healthy",
            "error_count": 0,
            "errors": {},
            "details": "All pages functioning normally"
        }
        return

    # NOTE(review): error_count is the number of top-level entries returned
    # by get_page_errors(); if that mapping is keyed by page name this counts
    # pages with errors rather than individual errors -- confirm intent.
    self.health_data["streamlit_pages"] = {
        "status": "critical",
        "error_count": len(page_errors),
        "errors": page_errors,
        "details": "Errors detected in Streamlit pages"
    }
    # Page errors escalate the whole application, not just this section.
    self.health_data["overall_status"] = "critical"
Checks for errors in Streamlit pages and updates the health data accordingly. This method retrieves page errors using StreamlitPageMonitor.get_page_errors(). If errors are found, it sets the 'streamlit_pages' status to 'critical' and updates the overall health status to 'critical'. If no errors are found, it marks the 'streamlit_pages' status as 'healthy'. Updates: self.health_data["streamlit_pages"]: Dict containing status, error count, errors, and details. self.health_data["overall_status"]: Set to 'critical' if errors are detected. Returns: None
def check_streamlit_server(self) -> Dict[str, Any]:
    """
    Probe the Streamlit server's ``/healthz`` endpoint and report its status.

    The URL is built from ``self.streamlit_url`` (an ``http://`` scheme is
    added when none is present) and ``self.streamlit_port``, then fetched
    with a 3-second timeout.

    Returns:
        Dict[str, Any]: Always contains "status", "message" and "url".
            - HTTP 200: status "healthy", plus "response_code" and
              "latency_ms".
            - Any other HTTP status: status "critical", plus "response_code"
              and "error".
            - Connection error, timeout or any other exception: status
              "critical" plus "error". "url" is ``None`` when the failure
              happened before a URL could be built (for example when
              ``streamlit_url`` is not a string).

    Nothing is raised to the caller; every failure is logged and converted
    into a "critical" result.
    """
    # Bound up front so that every failure path can report it. Previously a
    # failure while building the URL made the except handlers themselves fail
    # with UnboundLocalError.
    url = None
    try:
        host = self.streamlit_url.rstrip('/')
        if not host.startswith(('http://', 'https://')):
            host = f"http://{host}"
        url = f"{host}:{self.streamlit_port}/healthz"
    except Exception as e:
        self.logger.error(f"Unexpected error while checking Streamlit server: {str(e)}")
        return {
            "status": "critical",
            "error": f"Unknown error: {str(e)}",
            "message": "Failed to check Streamlit server",
            "url": url
        }

    try:
        self.logger.info(f"Checking Streamlit server health at: {url}")

        start_time = time.time()
        response = requests.get(url, timeout=3)
        total_time = (time.time() - start_time) * 1000
        self.logger.info(f"{response.status_code} - {response.text}")

        if response.status_code == 200:
            self.logger.info(f"Streamlit server healthy - Response time: {round(total_time, 2)}ms")
            return {
                "status": "healthy",
                "response_code": response.status_code,
                "latency_ms": round(total_time, 2),
                "message": "Streamlit server is running",
                "url": url
            }

        self.logger.warning(f"Unhealthy response from server: {response.status_code}")
        return {
            "status": "critical",
            "response_code": response.status_code,
            "error": f"Unhealthy response from server: {response.status_code}",
            "message": "Streamlit server is not healthy",
            "url": url
        }

    except requests.exceptions.ConnectionError as e:
        self.logger.error(f"Connection error while checking Streamlit server: {str(e)}")
        return {
            "status": "critical",
            "error": f"Connection error: {str(e)}",
            "message": "Cannot connect to Streamlit server",
            "url": url
        }
    except requests.exceptions.Timeout as e:
        self.logger.error(f"Timeout while checking Streamlit server: {str(e)}")
        return {
            "status": "critical",
            "error": f"Timeout error: {str(e)}",
            "message": "Streamlit server is not responding",
            "url": url
        }
    except Exception as e:
        self.logger.error(f"Unexpected error while checking Streamlit server: {str(e)}")
        return {
            "status": "critical",
            "error": f"Unknown error: {str(e)}",
            "message": "Failed to check Streamlit server",
            "url": url
        }
Checks the health status of the Streamlit server by sending a GET request to the /healthz endpoint. Returns: Dict[str, Any]: A dictionary containing the health status, response code, latency in milliseconds, message, and the URL checked. If the server is healthy (HTTP 200), status is "healthy". Otherwise, status is "critical" with error details. Handles: - Connection errors: Returns critical status with connection error details. - Timeout errors: Returns critical status with timeout error details. - Other exceptions: Returns critical status with unknown error details. Logs: - The URL being checked. - The response status code and text. - Health status and response time if healthy. - Warnings and errors for unhealthy or failed checks.
# Text colors used for status headings; unrecognized statuses fall back to gray.
_STATUS_COLORS = {
    "healthy": "green",
    "warning": "orange",
    "critical": "red",
    "unknown": "gray",
}

# Cell styles applied to the "Status" column of the dependency / custom check tables.
_STATUS_CELL_STYLES = {
    "healthy": "background-color: #c6efce; color: #006100",
    "warning": "background-color: #ffeb9c; color: #9c5700",
    "critical": "background-color: #ffc7ce; color: #9c0006",
    "unknown": "background-color: #eeeeee; color: #7f7f7f",
}

# (upper bound in ms, color, label) checked in order; slower responses are "Poor".
_LATENCY_TIERS = (
    (50, "green", "Excellent"),
    (100, "blue", "Good"),
    (200, "orange", "Fair"),
)


def _status_cell_style(val: Any) -> str:
    """Return the CSS for a status table cell, or "" for an unrecognized status."""
    # str() keeps a non-string status (e.g. None) from raising inside the styler
    return _STATUS_CELL_STYLES.get(str(val).lower(), "")


def _clamp_to_slider(value: Any, lower: int, upper: int, fallback: int) -> int:
    """Coerce a configured value to an int within [lower, upper]; use fallback if not numeric."""
    try:
        number = int(value)
    except (TypeError, ValueError):
        return fallback
    return max(lower, min(upper, number))


def _summarize_details(info: Dict[str, Any], hidden_keys: tuple) -> str:
    """Join the non-dict entries of info, minus hidden_keys, as "key: value" pairs."""
    return ", ".join(
        f"{key}: {value}"
        for key, value in info.items()
        if key not in hidden_keys and not isinstance(value, dict)
    )


def _render_status_table(rows: List[Dict[str, Any]], empty_message: str) -> None:
    """Show rows as a dataframe with a color-coded "Status" column, or an info note if empty."""
    if not rows:
        st.info(empty_message)
        return
    frame = pd.DataFrame(rows)
    st.dataframe(frame.style.map(_status_cell_style, subset=["Status"]))


def _render_resource_section(title: str, data: Dict[str, Any], size_fields: tuple = ()) -> None:
    """Render status heading, usage bar and usage text for one system resource."""
    status = data.get("status", "unknown")
    color = _STATUS_COLORS.get(status, "gray")
    usage = data.get("usage_percent", 0)

    st.markdown(
        f"### {title} Status: <span style='color:{color}'>{status.upper()}</span>",
        unsafe_allow_html=True
    )
    # st.progress rejects values outside [0.0, 1.0], so clamp defensively
    st.progress(min(max(usage / 100, 0.0), 1.0))
    st.text(f"{title} Usage: {usage}%")
    for caption, key in size_fields:
        st.text(f"{caption}: {data.get(key, 0)} GB")


def _render_server_status(server_health: Dict[str, Any]) -> None:
    """Render the Streamlit server status, its message and (when healthy) its latency."""
    server_status = server_health.get("status", "unknown")
    server_color = _STATUS_COLORS.get(server_status, "gray")

    st.markdown(
        f"### Streamlit Server Status: <span style='color: {server_color}'>{server_status.upper()}</span>",
        unsafe_allow_html=True
    )

    if server_status != "healthy":
        st.error(server_health.get("message", "Server status unknown"))
        if "error" in server_health:
            st.code(server_health["error"])
        return

    st.success(server_health.get("message", "Server is running"))
    if "latency_ms" not in server_health:
        return

    latency = server_health["latency_ms"]
    latency_color, performance = "red", "Poor"
    for ceiling, tier_color, tier_label in _LATENCY_TIERS:
        if latency <= ceiling:
            latency_color, performance = tier_color, tier_label
            break

    st.markdown(
        f"""
        <div style='display: flex; align-items: center; gap: 10px;'>
            <div>Server Response Time:</div>
            <div style='color: {latency_color}; font-weight: bold;'>
                {latency} ms
            </div>
            <div style='color: {latency_color};'>
                ({performance})
            </div>
        </div>
        """,
        unsafe_allow_html=True
    )


def _render_page_health(page_health: Dict[str, Any]) -> None:
    """Render the Streamlit page status, the error count and one expander per recorded error."""
    status = page_health.get("status", "unknown")
    error_count = page_health.get("error_count", 0)
    status_color = _STATUS_COLORS.get(status, "gray")

    st.markdown(
        f"### Page Status: <span style='color:{status_color}'>{status.upper()}</span>",
        unsafe_allow_html=True
    )
    st.metric("Error Count", error_count)
    if not error_count > 0:
        return

    st.error("Pages with errors:")
    errors_dict = page_health.get("errors", {})
    if not isinstance(errors_dict, dict):
        st.error("Invalid error data format")
        # Leaves only this helper, so the rest of the dashboard still renders
        return

    for page_name, page_errors in errors_dict.items():
        # Show only the last path component of the page name
        display_name = page_name.rsplit("/", 1)[-1]

        for error_info in page_errors:
            if not isinstance(error_info, dict):
                continue
            with st.expander(f"Error in {display_name}"):
                st.error(error_info.get('error', 'Unknown error'))

                if error_info.get('type') == 'streamlit_error':
                    st.text("Type: Streamlit Error")
                else:
                    st.text("Type: Exception")

                st.text("Traceback:")
                st.code("".join(error_info.get('traceback') or ['No traceback available']))
                st.text(f"Timestamp: {error_info.get('timestamp', 'No timestamp')}")


def _render_config_section(health_service: Any) -> None:
    """Render the configuration expander and persist the values when the user saves."""
    config = health_service.config
    # A config without a "thresholds" section gets an empty one instead of a KeyError
    thresholds = config.setdefault("thresholds", {})

    with st.expander("Health Check Configuration"):
        st.subheader("System Check Thresholds")

        warning_col, critical_col = st.columns(2)
        with warning_col:
            # Sliders raise if the initial value is outside min/max, so clamp what was configured
            cpu_warning = st.slider("CPU Warning Threshold (%)",
                                    min_value=10, max_value=90,
                                    value=_clamp_to_slider(thresholds.get("cpu_warning", 70), 10, 90, 70),
                                    step=5)
            memory_warning = st.slider("Memory Warning Threshold (%)",
                                       min_value=10, max_value=90,
                                       value=_clamp_to_slider(thresholds.get("memory_warning", 70), 10, 90, 70),
                                       step=5)
            disk_warning = st.slider("Disk Warning Threshold (%)",
                                     min_value=10, max_value=90,
                                     value=_clamp_to_slider(thresholds.get("disk_warning", 70), 10, 90, 70),
                                     step=5)
            streamlit_url_update = st.text_input(
                "Streamlit Server URL",
                value=config.get("streamlit_url", "http://localhost")
            )

        with critical_col:
            cpu_critical = st.slider("CPU Critical Threshold (%)",
                                     min_value=20, max_value=95,
                                     value=_clamp_to_slider(thresholds.get("cpu_critical", 90), 20, 95, 90),
                                     step=5)
            memory_critical = st.slider("Memory Critical Threshold (%)",
                                        min_value=20, max_value=95,
                                        value=_clamp_to_slider(thresholds.get("memory_critical", 90), 20, 95, 90),
                                        step=5)
            disk_critical = st.slider("Disk Critical Threshold (%)",
                                      min_value=20, max_value=95,
                                      value=_clamp_to_slider(thresholds.get("disk_critical", 90), 20, 95, 90),
                                      step=5)

        check_interval = st.slider("Check Interval (seconds)",
                                   min_value=10, max_value=300,
                                   value=_clamp_to_slider(config.get("check_interval", 60), 10, 300, 60),
                                   step=10)
        streamlit_port_update = st.number_input(
            "Streamlit Server Port",
            value=config.get("streamlit_port", 8501),
            step=1
        )

        if st.button("Save Configuration"):
            thresholds["cpu_warning"] = cpu_warning
            thresholds["cpu_critical"] = cpu_critical
            thresholds["memory_warning"] = memory_warning
            thresholds["memory_critical"] = memory_critical
            thresholds["disk_warning"] = disk_warning
            thresholds["disk_critical"] = disk_critical
            config["check_interval"] = check_interval
            config["streamlit_url"] = streamlit_url_update
            config["streamlit_port"] = streamlit_port_update

            # Save to file
            health_service.save_config()
            st.success("Configuration saved successfully")

            # Restart unconditionally so the running service picks up the saved settings
            health_service.stop()
            health_service.start()


def health_check(config_path:str = "health_check_config.json") -> None:
    """
    Displays an interactive Streamlit dashboard for monitoring application health.

    On the first run of a session a `HealthCheckService` is created from `config_path`, started,
    and stored in `st.session_state["health_service"]`; later reruns reuse that instance.
    The dashboard then renders the data returned by the service's `get_health_data()`.

    Args:
        config_path (str, optional): Path to the health check configuration JSON file.
            Defaults to "health_check_config.json". Only used when the service is first created.
    Features:
        - "Refresh Now" button that runs all checks immediately.
        - Overall health status with color-coded indicator and last updated timestamp.
        - Streamlit server status, message, error detail and response latency rating.
        - Tabs for:
            * System Resources (CPU, Memory, Disk usage and status)
            * Dependencies (table of dependency statuses)
            * Custom Checks (table of custom check statuses and errors)
            * Streamlit Pages (error count and per-error details)
        - Configuration expander for thresholds, check interval and Streamlit server URL/port;
          saving writes the config through the service and restarts it.
    Notes:
        - A "last_updated" value that cannot be parsed as an ISO timestamp is shown as an error
          in the UI rather than raised.
        - Malformed page error data is reported in its tab without hiding the configuration section.
        - Configured threshold/interval values outside a slider's range are clamped for display.
    Returns:
        None. The dashboard is rendered in the Streamlit app.
    """

    logger = logging.getLogger(f"{__name__}.health_check")
    logger.info("Starting health check dashboard")
    st.title("Application Health Dashboard")

    # Create the service once per session; reruns reuse it
    if "health_service" not in st.session_state:
        logger.info("Initializing new health check service")
        st.session_state.health_service = HealthCheckService(config_path=config_path)
        st.session_state.health_service.start()

    health_service = st.session_state.health_service

    # Header row: section title plus manual refresh
    heading_col, refresh_col = st.columns([3, 1])
    with heading_col:
        st.subheader("System Health Status")
    with refresh_col:
        if st.button("Refresh Now"):
            health_service.run_all_checks()

    # Read after the optional refresh so the page shows the newest results
    health_data = health_service.get_health_data()

    overall_status = health_data.get("overall_status", "unknown")
    overall_color = _STATUS_COLORS.get(overall_status, "gray")
    st.markdown(
        f"<h3 style='color: {overall_color};'>Overall Status: {overall_status.upper()}</h3>",
        unsafe_allow_html=True
    )

    # Display last updated time
    if health_data.get("last_updated"):
        try:
            last_updated = datetime.fromisoformat(health_data["last_updated"])
            st.text(f"Last updated: {last_updated.strftime('%Y-%m-%d %H:%M:%S')}")
        except (TypeError, ValueError) as e:
            # Not an ISO-format string: show the raw value and the parse failure
            st.error(f"Last updated: {health_data['last_updated']}")
            st.exception(e)

    _render_server_status(health_data.get("streamlit_server") or {})

    # Create tabs for different categories of health checks
    system_tab, deps_tab, custom_tab, pages_tab = st.tabs(
        ["System Resources", "Dependencies", "Custom Checks", "Streamlit Pages"]
    )

    with system_tab:
        system_data = health_data.get("system") or {}
        if "cpu" in system_data:
            _render_resource_section("CPU", system_data["cpu"])
        if "memory" in system_data:
            _render_resource_section(
                "Memory",
                system_data["memory"],
                (("Total Memory", "total_gb"), ("Available Memory", "available_gb")),
            )
        if "disk" in system_data:
            _render_resource_section(
                "Disk",
                system_data["disk"],
                (("Total Disk Space", "total_gb"), ("Free Disk Space", "free_gb")),
            )

    with deps_tab:
        dependencies = health_data.get("dependencies") or {}
        dependency_rows = [
            {
                "Name": name,
                "Type": dep_info.get("type", "unknown"),
                "Status": dep_info.get("status", "unknown"),
                "Details": _summarize_details(dep_info, ("name", "type", "status", "error")),
            }
            for name, dep_info in dependencies.items()
        ]
        _render_status_table(dependency_rows, "No dependencies configured")

    with custom_tab:
        custom_checks = health_data.get("custom_checks") or {}
        # Entries that still carry "check_func" are skipped; only plain result dicts are listed
        check_rows = [
            {
                "Name": name,
                "Status": check_info.get("status", "unknown"),
                "Details": _summarize_details(check_info, ("name", "status", "check_func", "error")),
                "Error": check_info.get("error", ""),
            }
            for name, check_info in custom_checks.items()
            if isinstance(check_info, dict) and "check_func" not in check_info
        ]
        _render_status_table(check_rows, "No custom checks configured")

    with pages_tab:
        _render_page_health(health_data.get("streamlit_pages") or {})

    # Configuration section
    _render_config_section(health_service)
Displays an interactive Streamlit dashboard for monitoring application health. This function initializes and manages a health check service, presenting real-time system metrics, dependency statuses, custom checks, and Streamlit page health in a user-friendly dashboard. Users can manually refresh health checks, view detailed error information, and adjust configuration thresholds and intervals directly from the UI. Args: config_path (str, optional): Path to the health check configuration JSON file. Defaults to "health_check_config.json". Features: - Displays overall health status with color-coded indicators. - Shows last updated timestamp for health data. - Monitors Streamlit server status, latency, and errors. - Provides tabs for: * System Resources (CPU, Memory, Disk usage and status) * Dependencies (external services and their health) * Custom Checks (user-defined health checks) * Streamlit Pages (page-specific errors and status) - Allows configuration of system thresholds, check intervals, and Streamlit server settings. - Supports manual refresh and saving configuration changes. Raises: Displays error messages in the UI for any exceptions encountered during health data retrieval or processing. Returns: None. The dashboard is rendered in the Streamlit app.