streamlit_healthcheck.healthcheck

   1import streamlit as st
   2import psutil
   3import numpy as np
   4import pandas as pd
   5import requests
   6import time
   7import threading
   8import json
   9import os
  10from datetime import datetime
  11from typing import Dict, List, Any, Optional, Callable
  12import threading
  13import functools
  14import traceback
  15import logging
  16
# Set up logging.
# NOTE: this runs at import time (module top level), so importing this module
# calls logging.basicConfig with INFO level, the format below and one
# StreamHandler.
logging.basicConfig(
    level=logging.INFO,
    # timestamp - logger name - level - message
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler()
    ]
)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
  26
  27class StreamlitPageMonitor:
  28    """
  29    Singleton class to monitor and record errors and exceptions occurring in Streamlit pages.
  30    This class monkey-patches `st.error` to capture error messages and provides decorators and methods
  31    to track exceptions and errors per Streamlit page. Errors are stored in a class-level dictionary
  32    and can be retrieved or cleared as needed.
  33    Attributes:
  34        _instance (StreamlitPageMonitor): Singleton instance of the monitor.
  35        _errors (Dict[str, List[Dict[str, Any]]]): Dictionary mapping page names to lists of error records.
  36        _st_error (Callable): Original `st.error` function before monkey-patching.
  37        _current_page (str): Name of the current page being monitored.
  38    Methods:
  39        __new__(cls):
  40            Ensures singleton behavior and monkey-patches `st.error` to record error messages.
  41        _handle_st_error(cls, error_message: str):
  42            Handles calls to `st.error` and records error information for the current page.
  43        set_page_context(cls, page_name: str):
  44            Sets the current page context for error recording.
  45        monitor_page(cls, page_name: str):
  46            Decorator to monitor a Streamlit page for exceptions and `st.error` calls.
  47            Records exceptions and errors under the specified page name.
  48        get_page_errors(cls):
  49            Retrieves all recorded errors for all pages, grouped by page name.
  50        clear_errors(cls, page_name: Optional[str] = None):
  51            Clears recorded errors for a specific page or all pages.
  52    """
  53    
  54    _instance = None
  55    _errors: Dict[str, List[Dict[str, Any]]] = {}
  56    _st_error = st.error
  57    _current_page = None
  58
  59    def __new__(cls):
  60        if cls._instance is None:
  61            cls._instance = super(StreamlitPageMonitor, cls).__new__(cls)
  62            
  63            # Monkey patch st.error to capture error messages
  64            def patched_error(*args, **kwargs):
  65                error_message = " ".join(str(arg) for arg in args)
  66                current_page = cls._current_page
  67                
  68                error_info = {
  69                    'error': error_message,
  70                    'traceback': traceback.format_stack(),
  71                    'timestamp': datetime.now().isoformat(),
  72                    'status': 'critical',
  73                    'type': 'streamlit_error',
  74                    'page': current_page
  75                }
  76
  77                # Ensure current_page is a string, not None
  78                if current_page is None:
  79                    current_page = "unknown_page"
  80                if current_page not in cls._errors:
  81                    cls._errors[current_page] = []
  82                
  83                cls._errors[current_page].append(error_info)
  84                
  85                # Call original st.error
  86                return cls._st_error(*args, **kwargs)
  87                
  88            st.error = patched_error
  89        return cls._instance
  90
  91    @classmethod
  92    def _handle_st_error(cls, error_message: str):
  93        """
  94        Handles Streamlit-specific errors by recording error details for the current page.
  95        Args:
  96            error_message (str): The error message to be logged.
  97        Side Effects:
  98            Updates the class-level _errors dictionary with error information for the current Streamlit page.
  99        Error Information Stored:
 100            - error: Formatted error message.
 101            - traceback: Stack trace at the point of error.
 102            - timestamp: Time when the error occurred (ISO format).
 103            - status: Error severity ('critical').
 104            - type: Error type ('streamlit_error').
 105        """
 106        
 107        # Get current page name from Streamlit context
 108        current_page = getattr(st, '_current_page', 'unknown_page')
 109        
 110        error_info = {
 111            'error': f"Streamlit Error: {error_message}",
 112            'traceback': traceback.format_stack(),
 113            'timestamp': datetime.now().isoformat(),
 114            'status': 'critical',
 115            'type': 'streamlit_error'
 116        }
 117
 118        # Initialize list for page if not exists
 119        if current_page not in cls._errors:
 120            cls._errors[current_page] = []
 121
 122        # Add new error
 123        cls._errors[current_page].append(error_info)
 124
 125    @classmethod
 126    def set_page_context(cls, page_name: str):
 127        """Set the current page context"""
 128        cls._current_page = page_name
 129
 130    @classmethod
 131    def monitor_page(cls, page_name: str):
 132        """
 133        Decorator to monitor and log exceptions for a specific Streamlit page.
 134        Args:
 135            page_name (str): The name of the page to monitor.
 136        Returns:
 137            Callable: A decorator that wraps the target function, sets the page context,
 138            clears previous non-Streamlit errors, and logs any exceptions that occur during execution.
 139        The decorator performs the following actions:
 140            - Sets the current page context using `cls.set_page_context`.
 141            - Clears previous exception errors for the page, retaining only those marked as 'streamlit_error'.
 142            - Executes the wrapped function.
 143            - If an exception occurs, logs detailed error information (error message, traceback, timestamp, status, type, and page)
 144              to `cls._errors` under the given page name, then re-raises the exception.
 145        """
 146        
 147        def decorator(func):
 148            """
 149            Decorator to manage page-specific error handling and context setting.
 150            This decorator sets the current page context before executing the decorated function.
 151            It clears previous exception errors for the page, retaining only Streamlit error calls.
 152            If an exception occurs during function execution, it captures error details including
 153            the error message, traceback, timestamp, status, type, and page name, and appends them
 154            to the page's error log. The exception is then re-raised.
 155            Args:
 156                func (Callable): The function to be decorated.
 157            Returns:
 158                Callable: The wrapped function with error handling and context management.
 159            """
 160            
 161            @functools.wraps(func)
 162            def wrapper(*args, **kwargs):
 163                # Set the current page context
 164                cls.set_page_context(page_name)
 165                try:
 166                    # Clear previous exception errors but keep st.error calls
 167                    if page_name in cls._errors:
 168                        cls._errors[page_name] = [
 169                            e for e in cls._errors[page_name]
 170                            if e.get('type') == 'streamlit_error'
 171                        ]
 172                    result = func(*args, **kwargs)
 173                    return result
 174                except Exception as e:
 175                    error_info = {
 176                        'error': str(e),
 177                        'traceback': traceback.format_exc(),
 178                        'timestamp': datetime.now().isoformat(),
 179                        'status': 'critical',
 180                        'type': 'exception',
 181                        'page': page_name
 182                    }
 183                    if page_name not in cls._errors:
 184                        cls._errors[page_name] = []
 185                    cls._errors[page_name].append(error_info)
 186                    raise
 187            return wrapper
 188        return decorator
 189
 190    @classmethod
 191    def get_page_errors(cls):
 192        """
 193        Collects and returns errors for each page that has recorded errors.
 194        Iterates through the internal `_errors` dictionary, and for each page with errors,
 195        constructs a list of error details including the error message, traceback, timestamp,
 196        and error type.
 197        Returns:
 198            dict: A dictionary where keys are page names and values are lists of error details.
 199                  Each error detail is a dictionary with the following keys:
 200                      - 'error' (str): The error message or 'Unknown error' if not present.
 201                      - 'traceback' (list): The traceback information or empty list if not present.
 202                      - 'timestamp' (str): The timestamp of the error or empty string if not present.
 203                      - 'type' (str): The type of error or 'unknown' if not present.
 204        """
 205        
 206        result = {}
 207        for page, errors in cls._errors.items():
 208            if errors:  # Only include pages with errors
 209                result[page] = [
 210                    {
 211                        'error': err.get('error', 'Unknown error'),
 212                        'traceback': err.get('traceback', []),
 213                        'timestamp': err.get('timestamp', ''),
 214                        'type': err.get('type', 'unknown')
 215                    }
 216                    for err in errors
 217                ]
 218        return result
 219
 220    @classmethod
 221    def clear_errors(cls, page_name: Optional[str] = None):
 222        """Clear errors for a specific page or all pages"""
 223        if page_name:
 224            if page_name in cls._errors:
 225                del cls._errors[page_name]
 226        else:
 227            cls._errors = {}
 228
 229class HealthCheckService:
 230    """
 231    HealthCheckService provides a comprehensive health monitoring solution for Streamlit applications.
 232    It periodically checks system resources, external dependencies, custom health checks, and Streamlit server/page status,
 233    updating and reporting the overall health status.
 234    Attributes:
 235        logger (logging.Logger): Logger for health check events.
 236        config_path (str): Path to the health check configuration file.
 237        health_data (Dict[str, Any]): Stores the latest health check results.
 238        config (Dict): Loaded health check configuration.
 239        check_interval (int): Interval (in seconds) between health checks.
 240        _running (bool): Indicates if the health check service is running.
 241        _thread (threading.Thread): Background thread for periodic checks.
 242        streamlit_url (str): URL of the Streamlit server.
 243        streamlit_port (int): Port of the Streamlit server.
 244    Methods:
 245        __init__(config_path: str = "health_check_config.json"):
 246            Initializes the HealthCheckService with configuration and default health data.
 247        _load_config() -> Dict:
 248            Loads health check configuration from file or returns default configuration.
 249        _get_default_config() -> Dict:
 250            Returns the default health check configuration.
 251        start():
 252            Starts the health check service in a background thread.
 253        stop():
 254            Stops the health check service.
 255        _run_checks_periodically():
 256            Runs health checks periodically based on the configured interval.
 257        run_all_checks():
 258            Executes all configured health checks and updates health data.
 259        check_cpu():
 260            Checks CPU usage and updates health data.
 261        check_memory():
 262            Checks memory usage and updates health data.
 263        check_disk():
 264            Checks disk usage and updates health data.
 265        check_dependencies():
 266            Checks external dependencies such as APIs and databases.
 267        _check_api_endpoint(endpoint: Dict):
 268            Checks if an API endpoint is accessible and updates health data.
 269        _check_database(db_config: Dict):
 270            Checks database connection (placeholder for actual implementation).
 271        register_custom_check(name: str, check_func: Callable[[], Dict[str, Any]]):
 272            Registers a custom health check function.
 273        run_custom_checks():
 274            Executes all registered custom health checks.
 275        _update_overall_status():
 276            Updates the overall health status based on individual checks.
 277        get_health_data() -> Dict:
 278            Returns the latest health check data, excluding function references.
 279        save_config():
 280            Saves the current configuration to file.
 281        check_streamlit_pages():
 282            Checks for errors in Streamlit pages and updates health data.
 283        check_streamlit_server() -> Dict[str, Any]:
 284            Checks if the Streamlit server is running and responding.
 285    """
 286    def __init__(self, config_path: str = "health_check_config.json"):
 287        """
 288        Initializes the HealthCheckService instance.
 289        Args:
 290            config_path (str): Path to the health check configuration file. Defaults to "health_check_config.json".
 291        Attributes:
 292            logger (logging.Logger): Logger for the HealthCheckService.
 293            config_path (str): Path to the configuration file.
 294            health_data (Dict[str, Any]): Dictionary storing health check data.
 295            config (dict): Loaded configuration from the config file.
 296            check_interval (int): Interval in seconds between health checks. Defaults to 60.
 297            _running (bool): Indicates if the health check service is running.
 298            _thread (threading.Thread or None): Thread running the health check loop.
 299            streamlit_url (str): URL of the Streamlit service. Defaults to "http://localhost".
 300            streamlit_port (int): Port of the Streamlit service. Defaults to 8501.
 301        """
 302        self.logger = logging.getLogger(f"{__name__}.HealthCheckService")
 303        self.logger.info("Initializing HealthCheckService")
 304        self.config_path = config_path
 305        self.health_data: Dict[str, Any] = {
 306            "last_updated": None,
 307            "system": {},
 308            "dependencies": {},
 309            "custom_checks": {},
 310            "overall_status": "unknown"
 311        }
 312        self.config = self._load_config()
 313        self.check_interval = self.config.get("check_interval", 60)  # Default: 60 seconds
 314        self._running = False
 315        self._thread = None
 316        self.streamlit_url = self.config.get("streamlit_url", "http://localhost")
 317        self.streamlit_port = self.config.get("streamlit_port", 8501)  # Default: 8501
 318    def _load_config(self) -> Dict:
 319        """Load health check configuration from file."""
 320        if os.path.exists(self.config_path):
 321            try:
 322                with open(self.config_path, "r") as f:
 323                    return json.load(f)
 324            except Exception as e:
 325                st.error(f"Error loading health check config: {str(e)}")
 326                return self._get_default_config()
 327        else:
 328            return self._get_default_config()
 329            
 330    def _get_default_config(self) -> Dict:
 331        """Return default health check configuration."""
 332        return {
 333            "check_interval": 60,
 334            "streamlit_url": "http://localhost",
 335            "streamlit_port": 8501,
 336            "system_checks": {
 337                "cpu": True,
 338                "memory": True,
 339                "disk": True
 340            },
 341            "dependencies": {
 342                "api_endpoints": [
 343                    # Example API endpoint to check
 344                    {"name": "example_api", "url": "https://httpbin.org/get", "timeout": 5}
 345                ],
 346                "databases": [
 347                    # Example database connection to check
 348                    {"name": "main_db", "type": "postgres", "connection_string": "..."}
 349                ]
 350            },
 351            "thresholds": {
 352                "cpu_warning": 70,
 353                "cpu_critical": 90,
 354                "memory_warning": 70,
 355                "memory_critical": 90,
 356                "disk_warning": 70,
 357                "disk_critical": 90
 358            }
 359        }
 360    
 361    def start(self):
 362        """Start the health check service in a background thread."""
 363        if self._running:
 364            return
 365            
 366        self._running = True
 367        self._thread = threading.Thread(target=self._run_checks_periodically, daemon=True)
 368        self._thread.start()
 369        
 370    def stop(self):
 371        """Stop the health check service."""
 372        self._running = False
 373        if self._thread:
 374            self._thread.join(timeout=1)
 375            
 376    def _run_checks_periodically(self):
 377        """Run health checks periodically based on check interval."""
 378        while self._running:
 379            self.run_all_checks()
 380            time.sleep(self.check_interval)
 381            
 382    def run_all_checks(self):
 383        """Run all configured health checks and update health data."""
 384        # Update timestamp
 385        self.health_data["last_updated"] = datetime.now().isoformat()
 386        
 387        # Check Streamlit server
 388        self.health_data["streamlit_server"] = self.check_streamlit_server()
 389        
 390        # System checks
 391        if self.config["system_checks"].get("cpu", True):
 392            self.check_cpu()
 393        if self.config["system_checks"].get("memory", True):
 394            self.check_memory()
 395        if self.config["system_checks"].get("disk", True):
 396            self.check_disk()
 397            
 398        # Rest of the existing checks...
 399        self.check_dependencies()
 400        self.run_custom_checks()
 401        self.check_streamlit_pages()
 402        self._update_overall_status()
 403        
 404    def check_cpu(self):
 405        """
 406        Checks the current CPU usage and updates the health status based on configured thresholds.
 407        Measures the CPU usage percentage over a 1-second interval using psutil. Compares the result
 408        against warning and critical thresholds defined in the configuration. Sets the status to
 409        'healthy', 'warning', or 'critical' accordingly, and updates the health data dictionary.
 410        Returns:
 411            None
 412        """
 413        
 414        cpu_percent = psutil.cpu_percent(interval=1)
 415        warning_threshold = self.config["thresholds"].get("cpu_warning", 70)
 416        critical_threshold = self.config["thresholds"].get("cpu_critical", 90)
 417        
 418        status = "healthy"
 419        if cpu_percent >= critical_threshold:
 420            status = "critical"
 421        elif cpu_percent >= warning_threshold:
 422            status = "warning"
 423            
 424        self.health_data["system"]["cpu"] = {
 425            "usage_percent": cpu_percent,
 426            "status": status
 427        }
 428        
 429    def check_memory(self):
 430        """
 431        Checks the system's memory usage and updates the health status accordingly.
 432        Retrieves the current memory usage statistics using psutil, compares the usage percentage
 433        against configured warning and critical thresholds, and sets the memory status to 'healthy',
 434        'warning', or 'critical'. Updates the health_data dictionary with total memory, available memory,
 435        usage percentage, and status.
 436        Returns:
 437            None
 438        """
 439        
 440        memory = psutil.virtual_memory()
 441        memory_percent = memory.percent
 442        warning_threshold = self.config["thresholds"].get("memory_warning", 70)
 443        critical_threshold = self.config["thresholds"].get("memory_critical", 90)
 444        
 445        status = "healthy"
 446        if memory_percent >= critical_threshold:
 447            status = "critical"
 448        elif memory_percent >= warning_threshold:
 449            status = "warning"
 450            
 451        self.health_data["system"]["memory"] = {
 452            "total_gb": round(memory.total / (1024**3), 2),
 453            "available_gb": round(memory.available / (1024**3), 2),
 454            "usage_percent": memory_percent,
 455            "status": status
 456        }
 457        
 458    def check_disk(self):
 459        """
 460        Checks the disk usage of the root filesystem and updates the health status.
 461        Retrieves disk usage statistics using psutil, compares the usage percentage
 462        against configured warning and critical thresholds, and sets the disk status
 463        accordingly ("healthy", "warning", or "critical"). Updates the health_data
 464        dictionary with total disk size, free space, usage percentage, and status.
 465        Returns:
 466            None
 467        """
 468        
 469        disk = psutil.disk_usage('/')
 470        disk_percent = disk.percent
 471        warning_threshold = self.config["thresholds"].get("disk_warning", 70)
 472        critical_threshold = self.config["thresholds"].get("disk_critical", 90)
 473        
 474        status = "healthy"
 475        if disk_percent >= critical_threshold:
 476            status = "critical"
 477        elif disk_percent >= warning_threshold:
 478            status = "warning"
 479            
 480        self.health_data["system"]["disk"] = {
 481            "total_gb": round(disk.total / (1024**3), 2),
 482            "free_gb": round(disk.free / (1024**3), 2),
 483            "usage_percent": disk_percent,
 484            "status": status
 485        }
 486        
 487    def check_dependencies(self):
 488        """
 489        Checks the health of configured dependencies, including API endpoints and databases.
 490        Iterates through the list of API endpoints and databases specified in the configuration,
 491        and performs health checks on each by invoking the corresponding internal methods.
 492        Raises:
 493            Exception: If any dependency check fails.
 494        """
 495        
 496        # Check API endpoints
 497        for endpoint in self.config["dependencies"].get("api_endpoints", []):
 498            self._check_api_endpoint(endpoint)
 499            
 500        # Check database connections
 501        for db in self.config["dependencies"].get("databases", []):
 502            self._check_database(db)
 503            
 504    def _check_api_endpoint(self, endpoint: Dict):
 505        """
 506        Check if an API endpoint is accessible.
 507        
 508        Args:
 509            endpoint: Dictionary with endpoint configuration
 510        """
 511        name = endpoint.get("name", "unknown_api")
 512        url = endpoint.get("url", "")
 513        timeout = endpoint.get("timeout", 5)
 514        
 515        if not url:
 516            return
 517            
 518        try:
 519            start_time = time.time()
 520            response = requests.get(url, timeout=timeout)
 521            response_time = time.time() - start_time
 522            
 523            status = "healthy" if response.status_code < 400 else "critical"
 524            
 525            self.health_data["dependencies"][name] = {
 526                "type": "api",
 527                "url": url,
 528                "status": status,
 529                "response_time_ms": round(response_time * 1000, 2),
 530                "status_code": response.status_code
 531            }
 532        except Exception as e:
 533            self.health_data["dependencies"][name] = {
 534                "type": "api",
 535                "url": url,
 536                "status": "critical",
 537                "error": str(e)
 538            }
 539            
 540    def _check_database(self, db_config: Dict):
 541        """
 542        Check database connection.
 543        Note: This is a placeholder. You'll need to implement specific database checks
 544        based on your application's needs.
 545        
 546        Args:
 547            db_config: Dictionary with database configuration
 548        """
 549        name = db_config.get("name", "unknown_db")
 550        db_type = db_config.get("type", "")
 551        
 552        # Placeholder for database connection check
 553        # In a real implementation, you would check the specific database connection
 554        self.health_data["dependencies"][name] = {
 555            "type": "database",
 556            "db_type": db_type,
 557            "status": "unknown",
 558            "message": "Database check not implemented"
 559        }
 560        
 561    def register_custom_check(self, name: str, check_func: Callable[[], Dict[str, Any]]):
 562        """
 563        Register a custom health check function.
 564        
 565        Args:
 566            name: Name of the custom check
 567            check_func: Function that performs the check and returns a dictionary with results
 568        """
 569        if "custom_checks" not in self.health_data:
 570            self.health_data["custom_checks"] = {}
 571            
 572        self.health_data["custom_checks"][name] = {
 573            "status": "unknown",
 574            "check_func": check_func
 575        }
 576        
 577    def run_custom_checks(self):
 578        """Run all registered custom health checks."""
 579        if "custom_checks" not in self.health_data:
 580            return
 581            
 582        for name, check_info in list(self.health_data["custom_checks"].items()):
 583            if "check_func" in check_info and callable(check_info["check_func"]):
 584                try:
 585                    result = check_info["check_func"]()
 586                    # Remove the function reference from the result
 587                    func = check_info["check_func"]
 588                    self.health_data["custom_checks"][name] = result
 589                    # Add the function back
 590                    self.health_data["custom_checks"][name]["check_func"] = func
 591                except Exception as e:
 592                    self.health_data["custom_checks"][name] = {
 593                        "status": "critical",
 594                        "error": str(e),
 595                        "check_func": check_info["check_func"]
 596                    }
 597                    
 598    def _update_overall_status(self):
 599        """
 600        Updates the overall health status of the application based on the statuses of various components.
 601        The method checks the health status of the following components:
 602            - Streamlit server
 603            - System checks
 604            - Dependencies
 605            - Custom checks (excluding those with a 'check_func' key)
 606            - Streamlit pages
 607        The overall status is determined using the following priority order:
 608            1. "critical" if any component is critical
 609            2. "warning" if any component is warning and none are critical
 610            3. "unknown" if any component is unknown and none are critical or warning, and no healthy components exist
 611            4. "healthy" if any component is healthy and none are critical, warning, or unknown
 612            5. "unknown" if no statuses are found
 613        The result is stored in `self.health_data["overall_status"]`.
 614        """
 615        
 616        has_critical = False
 617        has_warning = False
 618        has_healthy = False
 619        has_unknown = False
 620        
 621        # Helper function to check status
 622        def check_component_status(status):
 623            nonlocal has_critical, has_warning, has_healthy, has_unknown
 624            if status == "critical":
 625                has_critical = True
 626            elif status == "warning":
 627                has_warning = True
 628            elif status == "healthy":
 629                has_healthy = True
 630            elif status == "unknown":
 631                has_unknown = True
 632
 633        # Check Streamlit server status
 634        server_status = self.health_data.get("streamlit_server", {}).get("status")
 635        check_component_status(server_status)
 636        
 637        # Check system status
 638        for system_check in self.health_data.get("system", {}).values():
 639            check_component_status(system_check.get("status"))
 640                    
 641        # Check dependencies status
 642        for dep_check in self.health_data.get("dependencies", {}).values():
 643            check_component_status(dep_check.get("status"))
 644                    
 645        # Check custom checks status
 646        for custom_check in self.health_data.get("custom_checks", {}).values():
 647            if isinstance(custom_check, dict) and "check_func" not in custom_check:
 648                check_component_status(custom_check.get("status"))
 649        
 650        # Check Streamlit pages status
 651        pages_status = self.health_data.get("streamlit_pages", {}).get("status")
 652        check_component_status(pages_status)
 653                        
 654        # Determine overall status with priority:
 655        # critical > warning > unknown > healthy
 656        if has_critical:
 657            self.health_data["overall_status"] = "critical"
 658        elif has_warning:
 659            self.health_data["overall_status"] = "warning"
 660        elif has_unknown and not has_healthy:
 661            self.health_data["overall_status"] = "unknown"
 662        elif has_healthy:
 663            self.health_data["overall_status"] = "healthy"
 664        else:
 665            self.health_data["overall_status"] = "unknown"
 666                
 667    def get_health_data(self) -> Dict:
 668        """Get the latest health check data."""
 669        # Create a copy without the function references
 670        result: Dict[str, Any] = {}
 671        for key, value in self.health_data.items():
 672            if key == "custom_checks":
 673                result[key] = {}
 674                for check_name, check_data in value.items():
 675                    if isinstance(check_data, dict):
 676                        check_copy = check_data.copy()
 677                        if "check_func" in check_copy:
 678                            del check_copy["check_func"]
 679                        result[key][check_name] = check_copy
 680            else:
 681                result[key] = value
 682        return result
 683        
 684    def save_config(self):
 685        """
 686        Saves the current health check configuration to a JSON file.
 687        Attempts to write the configuration stored in `self.config` to the file specified by `self.config_path`.
 688        Displays a success message in the Streamlit app upon successful save.
 689        Handles and displays appropriate error messages for file not found, permission issues, JSON decoding errors, and other exceptions.
 690        Raises:
 691            FileNotFoundError: If the configuration file path does not exist.
 692            PermissionError: If there are insufficient permissions to write to the file.
 693            json.JSONDecodeError: If there is an error decoding the JSON data.
 694            Exception: For any other exceptions encountered during the save process.
 695        """
 696        
 697        try:
 698            with open(self.config_path, "w") as f:
 699                json.dump(self.config, f, indent=2)
 700                st.success(f"Health check config saved successfully to {self.config_path}")
 701        except FileNotFoundError:
 702            st.error(f"Configuration file not found: {self.config_path}")
 703        except PermissionError:
 704            st.error(f"Permission denied: Unable to write to {self.config_path}")
 705        except json.JSONDecodeError:
 706            st.error(f"Error decoding JSON in config file: {self.config_path}")
 707        except Exception as e:
 708            st.error(f"Error saving health check config: {str(e)}")
 709    def check_streamlit_pages(self):
 710        """
 711        Checks for errors in Streamlit pages and updates the health data accordingly.
 712        This method retrieves page errors using StreamlitPageMonitor.get_page_errors().
 713        If errors are found, it sets the 'streamlit_pages' status to 'critical' and updates
 714        the overall health status to 'critical'. If no errors are found, it marks the
 715        'streamlit_pages' status as 'healthy'.
 716        Updates:
 717            self.health_data["streamlit_pages"]: Dict containing status, error count, errors, and details.
 718            self.health_data["overall_status"]: Set to 'critical' if errors are detected.
 719        Returns:
 720            None
 721        """
 722        
 723        page_errors = StreamlitPageMonitor.get_page_errors()
 724        
 725        if "streamlit_pages" not in self.health_data:
 726            self.health_data["streamlit_pages"] = {}
 727        
 728        if page_errors:
 729            self.health_data["streamlit_pages"] = {
 730                "status": "critical",
 731                "error_count": len(page_errors),
 732                "errors": page_errors,
 733                "details": "Errors detected in Streamlit pages"
 734            }
 735            # This affects overall status
 736            self.health_data["overall_status"] = "critical"
 737        else:
 738            self.health_data["streamlit_pages"] = {
 739                "status": "healthy",
 740                "error_count": 0,
 741                "errors": {},
 742                "details": "All pages functioning normally"
 743            }
 744    
 745    def check_streamlit_server(self) -> Dict[str, Any]:
 746        """
 747        Checks the health status of the Streamlit server by sending a GET request to the /healthz endpoint.
 748        Returns:
 749            Dict[str, Any]: A dictionary containing the health status, response code, latency in milliseconds,
 750                            message, and the URL checked. If the server is healthy (HTTP 200), status is "healthy".
 751                            Otherwise, status is "critical" with error details.
 752        Handles:
 753            - Connection errors: Returns critical status with connection error details.
 754            - Timeout errors: Returns critical status with timeout error details.
 755            - Other exceptions: Returns critical status with unknown error details.
 756        Logs:
 757            - The URL being checked.
 758            - The response status code and text.
 759            - Health status and response time if healthy.
 760            - Warnings and errors for unhealthy or failed checks.
 761        """
 762        
 763        try:
 764            host = self.streamlit_url.rstrip('/')
 765            if not host.startswith(('http://', 'https://')):
 766                host = f"http://{host}"
 767            
 768            url = f"{host}:{self.streamlit_port}/healthz"
 769            self.logger.info(f"Checking Streamlit server health at: {url}")
 770            
 771            start_time = time.time()
 772            response = requests.get(url, timeout=3)
 773            total_time = (time.time() - start_time) * 1000
 774            self.logger.info(f"{response.status_code} - {response.text}")
 775            # Check if the response is healthy
 776            if response.status_code == 200:
 777                self.logger.info(f"Streamlit server healthy - Response time: {round(total_time, 2)}ms")
 778                return {
 779                    "status": "healthy",
 780                    "response_code": response.status_code,
 781                    "latency_ms": round(total_time, 2),
 782                    "message": "Streamlit server is running",
 783                    "url": url
 784                }
 785            else:
 786                self.logger.warning(f"Unhealthy response from server: {response.status_code}")
 787                return {
 788                    "status": "critical",
 789                    "response_code": response.status_code,
 790                    "error": f"Unhealthy response from server: {response.status_code}",
 791                    "message": "Streamlit server is not healthy",
 792                    "url": url
 793                }
 794
 795        except requests.exceptions.ConnectionError as e:
 796            self.logger.error(f"Connection error while checking Streamlit server: {str(e)}")
 797            return {
 798                "status": "critical",
 799                "error": f"Connection error: {str(e)}",
 800                "message": "Cannot connect to Streamlit server",
 801                "url": url
 802            }
 803        except requests.exceptions.Timeout as e:
 804            self.logger.error(f"Timeout while checking Streamlit server: {str(e)}")
 805            return {
 806                "status": "critical",
 807                "error": f"Timeout error: {str(e)}",
 808                "message": "Streamlit server is not responding",
 809                "url": url
 810            }
 811        except Exception as e:
 812            self.logger.error(f"Unexpected error while checking Streamlit server: {str(e)}")
 813            return {
 814                "status": "critical",
 815                "error": f"Unknown error: {str(e)}",
 816                "message": "Failed to check Streamlit server",
 817                "url": url
 818            }
 819    
 820def health_check(config_path:str = "health_check_config.json"):
 821    """
 822    Displays an interactive Streamlit dashboard for monitoring application health.
 823    This function initializes and manages a health check service, presenting real-time system metrics,
 824    dependency statuses, custom checks, and Streamlit page health in a user-friendly dashboard.
 825    Users can manually refresh health checks, view detailed error information, and adjust configuration
 826    thresholds and intervals directly from the UI.
 827    Args:
 828        config_path (str, optional): Path to the health check configuration JSON file.
 829            Defaults to "health_check_config.json".
 830    Features:
 831        - Displays overall health status with color-coded indicators.
 832        - Shows last updated timestamp for health data.
 833        - Monitors Streamlit server status, latency, and errors.
 834        - Provides tabs for:
 835            * System Resources (CPU, Memory, Disk usage and status)
 836            * Dependencies (external services and their health)
 837            * Custom Checks (user-defined health checks)
 838            * Streamlit Pages (page-specific errors and status)
 839        - Allows configuration of system thresholds, check intervals, and Streamlit server settings.
 840        - Supports manual refresh and saving configuration changes.
 841    Raises:
 842        Displays error messages in the UI for any exceptions encountered during health data retrieval or processing.
 843    Returns:
 844        None. The dashboard is rendered in the Streamlit app.
 845    """
 846    
 847    logger = logging.getLogger(f"{__name__}.health_check")
 848    logger.info("Starting health check dashboard")
 849    st.title("Application Health Dashboard")
 850    
 851    # Initialize or get the health check service
 852    if "health_service" not in st.session_state:
 853        logger.info("Initializing new health check service")
 854        st.session_state.health_service = HealthCheckService(config_path = config_path)
 855        st.session_state.health_service.start()
 856    
 857    health_service = st.session_state.health_service
 858    
 859    # Add controls for manual refresh and configuration
 860    col1, col2 = st.columns([3, 1])
 861    with col1:
 862        st.subheader("System Health Status")
 863    with col2:
 864        if st.button("Refresh Now"):
 865            health_service.run_all_checks()
 866    
 867    # Get the latest health data
 868    health_data = health_service.get_health_data()
 869    
 870    # Display overall status with appropriate color
 871    overall_status = health_data.get("overall_status", "unknown")
 872    status_color = {
 873        "healthy": "green",
 874        "warning": "orange",
 875        "critical": "red",
 876        "unknown": "gray"
 877    }.get(overall_status, "gray")
 878    
 879    st.markdown(
 880        f"<h3 style='color: {status_color};'>Overall Status: {overall_status.upper()}</h3>",
 881        unsafe_allow_html=True
 882    )
 883    
 884    # Display last updated time
 885    if health_data.get("last_updated"):
 886        try:
 887            last_updated = datetime.fromisoformat(health_data["last_updated"])
 888            st.text(f"Last updated: {last_updated.strftime('%Y-%m-%d %H:%M:%S')}")
 889        except Exception as e:
 890            st.error(f"Last updated: {health_data['last_updated']}")
 891            st.exception(e)
 892    
 893    server_health = health_data.get("streamlit_server", {})
 894    server_status = server_health.get("status", "unknown")
 895    server_color = {
 896        "healthy": "green",
 897        "critical": "red",
 898        "unknown": "gray"
 899    }.get(server_status, "gray")
 900
 901    st.markdown(
 902        f"### Streamlit Server Status: <span style='color: {server_color}'>{server_status.upper()}</span>",
 903        unsafe_allow_html=True
 904    )
 905
 906    if server_status != "healthy":
 907        st.error(server_health.get("message", "Server status unknown"))
 908        if "error" in server_health:
 909            st.code(server_health["error"])
 910    else:
 911        st.success(server_health.get("message", "Server is running"))
 912        if "latency_ms" in server_health:
 913            latency = server_health["latency_ms"]
 914            # Define color based on latency thresholds
 915            if latency <= 50:
 916                latency_color = "green"
 917                performance = "Excellent"
 918            elif latency <= 100:
 919                latency_color = "blue"
 920                performance = "Good"
 921            elif latency <= 200:
 922                latency_color = "orange"
 923                performance = "Fair"
 924            else:
 925                latency_color = "red"
 926                performance = "Poor"
 927                
 928            st.markdown(
 929                f"""
 930                <div style='display: flex; align-items: center; gap: 10px;'>
 931                    <div>Server Response Time:</div>
 932                    <div style='color: {latency_color}; font-weight: bold;'>
 933                        {latency} ms
 934                    </div>
 935                    <div style='color: {latency_color};'>
 936                        ({performance})
 937                    </div>
 938                </div>
 939                """,
 940                unsafe_allow_html=True
 941            )
 942    
 943    # Create tabs for different categories of health checks
 944    tab1, tab2, tab3, tab4 = st.tabs(["System Resources", "Dependencies", "Custom Checks", "Streamlit Pages"])
 945    
 946    with tab1:
 947        # Display system health checks
 948        system_data = health_data.get("system", {})
 949        
 950        # CPU
 951        if "cpu" in system_data:
 952            cpu_data = system_data["cpu"]
 953            cpu_status = cpu_data.get("status", "unknown")
 954            cpu_color = {"healthy": "green", "warning": "orange", "critical": "red"}.get(cpu_status, "gray")
 955            
 956            st.markdown(f"### CPU Status: <span style='color:{cpu_color}'>{cpu_status.upper()}</span>", unsafe_allow_html=True)
 957            st.progress(cpu_data.get("usage_percent", 0) / 100)
 958            st.text(f"CPU Usage: {cpu_data.get('usage_percent', 0)}%")
 959        
 960        # Memory
 961        if "memory" in system_data:
 962            memory_data = system_data["memory"]
 963            memory_status = memory_data.get("status", "unknown")
 964            memory_color = {"healthy": "green", "warning": "orange", "critical": "red"}.get(memory_status, "gray")
 965            
 966            st.markdown(f"### Memory Status: <span style='color:{memory_color}'>{memory_status.upper()}</span>", unsafe_allow_html=True)
 967            st.progress(memory_data.get("usage_percent", 0) / 100)
 968            st.text(f"Memory Usage: {memory_data.get('usage_percent', 0)}%")
 969            st.text(f"Total Memory: {memory_data.get('total_gb', 0)} GB")
 970            st.text(f"Available Memory: {memory_data.get('available_gb', 0)} GB")
 971        
 972        # Disk
 973        if "disk" in system_data:
 974            disk_data = system_data["disk"]
 975            disk_status = disk_data.get("status", "unknown")
 976            disk_color = {"healthy": "green", "warning": "orange", "critical": "red"}.get(disk_status, "gray")
 977            
 978            st.markdown(f"### Disk Status: <span style='color:{disk_color}'>{disk_status.upper()}</span>", unsafe_allow_html=True)
 979            st.progress(disk_data.get("usage_percent", 0) / 100)
 980            st.text(f"Disk Usage: {disk_data.get('usage_percent', 0)}%")
 981            st.text(f"Total Disk Space: {disk_data.get('total_gb', 0)} GB")
 982            st.text(f"Free Disk Space: {disk_data.get('free_gb', 0)} GB")
 983    
 984    with tab2:
 985        # Display dependency health checks
 986        dependencies = health_data.get("dependencies", {})
 987        if dependencies:
 988            # Create a dataframe for all dependencies
 989            dep_data = []
 990            for name, dep_info in dependencies.items():
 991                dep_data.append({
 992                    "Name": name,
 993                    "Type": dep_info.get("type", "unknown"),
 994                    "Status": dep_info.get("status", "unknown"),
 995                    "Details": ", ".join([f"{k}: {v}" for k, v in dep_info.items() 
 996                               if k not in ["name", "type", "status", "error"] and not isinstance(v, dict)])
 997                })
 998            
 999            if dep_data:
1000                df = pd.DataFrame(dep_data)
1001                
1002                # Apply color formatting to status column
1003                def color_status(val):
1004                    colors = {
1005                        "healthy": "background-color: #c6efce; color: #006100",
1006                        "warning": "background-color: #ffeb9c; color: #9c5700",
1007                        "critical": "background-color: #ffc7ce; color: #9c0006",
1008                        "unknown": "background-color: #eeeeee; color: #7f7f7f"
1009                    }
1010                    return colors.get(val.lower(), "")
1011                
1012                st.dataframe(df.style.map(color_status, subset=["Status"]))
1013            else:
1014                st.info("No dependencies configured")
1015        else:
1016            st.info("No dependencies configured")
1017    
1018    with tab3:
1019        # Display custom checks
1020        custom_checks = health_data.get("custom_checks", {})
1021        if custom_checks:
1022            # Create a dataframe for all custom checks
1023            check_data = []
1024            for name, check_info in custom_checks.items():
1025                if isinstance(check_info, dict) and "check_func" not in check_info:
1026                    check_data.append({
1027                        "Name": name,
1028                        "Status": check_info.get("status", "unknown"),
1029                        "Details": ", ".join([f"{k}: {v}" for k, v in check_info.items() 
1030                                  if k not in ["name", "status", "check_func", "error"] and not isinstance(v, dict)]),
1031                        "Error": check_info.get("error", "")
1032                    })
1033            
1034            if check_data:
1035                df = pd.DataFrame(check_data)
1036                
1037                # Apply color formatting to status column
1038                def color_status(val):
1039                    colors = {
1040                        "healthy": "background-color: #c6efce; color: #006100",
1041                        "warning": "background-color: #ffeb9c; color: #9c5700",
1042                        "critical": "background-color: #ffc7ce; color: #9c0006",
1043                        "unknown": "background-color: #eeeeee; color: #7f7f7f"
1044                    }
1045                    return colors.get(val.lower(), "")
1046                
1047                st.dataframe(df.style.map(color_status, subset=["Status"]))
1048            else:
1049                st.info("No custom checks configured")
1050        else:
1051            st.info("No custom checks configured")
1052    with tab4:
1053        page_health = health_data.get("streamlit_pages", {})
1054        status = page_health.get("status", "unknown")
1055        error_count = page_health.get("error_count", 0)  
1056        status_color = {
1057            "healthy": "green",
1058            "critical": "red",
1059            "unknown": "gray"
1060        }.get(status, "gray")
1061        
1062        st.markdown(f"### Page Status: <span style='color:{status_color}'>{status.upper()}</span>", unsafe_allow_html=True)
1063        st.metric("Error Count", error_count)
1064        if error_count > 0:
1065            st.error("Pages with errors:")
1066            errors_dict = page_health.get("errors", {})
1067            
1068            if not isinstance(errors_dict, dict):
1069                st.error("Invalid error data format")
1070                return
1071            
1072            for page_name, page_errors in errors_dict.items():
1073                # Create a meaningful page name for display
1074                display_name = page_name.split("/")[-1] if "/" in page_name else page_name
1075                
1076                for error_info in page_errors:
1077                    if isinstance(error_info, dict):
1078                        with st.expander(f"Error in {display_name}"):
1079                            # Display error message without the "Streamlit Error:" prefix
1080                            st.error(error_info.get('error', 'Unknown error'))
1081                            
1082                            # Show additional error details
1083                            if error_info.get('type') == 'streamlit_error':
1084                                st.text("Type: Streamlit Error")
1085                            else:
1086                                st.text("Type: Exception")
1087                                
1088                            st.text("Traceback:")
1089                            st.code("".join(error_info.get('traceback', ['No traceback available'])))
1090                            st.text(f"Timestamp: {error_info.get('timestamp', 'No timestamp')}")
1091    
1092    # Configuration section
1093    with st.expander("Health Check Configuration"):
1094        st.subheader("System Check Thresholds")
1095        
1096        col1, col2 = st.columns(2)
1097        with col1:
1098            cpu_warning = st.slider("CPU Warning Threshold (%)", 
1099                                min_value=10, max_value=90, 
1100                                value=health_service.config["thresholds"].get("cpu_warning", 70),
1101                                step=5)
1102            memory_warning = st.slider("Memory Warning Threshold (%)", 
1103                                   min_value=10, max_value=90, 
1104                                   value=health_service.config["thresholds"].get("memory_warning", 70),
1105                                   step=5)
1106            disk_warning = st.slider("Disk Warning Threshold (%)", 
1107                                 min_value=10, max_value=90, 
1108                                 value=health_service.config["thresholds"].get("disk_warning", 70),
1109                                 step=5)
1110            streamlit_url_update = st.text_input(
1111                "Streamlit Server URL",
1112                value=health_service.config.get("streamlit_url", "http://localhost")
1113            )
1114        
1115        with col2:
1116            cpu_critical = st.slider("CPU Critical Threshold (%)", 
1117                                 min_value=20, max_value=95, 
1118                                 value=health_service.config["thresholds"].get("cpu_critical", 90),
1119                                 step=5)
1120            memory_critical = st.slider("Memory Critical Threshold (%)", 
1121                                    min_value=20, max_value=95, 
1122                                    value=health_service.config["thresholds"].get("memory_critical", 90),
1123                                    step=5)
1124            disk_critical = st.slider("Disk Critical Threshold (%)", 
1125                                  min_value=20, max_value=95, 
1126                                  value=health_service.config["thresholds"].get("disk_critical", 90),
1127                                  step=5)
1128        
1129            check_interval = st.slider("Check Interval (seconds)", 
1130                                min_value=10, max_value=300, 
1131                                value=health_service.config.get("check_interval", 60),
1132                                step=10)
1133            streamlit_port_update = st.number_input(
1134                "Streamlit Server Port",
1135                value=health_service.config.get("streamlit_port", 8501),
1136                step=1
1137            )
1138        
1139        if st.button("Save Configuration"):
1140            # Update configuration
1141            health_service.config["thresholds"]["cpu_warning"] = cpu_warning
1142            health_service.config["thresholds"]["cpu_critical"] = cpu_critical
1143            health_service.config["thresholds"]["memory_warning"] = memory_warning
1144            health_service.config["thresholds"]["memory_critical"] = memory_critical
1145            health_service.config["thresholds"]["disk_warning"] = disk_warning
1146            health_service.config["thresholds"]["disk_critical"] = disk_critical
1147            health_service.config["check_interval"] = check_interval
1148            health_service.config["streamlit_url"] = streamlit_url_update
1149            health_service.config["streamlit_port"] = streamlit_port_update
1150            
1151            # Save to file
1152            health_service.save_config()
1153            st.success("Configuration saved successfully")
1154            
1155            # Restart the service if interval changed
1156            health_service.stop()
1157            health_service.start()
logger = <Logger streamlit_healthcheck.healthcheck (INFO)>
class StreamlitPageMonitor:
    """
    Singleton class to monitor and record errors and exceptions occurring in Streamlit pages.
    This class monkey-patches `st.error` to capture error messages and provides decorators and methods
    to track exceptions and errors per Streamlit page. Errors are stored in a class-level dictionary
    and can be retrieved or cleared as needed.
    Attributes:
        _instance (StreamlitPageMonitor): Singleton instance of the monitor.
        _errors (Dict[str, List[Dict[str, Any]]]): Dictionary mapping page names to lists of error records.
        _st_error (Callable): Original `st.error` function before monkey-patching.
        _current_page (str): Name of the current page being monitored (None until a page sets it).
    Methods:
        __new__(cls):
            Ensures singleton behavior and monkey-patches `st.error` to record error messages.
        _record_error(cls, page_name, error, tb, error_type):
            Appends one error record to the list of the given page.
        _handle_st_error(cls, error_message: str):
            Records a Streamlit error message for the current page.
        set_page_context(cls, page_name: str):
            Sets the current page context for error recording.
        monitor_page(cls, page_name: str):
            Decorator to monitor a Streamlit page for exceptions and `st.error` calls.
            Records exceptions and errors under the specified page name.
        get_page_errors(cls):
            Retrieves all recorded errors for all pages, grouped by page name.
        clear_errors(cls, page_name: Optional[str] = None):
            Clears recorded errors for a specific page or all pages.
    """

    _instance = None
    _errors: Dict[str, List[Dict[str, Any]]] = {}
    _st_error = st.error
    _current_page = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(StreamlitPageMonitor, cls).__new__(cls)

            # Monkey patch st.error to capture error messages
            def patched_error(*args, **kwargs):
                error_message = " ".join(str(arg) for arg in args)
                cls._record_error(
                    cls._current_page,
                    error_message,
                    traceback.format_stack(),
                    'streamlit_error',
                )

                # Call original st.error
                return cls._st_error(*args, **kwargs)

            st.error = patched_error
        return cls._instance

    @classmethod
    def _record_error(cls, page_name: Optional[str], error: str, tb: Any, error_type: str):
        """
        Appends one error record to the list kept for a page.
        Args:
            page_name (Optional[str]): Page the error belongs to; None is filed under 'unknown_page'.
            error (str): The error message.
            tb: Traceback information (a list of stack lines or a formatted string).
            error_type (str): 'streamlit_error' or 'exception'.
        Error Information Stored:
            - error, traceback, timestamp (ISO format), status ('critical'), type, page.
        """
        # The stored 'page' value always matches the key the record is filed under.
        page = "unknown_page" if page_name is None else page_name
        cls._errors.setdefault(page, []).append({
            'error': error,
            'traceback': tb,
            'timestamp': datetime.now().isoformat(),
            'status': 'critical',
            'type': error_type,
            'page': page
        })

    @classmethod
    def _handle_st_error(cls, error_message: str):
        """
        Handles Streamlit-specific errors by recording error details for the current page.
        The page is the one set through `set_page_context`; when none has been set the
        error is filed under 'unknown_page'.
        Args:
            error_message (str): The error message to be logged.
        Side Effects:
            Updates the class-level _errors dictionary with error information for the current Streamlit page.
        Error Information Stored:
            - error: Error message prefixed with "Streamlit Error: ".
            - traceback: Stack trace at the point of error.
            - timestamp: Time when the error occurred (ISO format).
            - status: Error severity ('critical').
            - type: Error type ('streamlit_error').
            - page: Name of the page the error was filed under.
        """

        # The page context lives on this class; the `st` module has no such attribute.
        cls._record_error(
            cls._current_page,
            f"Streamlit Error: {error_message}",
            traceback.format_stack(),
            'streamlit_error',
        )

    @classmethod
    def set_page_context(cls, page_name: str):
        """Set the current page context"""
        cls._current_page = page_name

    @classmethod
    def monitor_page(cls, page_name: str):
        """
        Decorator to monitor and log exceptions for a specific Streamlit page.
        Args:
            page_name (str): The name of the page to monitor.
        Returns:
            Callable: A decorator that wraps the target function, sets the page context,
            clears previous non-Streamlit errors, and logs any exceptions that occur during execution.
        The decorator performs the following actions:
            - Sets the current page context using `cls.set_page_context`.
            - Clears previous exception errors for the page, retaining only those marked as 'streamlit_error'.
            - Executes the wrapped function.
            - If an exception occurs, logs detailed error information (error message, traceback, timestamp, status, type, and page)
              to `cls._errors` under the given page name, then re-raises the exception.
        """

        def decorator(func):
            """
            Wraps `func` so that it runs inside the page context of `page_name`.
            Args:
                func (Callable): The function to be decorated.
            Returns:
                Callable: The wrapped function with error handling and context management.
            """

            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                # Set the current page context
                cls.set_page_context(page_name)

                # Clear previous exception errors but keep st.error calls
                if page_name in cls._errors:
                    cls._errors[page_name] = [
                        e for e in cls._errors[page_name]
                        if e.get('type') == 'streamlit_error'
                    ]

                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    cls._record_error(page_name, str(e), traceback.format_exc(), 'exception')
                    raise
            return wrapper
        return decorator

    @classmethod
    def get_page_errors(cls):
        """
        Collects and returns errors for each page that has recorded errors.
        Pages whose error list is empty are omitted.
        Returns:
            dict: A dictionary where keys are page names and values are lists of error details.
                  Each error detail is a dictionary with the following keys:
                      - 'error' (str): The error message or 'Unknown error' if not present.
                      - 'traceback': The traceback information (list of stack lines for
                        `st.error` calls, formatted string for exceptions) or empty list if not present.
                      - 'timestamp' (str): The timestamp of the error or empty string if not present.
                      - 'type' (str): The type of error or 'unknown' if not present.
        """

        return {
            page: [
                {
                    'error': err.get('error', 'Unknown error'),
                    'traceback': err.get('traceback', []),
                    'timestamp': err.get('timestamp', ''),
                    'type': err.get('type', 'unknown')
                }
                for err in errors
            ]
            for page, errors in cls._errors.items()
            if errors  # Only include pages with errors
        }

    @classmethod
    def clear_errors(cls, page_name: Optional[str] = None):
        """
        Clear errors for a specific page or all pages.
        Args:
            page_name (Optional[str]): Page whose errors are removed. When None (the default),
                the errors of every page are removed.
        """
        if page_name is not None:
            cls._errors.pop(page_name, None)
        else:
            # Clear in place so the shared class-level dictionary keeps its identity.
            cls._errors.clear()

Singleton class to monitor and record errors and exceptions occurring in Streamlit pages. This class monkey-patches st.error to capture error messages and provides decorators and methods to track exceptions and errors per Streamlit page. Errors are stored in a class-level dictionary and can be retrieved or cleared as needed. Attributes: _instance (StreamlitPageMonitor): Singleton instance of the monitor. _errors (Dict[str, List[Dict[str, Any]]]): Dictionary mapping page names to lists of error records. _st_error (Callable): Original st.error function before monkey-patching. _current_page (str): Name of the current page being monitored. Methods: __new__(cls): Ensures singleton behavior and monkey-patches st.error to record error messages. _handle_st_error(cls, error_message: str): Handles calls to st.error and records error information for the current page. set_page_context(cls, page_name: str): Sets the current page context for error recording. monitor_page(cls, page_name: str): Decorator to monitor a Streamlit page for exceptions and st.error calls. Records exceptions and errors under the specified page name. get_page_errors(cls): Retrieves all recorded errors for all pages, grouped by page name. clear_errors(cls, page_name: Optional[str] = None): Clears recorded errors for a specific page or all pages.

@classmethod
def set_page_context(cls, page_name: str):
126    @classmethod
127    def set_page_context(cls, page_name: str):
128        """Set the current page context"""
129        cls._current_page = page_name

Set the current page context

@classmethod
def monitor_page(cls, page_name: str):
131    @classmethod
132    def monitor_page(cls, page_name: str):
133        """
134        Decorator to monitor and log exceptions for a specific Streamlit page.
135        Args:
136            page_name (str): The name of the page to monitor.
137        Returns:
138            Callable: A decorator that wraps the target function, sets the page context,
139            clears previous non-Streamlit errors, and logs any exceptions that occur during execution.
140        The decorator performs the following actions:
141            - Sets the current page context using `cls.set_page_context`.
142            - Clears previous exception errors for the page, retaining only those marked as 'streamlit_error'.
143            - Executes the wrapped function.
144            - If an exception occurs, logs detailed error information (error message, traceback, timestamp, status, type, and page)
145              to `cls._errors` under the given page name, then re-raises the exception.
146        """
147        
148        def decorator(func):
149            """
150            Decorator to manage page-specific error handling and context setting.
151            This decorator sets the current page context before executing the decorated function.
152            It clears previous exception errors for the page, retaining only Streamlit error calls.
153            If an exception occurs during function execution, it captures error details including
154            the error message, traceback, timestamp, status, type, and page name, and appends them
155            to the page's error log. The exception is then re-raised.
156            Args:
157                func (Callable): The function to be decorated.
158            Returns:
159                Callable: The wrapped function with error handling and context management.
160            """
161            
162            @functools.wraps(func)
163            def wrapper(*args, **kwargs):
164                # Set the current page context
165                cls.set_page_context(page_name)
166                try:
167                    # Clear previous exception errors but keep st.error calls
168                    if page_name in cls._errors:
169                        cls._errors[page_name] = [
170                            e for e in cls._errors[page_name]
171                            if e.get('type') == 'streamlit_error'
172                        ]
173                    result = func(*args, **kwargs)
174                    return result
175                except Exception as e:
176                    error_info = {
177                        'error': str(e),
178                        'traceback': traceback.format_exc(),
179                        'timestamp': datetime.now().isoformat(),
180                        'status': 'critical',
181                        'type': 'exception',
182                        'page': page_name
183                    }
184                    if page_name not in cls._errors:
185                        cls._errors[page_name] = []
186                    cls._errors[page_name].append(error_info)
187                    raise
188            return wrapper
189        return decorator

Decorator to monitor and log exceptions for a specific Streamlit page. Args: page_name (str): The name of the page to monitor. Returns: Callable: A decorator that wraps the target function, sets the page context, clears previous non-Streamlit errors, and logs any exceptions that occur during execution. The decorator performs the following actions: - Sets the current page context using cls.set_page_context. - Clears previous exception errors for the page, retaining only those marked as 'streamlit_error'. - Executes the wrapped function. - If an exception occurs, logs detailed error information (error message, traceback, timestamp, status, type, and page) to cls._errors under the given page name, then re-raises the exception.

@classmethod
def get_page_errors(cls):
191    @classmethod
192    def get_page_errors(cls):
193        """
194        Collects and returns errors for each page that has recorded errors.
195        Iterates through the internal `_errors` dictionary, and for each page with errors,
196        constructs a list of error details including the error message, traceback, timestamp,
197        and error type.
198        Returns:
199            dict: A dictionary where keys are page names and values are lists of error details.
200                  Each error detail is a dictionary with the following keys:
201                      - 'error' (str): The error message or 'Unknown error' if not present.
202                      - 'traceback' (list): The traceback information or empty list if not present.
203                      - 'timestamp' (str): The timestamp of the error or empty string if not present.
204                      - 'type' (str): The type of error or 'unknown' if not present.
205        """
206        
207        result = {}
208        for page, errors in cls._errors.items():
209            if errors:  # Only include pages with errors
210                result[page] = [
211                    {
212                        'error': err.get('error', 'Unknown error'),
213                        'traceback': err.get('traceback', []),
214                        'timestamp': err.get('timestamp', ''),
215                        'type': err.get('type', 'unknown')
216                    }
217                    for err in errors
218                ]
219        return result

Collects and returns errors for each page that has recorded errors. Iterates through the internal _errors dictionary, and for each page with errors, constructs a list of error details including the error message, traceback, timestamp, and error type. Returns: dict: A dictionary where keys are page names and values are lists of error details. Each error detail is a dictionary with the following keys: - 'error' (str): The error message or 'Unknown error' if not present. - 'traceback' (list): The traceback information or empty list if not present. - 'timestamp' (str): The timestamp of the error or empty string if not present. - 'type' (str): The type of error or 'unknown' if not present.

@classmethod
def clear_errors(cls, page_name: Optional[str] = None):
221    @classmethod
222    def clear_errors(cls, page_name: Optional[str] = None):
223        """Clear errors for a specific page or all pages"""
224        if page_name:
225            if page_name in cls._errors:
226                del cls._errors[page_name]
227        else:
228            cls._errors = {}

Clear errors for a specific page or all pages

class HealthCheckService:
230class HealthCheckService:
231    """
232    HealthCheckService provides a comprehensive health monitoring solution for Streamlit applications.
233    It periodically checks system resources, external dependencies, custom health checks, and Streamlit server/page status,
234    updating and reporting the overall health status.
235    Attributes:
236        logger (logging.Logger): Logger for health check events.
237        config_path (str): Path to the health check configuration file.
238        health_data (Dict[str, Any]): Stores the latest health check results.
239        config (Dict): Loaded health check configuration.
240        check_interval (int): Interval (in seconds) between health checks.
241        _running (bool): Indicates if the health check service is running.
242        _thread (threading.Thread): Background thread for periodic checks.
243        streamlit_url (str): URL of the Streamlit server.
244        streamlit_port (int): Port of the Streamlit server.
245    Methods:
246        __init__(config_path: str = "health_check_config.json"):
247            Initializes the HealthCheckService with configuration and default health data.
248        _load_config() -> Dict:
249            Loads health check configuration from file or returns default configuration.
250        _get_default_config() -> Dict:
251            Returns the default health check configuration.
252        start():
253            Starts the health check service in a background thread.
254        stop():
255            Stops the health check service.
256        _run_checks_periodically():
257            Runs health checks periodically based on the configured interval.
258        run_all_checks():
259            Executes all configured health checks and updates health data.
260        check_cpu():
261            Checks CPU usage and updates health data.
262        check_memory():
263            Checks memory usage and updates health data.
264        check_disk():
265            Checks disk usage and updates health data.
266        check_dependencies():
267            Checks external dependencies such as APIs and databases.
268        _check_api_endpoint(endpoint: Dict):
269            Checks if an API endpoint is accessible and updates health data.
270        _check_database(db_config: Dict):
271            Checks database connection (placeholder for actual implementation).
272        register_custom_check(name: str, check_func: Callable[[], Dict[str, Any]]):
273            Registers a custom health check function.
274        run_custom_checks():
275            Executes all registered custom health checks.
276        _update_overall_status():
277            Updates the overall health status based on individual checks.
278        get_health_data() -> Dict:
279            Returns the latest health check data, excluding function references.
280        save_config():
281            Saves the current configuration to file.
282        check_streamlit_pages():
283            Checks for errors in Streamlit pages and updates health data.
284        check_streamlit_server() -> Dict[str, Any]:
285            Checks if the Streamlit server is running and responding.
286    """
287    def __init__(self, config_path: str = "health_check_config.json"):
288        """
289        Initializes the HealthCheckService instance.
290        Args:
291            config_path (str): Path to the health check configuration file. Defaults to "health_check_config.json".
292        Attributes:
293            logger (logging.Logger): Logger for the HealthCheckService.
294            config_path (str): Path to the configuration file.
295            health_data (Dict[str, Any]): Dictionary storing health check data.
296            config (dict): Loaded configuration from the config file.
297            check_interval (int): Interval in seconds between health checks. Defaults to 60.
298            _running (bool): Indicates if the health check service is running.
299            _thread (threading.Thread or None): Thread running the health check loop.
300            streamlit_url (str): URL of the Streamlit service. Defaults to "http://localhost".
301            streamlit_port (int): Port of the Streamlit service. Defaults to 8501.
302        """
303        self.logger = logging.getLogger(f"{__name__}.HealthCheckService")
304        self.logger.info("Initializing HealthCheckService")
305        self.config_path = config_path
306        self.health_data: Dict[str, Any] = {
307            "last_updated": None,
308            "system": {},
309            "dependencies": {},
310            "custom_checks": {},
311            "overall_status": "unknown"
312        }
313        self.config = self._load_config()
314        self.check_interval = self.config.get("check_interval", 60)  # Default: 60 seconds
315        self._running = False
316        self._thread = None
317        self.streamlit_url = self.config.get("streamlit_url", "http://localhost")
318        self.streamlit_port = self.config.get("streamlit_port", 8501)  # Default: 8501
319    def _load_config(self) -> Dict:
320        """Load health check configuration from file."""
321        if os.path.exists(self.config_path):
322            try:
323                with open(self.config_path, "r") as f:
324                    return json.load(f)
325            except Exception as e:
326                st.error(f"Error loading health check config: {str(e)}")
327                return self._get_default_config()
328        else:
329            return self._get_default_config()
330            
331    def _get_default_config(self) -> Dict:
332        """Return default health check configuration."""
333        return {
334            "check_interval": 60,
335            "streamlit_url": "http://localhost",
336            "streamlit_port": 8501,
337            "system_checks": {
338                "cpu": True,
339                "memory": True,
340                "disk": True
341            },
342            "dependencies": {
343                "api_endpoints": [
344                    # Example API endpoint to check
345                    {"name": "example_api", "url": "https://httpbin.org/get", "timeout": 5}
346                ],
347                "databases": [
348                    # Example database connection to check
349                    {"name": "main_db", "type": "postgres", "connection_string": "..."}
350                ]
351            },
352            "thresholds": {
353                "cpu_warning": 70,
354                "cpu_critical": 90,
355                "memory_warning": 70,
356                "memory_critical": 90,
357                "disk_warning": 70,
358                "disk_critical": 90
359            }
360        }
361    
362    def start(self):
363        """Start the health check service in a background thread."""
364        if self._running:
365            return
366            
367        self._running = True
368        self._thread = threading.Thread(target=self._run_checks_periodically, daemon=True)
369        self._thread.start()
370        
371    def stop(self):
372        """Stop the health check service."""
373        self._running = False
374        if self._thread:
375            self._thread.join(timeout=1)
376            
377    def _run_checks_periodically(self):
378        """Run health checks periodically based on check interval."""
379        while self._running:
380            self.run_all_checks()
381            time.sleep(self.check_interval)
382            
383    def run_all_checks(self):
384        """Run all configured health checks and update health data."""
385        # Update timestamp
386        self.health_data["last_updated"] = datetime.now().isoformat()
387        
388        # Check Streamlit server
389        self.health_data["streamlit_server"] = self.check_streamlit_server()
390        
391        # System checks
392        if self.config["system_checks"].get("cpu", True):
393            self.check_cpu()
394        if self.config["system_checks"].get("memory", True):
395            self.check_memory()
396        if self.config["system_checks"].get("disk", True):
397            self.check_disk()
398            
399        # Rest of the existing checks...
400        self.check_dependencies()
401        self.run_custom_checks()
402        self.check_streamlit_pages()
403        self._update_overall_status()
404        
405    def check_cpu(self):
406        """
407        Checks the current CPU usage and updates the health status based on configured thresholds.
408        Measures the CPU usage percentage over a 1-second interval using psutil. Compares the result
409        against warning and critical thresholds defined in the configuration. Sets the status to
410        'healthy', 'warning', or 'critical' accordingly, and updates the health data dictionary.
411        Returns:
412            None
413        """
414        
415        cpu_percent = psutil.cpu_percent(interval=1)
416        warning_threshold = self.config["thresholds"].get("cpu_warning", 70)
417        critical_threshold = self.config["thresholds"].get("cpu_critical", 90)
418        
419        status = "healthy"
420        if cpu_percent >= critical_threshold:
421            status = "critical"
422        elif cpu_percent >= warning_threshold:
423            status = "warning"
424            
425        self.health_data["system"]["cpu"] = {
426            "usage_percent": cpu_percent,
427            "status": status
428        }
429        
430    def check_memory(self):
431        """
432        Checks the system's memory usage and updates the health status accordingly.
433        Retrieves the current memory usage statistics using psutil, compares the usage percentage
434        against configured warning and critical thresholds, and sets the memory status to 'healthy',
435        'warning', or 'critical'. Updates the health_data dictionary with total memory, available memory,
436        usage percentage, and status.
437        Returns:
438            None
439        """
440        
441        memory = psutil.virtual_memory()
442        memory_percent = memory.percent
443        warning_threshold = self.config["thresholds"].get("memory_warning", 70)
444        critical_threshold = self.config["thresholds"].get("memory_critical", 90)
445        
446        status = "healthy"
447        if memory_percent >= critical_threshold:
448            status = "critical"
449        elif memory_percent >= warning_threshold:
450            status = "warning"
451            
452        self.health_data["system"]["memory"] = {
453            "total_gb": round(memory.total / (1024**3), 2),
454            "available_gb": round(memory.available / (1024**3), 2),
455            "usage_percent": memory_percent,
456            "status": status
457        }
458        
459    def check_disk(self):
460        """
461        Checks the disk usage of the root filesystem and updates the health status.
462        Retrieves disk usage statistics using psutil, compares the usage percentage
463        against configured warning and critical thresholds, and sets the disk status
464        accordingly ("healthy", "warning", or "critical"). Updates the health_data
465        dictionary with total disk size, free space, usage percentage, and status.
466        Returns:
467            None
468        """
469        
470        disk = psutil.disk_usage('/')
471        disk_percent = disk.percent
472        warning_threshold = self.config["thresholds"].get("disk_warning", 70)
473        critical_threshold = self.config["thresholds"].get("disk_critical", 90)
474        
475        status = "healthy"
476        if disk_percent >= critical_threshold:
477            status = "critical"
478        elif disk_percent >= warning_threshold:
479            status = "warning"
480            
481        self.health_data["system"]["disk"] = {
482            "total_gb": round(disk.total / (1024**3), 2),
483            "free_gb": round(disk.free / (1024**3), 2),
484            "usage_percent": disk_percent,
485            "status": status
486        }
487        
488    def check_dependencies(self):
489        """
490        Checks the health of configured dependencies, including API endpoints and databases.
491        Iterates through the list of API endpoints and databases specified in the configuration,
492        and performs health checks on each by invoking the corresponding internal methods.
493        Raises:
494            Exception: If any dependency check fails.
495        """
496        
497        # Check API endpoints
498        for endpoint in self.config["dependencies"].get("api_endpoints", []):
499            self._check_api_endpoint(endpoint)
500            
501        # Check database connections
502        for db in self.config["dependencies"].get("databases", []):
503            self._check_database(db)
504            
505    def _check_api_endpoint(self, endpoint: Dict):
506        """
507        Check if an API endpoint is accessible.
508        
509        Args:
510            endpoint: Dictionary with endpoint configuration
511        """
512        name = endpoint.get("name", "unknown_api")
513        url = endpoint.get("url", "")
514        timeout = endpoint.get("timeout", 5)
515        
516        if not url:
517            return
518            
519        try:
520            start_time = time.time()
521            response = requests.get(url, timeout=timeout)
522            response_time = time.time() - start_time
523            
524            status = "healthy" if response.status_code < 400 else "critical"
525            
526            self.health_data["dependencies"][name] = {
527                "type": "api",
528                "url": url,
529                "status": status,
530                "response_time_ms": round(response_time * 1000, 2),
531                "status_code": response.status_code
532            }
533        except Exception as e:
534            self.health_data["dependencies"][name] = {
535                "type": "api",
536                "url": url,
537                "status": "critical",
538                "error": str(e)
539            }
540            
541    def _check_database(self, db_config: Dict):
542        """
543        Check database connection.
544        Note: This is a placeholder. You'll need to implement specific database checks
545        based on your application's needs.
546        
547        Args:
548            db_config: Dictionary with database configuration
549        """
550        name = db_config.get("name", "unknown_db")
551        db_type = db_config.get("type", "")
552        
553        # Placeholder for database connection check
554        # In a real implementation, you would check the specific database connection
555        self.health_data["dependencies"][name] = {
556            "type": "database",
557            "db_type": db_type,
558            "status": "unknown",
559            "message": "Database check not implemented"
560        }
561        
562    def register_custom_check(self, name: str, check_func: Callable[[], Dict[str, Any]]):
563        """
564        Register a custom health check function.
565        
566        Args:
567            name: Name of the custom check
568            check_func: Function that performs the check and returns a dictionary with results
569        """
570        if "custom_checks" not in self.health_data:
571            self.health_data["custom_checks"] = {}
572            
573        self.health_data["custom_checks"][name] = {
574            "status": "unknown",
575            "check_func": check_func
576        }
577        
578    def run_custom_checks(self):
579        """Run all registered custom health checks."""
580        if "custom_checks" not in self.health_data:
581            return
582            
583        for name, check_info in list(self.health_data["custom_checks"].items()):
584            if "check_func" in check_info and callable(check_info["check_func"]):
585                try:
586                    result = check_info["check_func"]()
587                    # Remove the function reference from the result
588                    func = check_info["check_func"]
589                    self.health_data["custom_checks"][name] = result
590                    # Add the function back
591                    self.health_data["custom_checks"][name]["check_func"] = func
592                except Exception as e:
593                    self.health_data["custom_checks"][name] = {
594                        "status": "critical",
595                        "error": str(e),
596                        "check_func": check_info["check_func"]
597                    }
598                    
599    def _update_overall_status(self):
600        """
601        Updates the overall health status of the application based on the statuses of various components.
602        The method checks the health status of the following components:
603            - Streamlit server
604            - System checks
605            - Dependencies
606            - Custom checks (excluding those with a 'check_func' key)
607            - Streamlit pages
608        The overall status is determined using the following priority order:
609            1. "critical" if any component is critical
610            2. "warning" if any component is warning and none are critical
611            3. "unknown" if any component is unknown and none are critical or warning, and no healthy components exist
612            4. "healthy" if any component is healthy and none are critical, warning, or unknown
613            5. "unknown" if no statuses are found
614        The result is stored in `self.health_data["overall_status"]`.
615        """
616        
617        has_critical = False
618        has_warning = False
619        has_healthy = False
620        has_unknown = False
621        
622        # Helper function to check status
623        def check_component_status(status):
624            nonlocal has_critical, has_warning, has_healthy, has_unknown
625            if status == "critical":
626                has_critical = True
627            elif status == "warning":
628                has_warning = True
629            elif status == "healthy":
630                has_healthy = True
631            elif status == "unknown":
632                has_unknown = True
633
634        # Check Streamlit server status
635        server_status = self.health_data.get("streamlit_server", {}).get("status")
636        check_component_status(server_status)
637        
638        # Check system status
639        for system_check in self.health_data.get("system", {}).values():
640            check_component_status(system_check.get("status"))
641                    
642        # Check dependencies status
643        for dep_check in self.health_data.get("dependencies", {}).values():
644            check_component_status(dep_check.get("status"))
645                    
646        # Check custom checks status
647        for custom_check in self.health_data.get("custom_checks", {}).values():
648            if isinstance(custom_check, dict) and "check_func" not in custom_check:
649                check_component_status(custom_check.get("status"))
650        
651        # Check Streamlit pages status
652        pages_status = self.health_data.get("streamlit_pages", {}).get("status")
653        check_component_status(pages_status)
654                        
655        # Determine overall status with priority:
656        # critical > warning > unknown > healthy
657        if has_critical:
658            self.health_data["overall_status"] = "critical"
659        elif has_warning:
660            self.health_data["overall_status"] = "warning"
661        elif has_unknown and not has_healthy:
662            self.health_data["overall_status"] = "unknown"
663        elif has_healthy:
664            self.health_data["overall_status"] = "healthy"
665        else:
666            self.health_data["overall_status"] = "unknown"
667                
668    def get_health_data(self) -> Dict:
669        """Get the latest health check data."""
670        # Create a copy without the function references
671        result: Dict[str, Any] = {}
672        for key, value in self.health_data.items():
673            if key == "custom_checks":
674                result[key] = {}
675                for check_name, check_data in value.items():
676                    if isinstance(check_data, dict):
677                        check_copy = check_data.copy()
678                        if "check_func" in check_copy:
679                            del check_copy["check_func"]
680                        result[key][check_name] = check_copy
681            else:
682                result[key] = value
683        return result
684        
685    def save_config(self):
686        """
687        Saves the current health check configuration to a JSON file.
688        Attempts to write the configuration stored in `self.config` to the file specified by `self.config_path`.
689        Displays a success message in the Streamlit app upon successful save.
690        Handles and displays appropriate error messages for file not found, permission issues, JSON decoding errors, and other exceptions.
691        Raises:
692            FileNotFoundError: If the configuration file path does not exist.
693            PermissionError: If there are insufficient permissions to write to the file.
694            json.JSONDecodeError: If there is an error decoding the JSON data.
695            Exception: For any other exceptions encountered during the save process.
696        """
697        
698        try:
699            with open(self.config_path, "w") as f:
700                json.dump(self.config, f, indent=2)
701                st.success(f"Health check config saved successfully to {self.config_path}")
702        except FileNotFoundError:
703            st.error(f"Configuration file not found: {self.config_path}")
704        except PermissionError:
705            st.error(f"Permission denied: Unable to write to {self.config_path}")
706        except json.JSONDecodeError:
707            st.error(f"Error decoding JSON in config file: {self.config_path}")
708        except Exception as e:
709            st.error(f"Error saving health check config: {str(e)}")
710    def check_streamlit_pages(self):
711        """
712        Checks for errors in Streamlit pages and updates the health data accordingly.
713        This method retrieves page errors using StreamlitPageMonitor.get_page_errors().
714        If errors are found, it sets the 'streamlit_pages' status to 'critical' and updates
715        the overall health status to 'critical'. If no errors are found, it marks the
716        'streamlit_pages' status as 'healthy'.
717        Updates:
718            self.health_data["streamlit_pages"]: Dict containing status, error count, errors, and details.
719            self.health_data["overall_status"]: Set to 'critical' if errors are detected.
720        Returns:
721            None
722        """
723        
724        page_errors = StreamlitPageMonitor.get_page_errors()
725        
726        if "streamlit_pages" not in self.health_data:
727            self.health_data["streamlit_pages"] = {}
728        
729        if page_errors:
730            self.health_data["streamlit_pages"] = {
731                "status": "critical",
732                "error_count": len(page_errors),
733                "errors": page_errors,
734                "details": "Errors detected in Streamlit pages"
735            }
736            # This affects overall status
737            self.health_data["overall_status"] = "critical"
738        else:
739            self.health_data["streamlit_pages"] = {
740                "status": "healthy",
741                "error_count": 0,
742                "errors": {},
743                "details": "All pages functioning normally"
744            }
745    
746    def check_streamlit_server(self) -> Dict[str, Any]:
747        """
748        Checks the health status of the Streamlit server by sending a GET request to the /healthz endpoint.
749        Returns:
750            Dict[str, Any]: A dictionary containing the health status, response code, latency in milliseconds,
751                            message, and the URL checked. If the server is healthy (HTTP 200), status is "healthy".
752                            Otherwise, status is "critical" with error details.
753        Handles:
754            - Connection errors: Returns critical status with connection error details.
755            - Timeout errors: Returns critical status with timeout error details.
756            - Other exceptions: Returns critical status with unknown error details.
757        Logs:
758            - The URL being checked.
759            - The response status code and text.
760            - Health status and response time if healthy.
761            - Warnings and errors for unhealthy or failed checks.
762        """
763        
764        try:
765            host = self.streamlit_url.rstrip('/')
766            if not host.startswith(('http://', 'https://')):
767                host = f"http://{host}"
768            
769            url = f"{host}:{self.streamlit_port}/healthz"
770            self.logger.info(f"Checking Streamlit server health at: {url}")
771            
772            start_time = time.time()
773            response = requests.get(url, timeout=3)
774            total_time = (time.time() - start_time) * 1000
775            self.logger.info(f"{response.status_code} - {response.text}")
776            # Check if the response is healthy
777            if response.status_code == 200:
778                self.logger.info(f"Streamlit server healthy - Response time: {round(total_time, 2)}ms")
779                return {
780                    "status": "healthy",
781                    "response_code": response.status_code,
782                    "latency_ms": round(total_time, 2),
783                    "message": "Streamlit server is running",
784                    "url": url
785                }
786            else:
787                self.logger.warning(f"Unhealthy response from server: {response.status_code}")
788                return {
789                    "status": "critical",
790                    "response_code": response.status_code,
791                    "error": f"Unhealthy response from server: {response.status_code}",
792                    "message": "Streamlit server is not healthy",
793                    "url": url
794                }
795
796        except requests.exceptions.ConnectionError as e:
797            self.logger.error(f"Connection error while checking Streamlit server: {str(e)}")
798            return {
799                "status": "critical",
800                "error": f"Connection error: {str(e)}",
801                "message": "Cannot connect to Streamlit server",
802                "url": url
803            }
804        except requests.exceptions.Timeout as e:
805            self.logger.error(f"Timeout while checking Streamlit server: {str(e)}")
806            return {
807                "status": "critical",
808                "error": f"Timeout error: {str(e)}",
809                "message": "Streamlit server is not responding",
810                "url": url
811            }
812        except Exception as e:
813            self.logger.error(f"Unexpected error while checking Streamlit server: {str(e)}")
814            return {
815                "status": "critical",
816                "error": f"Unknown error: {str(e)}",
817                "message": "Failed to check Streamlit server",
818                "url": url
819            }

HealthCheckService provides a comprehensive health monitoring solution for Streamlit applications. It periodically checks system resources, external dependencies, custom health checks, and Streamlit server/page status, updating and reporting the overall health status. Attributes: logger (logging.Logger): Logger for health check events. config_path (str): Path to the health check configuration file. health_data (Dict[str, Any]): Stores the latest health check results. config (Dict): Loaded health check configuration. check_interval (int): Interval (in seconds) between health checks. _running (bool): Indicates if the health check service is running. _thread (threading.Thread): Background thread for periodic checks. streamlit_url (str): URL of the Streamlit server. streamlit_port (int): Port of the Streamlit server. Methods: __init__(config_path: str = "health_check_config.json"): Initializes the HealthCheckService with configuration and default health data. _load_config() -> Dict: Loads health check configuration from file or returns default configuration. _get_default_config() -> Dict: Returns the default health check configuration. start(): Starts the health check service in a background thread. stop(): Stops the health check service. _run_checks_periodically(): Runs health checks periodically based on the configured interval. run_all_checks(): Executes all configured health checks and updates health data. check_cpu(): Checks CPU usage and updates health data. check_memory(): Checks memory usage and updates health data. check_disk(): Checks disk usage and updates health data. check_dependencies(): Checks external dependencies such as APIs and databases. _check_api_endpoint(endpoint: Dict): Checks if an API endpoint is accessible and updates health data. _check_database(db_config: Dict): Checks database connection (placeholder for actual implementation). 
register_custom_check(name: str, check_func: Callable[[], Dict[str, Any]]): Registers a custom health check function. run_custom_checks(): Executes all registered custom health checks. _update_overall_status(): Updates the overall health status based on individual checks. get_health_data() -> Dict: Returns the latest health check data, excluding function references. save_config(): Saves the current configuration to file. check_streamlit_pages(): Checks for errors in Streamlit pages and updates health data. check_streamlit_server() -> Dict[str, Any]: Checks if the Streamlit server is running and responding.

HealthCheckService(config_path: str = 'health_check_config.json')
287    def __init__(self, config_path: str = "health_check_config.json"):
288        """
289        Initializes the HealthCheckService instance.
290        Args:
291            config_path (str): Path to the health check configuration file. Defaults to "health_check_config.json".
292        Attributes:
293            logger (logging.Logger): Logger for the HealthCheckService.
294            config_path (str): Path to the configuration file.
295            health_data (Dict[str, Any]): Dictionary storing health check data.
296            config (dict): Loaded configuration from the config file.
297            check_interval (int): Interval in seconds between health checks. Defaults to 60.
298            _running (bool): Indicates if the health check service is running.
299            _thread (threading.Thread or None): Thread running the health check loop.
300            streamlit_url (str): URL of the Streamlit service. Defaults to "http://localhost".
301            streamlit_port (int): Port of the Streamlit service. Defaults to 8501.
302        """
303        self.logger = logging.getLogger(f"{__name__}.HealthCheckService")
304        self.logger.info("Initializing HealthCheckService")
305        self.config_path = config_path
306        self.health_data: Dict[str, Any] = {
307            "last_updated": None,
308            "system": {},
309            "dependencies": {},
310            "custom_checks": {},
311            "overall_status": "unknown"
312        }
313        self.config = self._load_config()
314        self.check_interval = self.config.get("check_interval", 60)  # Default: 60 seconds
315        self._running = False
316        self._thread = None
317        self.streamlit_url = self.config.get("streamlit_url", "http://localhost")
318        self.streamlit_port = self.config.get("streamlit_port", 8501)  # Default: 8501

Initializes the HealthCheckService instance. Args: config_path (str): Path to the health check configuration file. Defaults to "health_check_config.json". Attributes: logger (logging.Logger): Logger for the HealthCheckService. config_path (str): Path to the configuration file. health_data (Dict[str, Any]): Dictionary storing health check data. config (dict): Loaded configuration from the config file. check_interval (int): Interval in seconds between health checks. Defaults to 60. _running (bool): Indicates if the health check service is running. _thread (threading.Thread or None): Thread running the health check loop. streamlit_url (str): URL of the Streamlit service. Defaults to "http://localhost". streamlit_port (int): Port of the Streamlit service. Defaults to 8501.

logger
config_path
health_data: Dict[str, Any]
config
check_interval
streamlit_url
streamlit_port
def start(self):
362    def start(self):
363        """Start the health check service in a background thread."""
364        if self._running:
365            return
366            
367        self._running = True
368        self._thread = threading.Thread(target=self._run_checks_periodically, daemon=True)
369        self._thread.start()

Start the health check service in a background thread.

def stop(self):
371    def stop(self):
372        """Stop the health check service."""
373        self._running = False
374        if self._thread:
375            self._thread.join(timeout=1)

Stop the health check service.

def run_all_checks(self):
383    def run_all_checks(self):
384        """Run all configured health checks and update health data."""
385        # Update timestamp
386        self.health_data["last_updated"] = datetime.now().isoformat()
387        
388        # Check Streamlit server
389        self.health_data["streamlit_server"] = self.check_streamlit_server()
390        
391        # System checks
392        if self.config["system_checks"].get("cpu", True):
393            self.check_cpu()
394        if self.config["system_checks"].get("memory", True):
395            self.check_memory()
396        if self.config["system_checks"].get("disk", True):
397            self.check_disk()
398            
399        # Rest of the existing checks...
400        self.check_dependencies()
401        self.run_custom_checks()
402        self.check_streamlit_pages()
403        self._update_overall_status()

Run all configured health checks and update health data.

def check_cpu(self):
405    def check_cpu(self):
406        """
407        Checks the current CPU usage and updates the health status based on configured thresholds.
408        Measures the CPU usage percentage over a 1-second interval using psutil. Compares the result
409        against warning and critical thresholds defined in the configuration. Sets the status to
410        'healthy', 'warning', or 'critical' accordingly, and updates the health data dictionary.
411        Returns:
412            None
413        """
414        
415        cpu_percent = psutil.cpu_percent(interval=1)
416        warning_threshold = self.config["thresholds"].get("cpu_warning", 70)
417        critical_threshold = self.config["thresholds"].get("cpu_critical", 90)
418        
419        status = "healthy"
420        if cpu_percent >= critical_threshold:
421            status = "critical"
422        elif cpu_percent >= warning_threshold:
423            status = "warning"
424            
425        self.health_data["system"]["cpu"] = {
426            "usage_percent": cpu_percent,
427            "status": status
428        }

Checks the current CPU usage and updates the health status based on configured thresholds. Measures the CPU usage percentage over a 1-second interval using psutil. Compares the result against warning and critical thresholds defined in the configuration. Sets the status to 'healthy', 'warning', or 'critical' accordingly, and updates the health data dictionary. Returns: None

def check_memory(self):
430    def check_memory(self):
431        """
432        Checks the system's memory usage and updates the health status accordingly.
433        Retrieves the current memory usage statistics using psutil, compares the usage percentage
434        against configured warning and critical thresholds, and sets the memory status to 'healthy',
435        'warning', or 'critical'. Updates the health_data dictionary with total memory, available memory,
436        usage percentage, and status.
437        Returns:
438            None
439        """
440        
441        memory = psutil.virtual_memory()
442        memory_percent = memory.percent
443        warning_threshold = self.config["thresholds"].get("memory_warning", 70)
444        critical_threshold = self.config["thresholds"].get("memory_critical", 90)
445        
446        status = "healthy"
447        if memory_percent >= critical_threshold:
448            status = "critical"
449        elif memory_percent >= warning_threshold:
450            status = "warning"
451            
452        self.health_data["system"]["memory"] = {
453            "total_gb": round(memory.total / (1024**3), 2),
454            "available_gb": round(memory.available / (1024**3), 2),
455            "usage_percent": memory_percent,
456            "status": status
457        }

Checks the system's memory usage and updates the health status accordingly. Retrieves the current memory usage statistics using psutil, compares the usage percentage against configured warning and critical thresholds, and sets the memory status to 'healthy', 'warning', or 'critical'. Updates the health_data dictionary with total memory, available memory, usage percentage, and status. Returns: None

def check_disk(self):
459    def check_disk(self):
460        """
461        Checks the disk usage of the root filesystem and updates the health status.
462        Retrieves disk usage statistics using psutil, compares the usage percentage
463        against configured warning and critical thresholds, and sets the disk status
464        accordingly ("healthy", "warning", or "critical"). Updates the health_data
465        dictionary with total disk size, free space, usage percentage, and status.
466        Returns:
467            None
468        """
469        
470        disk = psutil.disk_usage('/')
471        disk_percent = disk.percent
472        warning_threshold = self.config["thresholds"].get("disk_warning", 70)
473        critical_threshold = self.config["thresholds"].get("disk_critical", 90)
474        
475        status = "healthy"
476        if disk_percent >= critical_threshold:
477            status = "critical"
478        elif disk_percent >= warning_threshold:
479            status = "warning"
480            
481        self.health_data["system"]["disk"] = {
482            "total_gb": round(disk.total / (1024**3), 2),
483            "free_gb": round(disk.free / (1024**3), 2),
484            "usage_percent": disk_percent,
485            "status": status
486        }

Checks the disk usage of the root filesystem and updates the health status. Retrieves disk usage statistics using psutil, compares the usage percentage against configured warning and critical thresholds, and sets the disk status accordingly ("healthy", "warning", or "critical"). Updates the health_data dictionary with total disk size, free space, usage percentage, and status. Returns: None

def check_dependencies(self):
488    def check_dependencies(self):
489        """
490        Checks the health of configured dependencies, including API endpoints and databases.
491        Iterates through the list of API endpoints and databases specified in the configuration,
492        and performs health checks on each by invoking the corresponding internal methods.
493        Raises:
494            Exception: If any dependency check fails.
495        """
496        
497        # Check API endpoints
498        for endpoint in self.config["dependencies"].get("api_endpoints", []):
499            self._check_api_endpoint(endpoint)
500            
501        # Check database connections
502        for db in self.config["dependencies"].get("databases", []):
503            self._check_database(db)

Checks the health of configured dependencies, including API endpoints and databases. Iterates through the list of API endpoints and databases specified in the configuration, and performs health checks on each by invoking the corresponding internal methods. Raises: Exception: If any dependency check fails.

def register_custom_check(self, name: str, check_func: Callable[[], Dict[str, Any]]):
562    def register_custom_check(self, name: str, check_func: Callable[[], Dict[str, Any]]):
563        """
564        Register a custom health check function.
565        
566        Args:
567            name: Name of the custom check
568            check_func: Function that performs the check and returns a dictionary with results
569        """
570        if "custom_checks" not in self.health_data:
571            self.health_data["custom_checks"] = {}
572            
573        self.health_data["custom_checks"][name] = {
574            "status": "unknown",
575            "check_func": check_func
576        }

Register a custom health check function.

Args: name: Name of the custom check check_func: Function that performs the check and returns a dictionary with results

def run_custom_checks(self):
578    def run_custom_checks(self):
579        """Run all registered custom health checks."""
580        if "custom_checks" not in self.health_data:
581            return
582            
583        for name, check_info in list(self.health_data["custom_checks"].items()):
584            if "check_func" in check_info and callable(check_info["check_func"]):
585                try:
586                    result = check_info["check_func"]()
587                    # Remove the function reference from the result
588                    func = check_info["check_func"]
589                    self.health_data["custom_checks"][name] = result
590                    # Add the function back
591                    self.health_data["custom_checks"][name]["check_func"] = func
592                except Exception as e:
593                    self.health_data["custom_checks"][name] = {
594                        "status": "critical",
595                        "error": str(e),
596                        "check_func": check_info["check_func"]
597                    }

Run all registered custom health checks.

def get_health_data(self) -> Dict:
668    def get_health_data(self) -> Dict:
669        """Get the latest health check data."""
670        # Create a copy without the function references
671        result: Dict[str, Any] = {}
672        for key, value in self.health_data.items():
673            if key == "custom_checks":
674                result[key] = {}
675                for check_name, check_data in value.items():
676                    if isinstance(check_data, dict):
677                        check_copy = check_data.copy()
678                        if "check_func" in check_copy:
679                            del check_copy["check_func"]
680                        result[key][check_name] = check_copy
681            else:
682                result[key] = value
683        return result

Get the latest health check data.

def save_config(self):
685    def save_config(self):
686        """
687        Saves the current health check configuration to a JSON file.
688        Attempts to write the configuration stored in `self.config` to the file specified by `self.config_path`.
689        Displays a success message in the Streamlit app upon successful save.
690        Handles and displays appropriate error messages for file not found, permission issues, JSON decoding errors, and other exceptions.
691        Raises:
692            FileNotFoundError: If the configuration file path does not exist.
693            PermissionError: If there are insufficient permissions to write to the file.
694            json.JSONDecodeError: If there is an error decoding the JSON data.
695            Exception: For any other exceptions encountered during the save process.
696        """
697        
698        try:
699            with open(self.config_path, "w") as f:
700                json.dump(self.config, f, indent=2)
701                st.success(f"Health check config saved successfully to {self.config_path}")
702        except FileNotFoundError:
703            st.error(f"Configuration file not found: {self.config_path}")
704        except PermissionError:
705            st.error(f"Permission denied: Unable to write to {self.config_path}")
706        except json.JSONDecodeError:
707            st.error(f"Error decoding JSON in config file: {self.config_path}")
708        except Exception as e:
709            st.error(f"Error saving health check config: {str(e)}")

Saves the current health check configuration to a JSON file. Attempts to write the configuration stored in self.config to the file specified by self.config_path. Displays a success message in the Streamlit app upon successful save. No exception is raised to the caller: each failure is caught and shown in the Streamlit app with st.error. Handles: FileNotFoundError: If the configuration file path does not exist. PermissionError: If there are insufficient permissions to write to the file. json.JSONDecodeError: If there is an error decoding the JSON data. Exception: For any other exceptions encountered during the save process.

def check_streamlit_pages(self):
710    def check_streamlit_pages(self):
711        """
712        Checks for errors in Streamlit pages and updates the health data accordingly.
713        This method retrieves page errors using StreamlitPageMonitor.get_page_errors().
714        If errors are found, it sets the 'streamlit_pages' status to 'critical' and updates
715        the overall health status to 'critical'. If no errors are found, it marks the
716        'streamlit_pages' status as 'healthy'.
717        Updates:
718            self.health_data["streamlit_pages"]: Dict containing status, error count, errors, and details.
719            self.health_data["overall_status"]: Set to 'critical' if errors are detected.
720        Returns:
721            None
722        """
723        
724        page_errors = StreamlitPageMonitor.get_page_errors()
725        
726        if "streamlit_pages" not in self.health_data:
727            self.health_data["streamlit_pages"] = {}
728        
729        if page_errors:
730            self.health_data["streamlit_pages"] = {
731                "status": "critical",
732                "error_count": len(page_errors),
733                "errors": page_errors,
734                "details": "Errors detected in Streamlit pages"
735            }
736            # This affects overall status
737            self.health_data["overall_status"] = "critical"
738        else:
739            self.health_data["streamlit_pages"] = {
740                "status": "healthy",
741                "error_count": 0,
742                "errors": {},
743                "details": "All pages functioning normally"
744            }

Checks for errors in Streamlit pages and updates the health data accordingly. This method retrieves page errors using StreamlitPageMonitor.get_page_errors(). If errors are found, it sets the 'streamlit_pages' status to 'critical' and updates the overall health status to 'critical'. If no errors are found, it marks the 'streamlit_pages' status as 'healthy'. Updates: self.health_data["streamlit_pages"]: Dict containing status, error count, errors, and details. self.health_data["overall_status"]: Set to 'critical' if errors are detected. Returns: None

def check_streamlit_server(self) -> Dict[str, Any]:
746    def check_streamlit_server(self) -> Dict[str, Any]:
747        """
748        Checks the health status of the Streamlit server by sending a GET request to the /healthz endpoint.
749        Returns:
750            Dict[str, Any]: A dictionary containing the health status, response code, latency in milliseconds,
751                            message, and the URL checked. If the server is healthy (HTTP 200), status is "healthy".
752                            Otherwise, status is "critical" with error details.
753        Handles:
754            - Connection errors: Returns critical status with connection error details.
755            - Timeout errors: Returns critical status with timeout error details.
756            - Other exceptions: Returns critical status with unknown error details.
757        Logs:
758            - The URL being checked.
759            - The response status code and text.
760            - Health status and response time if healthy.
761            - Warnings and errors for unhealthy or failed checks.
762        """
763        
764        try:
765            host = self.streamlit_url.rstrip('/')
766            if not host.startswith(('http://', 'https://')):
767                host = f"http://{host}"
768            
769            url = f"{host}:{self.streamlit_port}/healthz"
770            self.logger.info(f"Checking Streamlit server health at: {url}")
771            
772            start_time = time.time()
773            response = requests.get(url, timeout=3)
774            total_time = (time.time() - start_time) * 1000
775            self.logger.info(f"{response.status_code} - {response.text}")
776            # Check if the response is healthy
777            if response.status_code == 200:
778                self.logger.info(f"Streamlit server healthy - Response time: {round(total_time, 2)}ms")
779                return {
780                    "status": "healthy",
781                    "response_code": response.status_code,
782                    "latency_ms": round(total_time, 2),
783                    "message": "Streamlit server is running",
784                    "url": url
785                }
786            else:
787                self.logger.warning(f"Unhealthy response from server: {response.status_code}")
788                return {
789                    "status": "critical",
790                    "response_code": response.status_code,
791                    "error": f"Unhealthy response from server: {response.status_code}",
792                    "message": "Streamlit server is not healthy",
793                    "url": url
794                }
795
796        except requests.exceptions.ConnectionError as e:
797            self.logger.error(f"Connection error while checking Streamlit server: {str(e)}")
798            return {
799                "status": "critical",
800                "error": f"Connection error: {str(e)}",
801                "message": "Cannot connect to Streamlit server",
802                "url": url
803            }
804        except requests.exceptions.Timeout as e:
805            self.logger.error(f"Timeout while checking Streamlit server: {str(e)}")
806            return {
807                "status": "critical",
808                "error": f"Timeout error: {str(e)}",
809                "message": "Streamlit server is not responding",
810                "url": url
811            }
812        except Exception as e:
813            self.logger.error(f"Unexpected error while checking Streamlit server: {str(e)}")
814            return {
815                "status": "critical",
816                "error": f"Unknown error: {str(e)}",
817                "message": "Failed to check Streamlit server",
818                "url": url
819            }

Checks the health status of the Streamlit server by sending a GET request to the /healthz endpoint. Returns: Dict[str, Any]: A dictionary containing the health status, response code, latency in milliseconds, message, and the URL checked. If the server is healthy (HTTP 200), status is "healthy". Otherwise, status is "critical" with error details. Handles: - Connection errors: Returns critical status with connection error details. - Timeout errors: Returns critical status with timeout error details. - Other exceptions: Returns critical status with unknown error details. Logs: - The URL being checked. - The response status code and text. - Health status and response time if healthy. - Warnings and errors for unhealthy or failed checks.

def health_check(config_path: str = "health_check_config.json") -> None:
    """
    Render the interactive application health dashboard in the current Streamlit app.

    On the first call in a session a ``HealthCheckService`` is built from
    ``config_path``, started, and stored in ``st.session_state.health_service``;
    later reruns reuse that instance. The page then shows, in order:

        - the overall status and the last-updated timestamp,
        - the Streamlit server status (with a latency rating when healthy),
        - four tabs: System Resources, Dependencies, Custom Checks, Streamlit Pages,
        - an expander for editing thresholds, check interval and server URL/port.

    Pressing "Refresh Now" calls ``run_all_checks()`` on the service. Pressing
    "Save Configuration" writes the widget values into ``health_service.config``,
    calls ``save_config()``, and then stops and starts the service.

    Args:
        config_path (str, optional): Path to the health check configuration JSON file,
            passed to ``HealthCheckService`` when the service is first created.
            Defaults to "health_check_config.json".

    Returns:
        None. The dashboard is rendered in the Streamlit app.
    """
    logger = logging.getLogger(f"{__name__}.health_check")
    logger.info("Starting health check dashboard")
    st.title("Application Health Dashboard")

    # Create the service once per session; Streamlit reruns reuse the stored instance.
    if "health_service" not in st.session_state:
        logger.info("Initializing new health check service")
        st.session_state.health_service = HealthCheckService(config_path=config_path)
        st.session_state.health_service.start()

    health_service = st.session_state.health_service

    # Header row with the manual refresh control
    header_col, refresh_col = st.columns([3, 1])
    with header_col:
        st.subheader("System Health Status")
    with refresh_col:
        if st.button("Refresh Now"):
            health_service.run_all_checks()

    # Get the latest health data
    health_data = health_service.get_health_data()

    # Overall status with appropriate color
    overall_status = health_data.get("overall_status", "unknown")
    status_color = {
        "healthy": "green",
        "warning": "orange",
        "critical": "red",
        "unknown": "gray",
    }.get(overall_status, "gray")

    st.markdown(
        f"<h3 style='color: {status_color};'>Overall Status: {overall_status.upper()}</h3>",
        unsafe_allow_html=True,
    )

    # Last updated time; fall back to showing the raw value if it cannot be parsed
    if health_data.get("last_updated"):
        try:
            last_updated = datetime.fromisoformat(health_data["last_updated"])
            st.text(f"Last updated: {last_updated.strftime('%Y-%m-%d %H:%M:%S')}")
        except Exception as e:
            st.error(f"Last updated: {health_data['last_updated']}")
            st.exception(e)

    # Streamlit server status
    server_health = health_data.get("streamlit_server", {})
    server_status = server_health.get("status", "unknown")
    server_color = {
        "healthy": "green",
        "critical": "red",
        "unknown": "gray",
    }.get(server_status, "gray")

    st.markdown(
        f"### Streamlit Server Status: <span style='color: {server_color}'>{server_status.upper()}</span>",
        unsafe_allow_html=True,
    )

    if server_status != "healthy":
        st.error(server_health.get("message", "Server status unknown"))
        if "error" in server_health:
            st.code(server_health["error"])
    else:
        st.success(server_health.get("message", "Server is running"))
        if "latency_ms" in server_health:
            latency = server_health["latency_ms"]
            latency_color, performance = _dashboard_latency_rating(latency)
            st.markdown(
                f"""
                <div style='display: flex; align-items: center; gap: 10px;'>
                    <div>Server Response Time:</div>
                    <div style='color: {latency_color}; font-weight: bold;'>
                        {latency} ms
                    </div>
                    <div style='color: {latency_color};'>
                        ({performance})
                    </div>
                </div>
                """,
                unsafe_allow_html=True,
            )

    # Create tabs for different categories of health checks
    tab1, tab2, tab3, tab4 = st.tabs(
        ["System Resources", "Dependencies", "Custom Checks", "Streamlit Pages"]
    )

    with tab1:
        system_data = health_data.get("system", {})
        if "cpu" in system_data:
            _dashboard_render_resource("CPU", system_data["cpu"])
        if "memory" in system_data:
            _dashboard_render_resource(
                "Memory",
                system_data["memory"],
                (("Total Memory", "total_gb"), ("Available Memory", "available_gb")),
            )
        if "disk" in system_data:
            _dashboard_render_resource(
                "Disk",
                system_data["disk"],
                (("Total Disk Space", "total_gb"), ("Free Disk Space", "free_gb")),
            )

    with tab2:
        dependencies = health_data.get("dependencies", {})
        dep_rows = [
            {
                "Name": name,
                "Type": dep_info.get("type", "unknown"),
                "Status": dep_info.get("status", "unknown"),
                "Details": _dashboard_join_details(
                    dep_info, ("name", "type", "status", "error")
                ),
            }
            for name, dep_info in dependencies.items()
        ]
        _dashboard_render_status_table(dep_rows, "No dependencies configured")

    with tab3:
        custom_checks = health_data.get("custom_checks", {})
        # Entries that still carry a "check_func" key are skipped, as are non-dict entries.
        check_rows = [
            {
                "Name": name,
                "Status": check_info.get("status", "unknown"),
                "Details": _dashboard_join_details(
                    check_info, ("name", "status", "check_func", "error")
                ),
                "Error": check_info.get("error", ""),
            }
            for name, check_info in custom_checks.items()
            if isinstance(check_info, dict) and "check_func" not in check_info
        ]
        _dashboard_render_status_table(check_rows, "No custom checks configured")

    with tab4:
        page_health = health_data.get("streamlit_pages", {})
        status = page_health.get("status", "unknown")
        error_count = page_health.get("error_count", 0)
        page_color = {
            "healthy": "green",
            "critical": "red",
            "unknown": "gray",
        }.get(status, "gray")

        st.markdown(
            f"### Page Status: <span style='color:{page_color}'>{status.upper()}</span>",
            unsafe_allow_html=True,
        )
        st.metric("Error Count", error_count)
        if error_count > 0:
            st.error("Pages with errors:")
            errors_dict = page_health.get("errors", {})
            if isinstance(errors_dict, dict):
                _dashboard_render_page_errors(errors_dict)
            else:
                # Report the problem but keep going: returning here would also
                # suppress the configuration section rendered below.
                st.error("Invalid error data format")

    # Configuration section
    with st.expander("Health Check Configuration"):
        st.subheader("System Check Thresholds")

        # Read-only view for widget defaults; tolerate a config without a "thresholds" section.
        thresholds = health_service.config.get("thresholds", {})

        col1, col2 = st.columns(2)
        with col1:
            cpu_warning = st.slider(
                "CPU Warning Threshold (%)",
                min_value=10, max_value=90,
                value=thresholds.get("cpu_warning", 70),
                step=5,
            )
            memory_warning = st.slider(
                "Memory Warning Threshold (%)",
                min_value=10, max_value=90,
                value=thresholds.get("memory_warning", 70),
                step=5,
            )
            disk_warning = st.slider(
                "Disk Warning Threshold (%)",
                min_value=10, max_value=90,
                value=thresholds.get("disk_warning", 70),
                step=5,
            )
            streamlit_url_update = st.text_input(
                "Streamlit Server URL",
                value=health_service.config.get("streamlit_url", "http://localhost"),
            )

        with col2:
            cpu_critical = st.slider(
                "CPU Critical Threshold (%)",
                min_value=20, max_value=95,
                value=thresholds.get("cpu_critical", 90),
                step=5,
            )
            memory_critical = st.slider(
                "Memory Critical Threshold (%)",
                min_value=20, max_value=95,
                value=thresholds.get("memory_critical", 90),
                step=5,
            )
            disk_critical = st.slider(
                "Disk Critical Threshold (%)",
                min_value=20, max_value=95,
                value=thresholds.get("disk_critical", 90),
                step=5,
            )
            check_interval = st.slider(
                "Check Interval (seconds)",
                min_value=10, max_value=300,
                value=health_service.config.get("check_interval", 60),
                step=10,
            )
            streamlit_port_update = st.number_input(
                "Streamlit Server Port",
                value=health_service.config.get("streamlit_port", 8501),
                step=1,
            )

        if st.button("Save Configuration"):
            # Update configuration (creating the "thresholds" section if it is missing)
            saved_thresholds = health_service.config.setdefault("thresholds", {})
            saved_thresholds["cpu_warning"] = cpu_warning
            saved_thresholds["cpu_critical"] = cpu_critical
            saved_thresholds["memory_warning"] = memory_warning
            saved_thresholds["memory_critical"] = memory_critical
            saved_thresholds["disk_warning"] = disk_warning
            saved_thresholds["disk_critical"] = disk_critical
            health_service.config["check_interval"] = check_interval
            health_service.config["streamlit_url"] = streamlit_url_update
            health_service.config["streamlit_port"] = streamlit_port_update

            # Save to file
            health_service.save_config()
            st.success("Configuration saved successfully")

            # The service is restarted on every save, not only when the interval changed.
            health_service.stop()
            health_service.start()


def _dashboard_latency_rating(latency):
    """Map a latency in milliseconds to a ``(color, label)`` pair for display."""
    if latency <= 50:
        return "green", "Excellent"
    if latency <= 100:
        return "blue", "Good"
    if latency <= 200:
        return "orange", "Fair"
    return "red", "Poor"


def _dashboard_status_cell_style(val):
    """Return the CSS for a "Status" table cell; unrecognised values get no styling."""
    styles = {
        "healthy": "background-color: #c6efce; color: #006100",
        "warning": "background-color: #ffeb9c; color: #9c5700",
        "critical": "background-color: #ffc7ce; color: #9c0006",
        "unknown": "background-color: #eeeeee; color: #7f7f7f",
    }
    # str() keeps a non-string status (e.g. None) from raising inside the styler.
    return styles.get(str(val).lower(), "")


def _dashboard_join_details(info, excluded_keys):
    """Join the non-dict values of ``info`` as "key: value" pairs, skipping ``excluded_keys``."""
    return ", ".join(
        f"{key}: {value}"
        for key, value in info.items()
        if key not in excluded_keys and not isinstance(value, dict)
    )


def _dashboard_render_status_table(rows, empty_message):
    """Show ``rows`` as a dataframe with a colored "Status" column, or ``empty_message`` if empty."""
    if not rows:
        st.info(empty_message)
        return
    df = pd.DataFrame(rows)
    st.dataframe(df.style.map(_dashboard_status_cell_style, subset=["Status"]))


def _dashboard_render_resource(label, data, size_fields=()):
    """Render the status heading, usage bar and usage/size lines for one system resource.

    ``size_fields`` is a sequence of ``(caption, key)`` pairs; each is shown as
    "<caption>: <data[key]> GB" with 0 used when the key is absent.
    """
    status = data.get("status", "unknown")
    color = {"healthy": "green", "warning": "orange", "critical": "red"}.get(status, "gray")
    st.markdown(
        f"### {label} Status: <span style='color:{color}'>{status.upper()}</span>",
        unsafe_allow_html=True,
    )
    usage = data.get("usage_percent", 0)
    # st.progress only accepts floats in [0.0, 1.0]; clamp so an odd reading cannot break the page.
    st.progress(min(max(usage / 100, 0.0), 1.0))
    st.text(f"{label} Usage: {usage}%")
    for caption, key in size_fields:
        st.text(f"{caption}: {data.get(key, 0)} GB")


def _dashboard_render_page_errors(errors_by_page):
    """Render one expander per recorded error, grouped by the page it was recorded under."""
    for page_name, page_errors in errors_by_page.items():
        # Show only the last path component of the page name
        display_name = page_name.split("/")[-1]

        # A page whose error list is None is treated as having no errors.
        for error_info in page_errors or []:
            if not isinstance(error_info, dict):
                continue
            with st.expander(f"Error in {display_name}"):
                st.error(error_info.get('error', 'Unknown error'))

                # Show additional error details
                if error_info.get('type') == 'streamlit_error':
                    st.text("Type: Streamlit Error")
                else:
                    st.text("Type: Exception")

                st.text("Traceback:")
                # A missing, None or empty traceback falls back to the placeholder text.
                traceback_lines = error_info.get('traceback') or ['No traceback available']
                st.code("".join(traceback_lines))
                st.text(f"Timestamp: {error_info.get('timestamp', 'No timestamp')}")

Displays an interactive Streamlit dashboard for monitoring application health. This function initializes and manages a health check service, presenting real-time system metrics, dependency statuses, custom checks, and Streamlit page health in a user-friendly dashboard. Users can manually refresh health checks, view detailed error information, and adjust configuration thresholds and intervals directly from the UI.

Args:
    config_path (str, optional): Path to the health check configuration JSON file. Defaults to "health_check_config.json".

Features:
    - Displays overall health status with color-coded indicators.
    - Shows last updated timestamp for health data.
    - Monitors Streamlit server status, latency, and errors.
    - Provides tabs for:
        * System Resources (CPU, Memory, Disk usage and status)
        * Dependencies (external services and their health)
        * Custom Checks (user-defined health checks)
        * Streamlit Pages (page-specific errors and status)
    - Allows configuration of system thresholds, check intervals, and Streamlit server settings.
    - Supports manual refresh and saving configuration changes.

Raises:
    Displays error messages in the UI for any exceptions encountered during health data retrieval or processing.

Returns:
    None. The dashboard is rendered in the Streamlit app.