#!/usr/bin/env python3
"""
Agent Observability: Quick Health Check
Target: <5 seconds execution

Provides rapid health status of agent execution system:
- Overall success rate
- Critical alerts
- Unprocessed events
- Recent failures
"""

import sys
from pathlib import Path

# Add shared utilities to path
SKILLS_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(SKILLS_DIR / "_shared"))

from db_helper import execute_query


def get_health_metrics():
    """Collect the core health metrics from the observability tables.

    Issues four SELECT queries through ``execute_query`` and reduces each
    result (a mapping with ``"success"`` and ``"rows"`` entries) to the part
    the report needs.

    Returns:
        A dict with the keys:

        - ``"execution"``: the single aggregate row of execution stats for
          the last 24 hours, or ``None`` if the query failed or returned
          no rows.
        - ``"hooks"``: the single aggregate row describing unprocessed hook
          events, or ``None`` if the query failed or returned no rows.
        - ``"recent_errors"``: up to three rows of (error_type, count) for
          the last hour, or ``[]`` if the query failed.
        - ``"routing"``: per-strategy routing rows for the last 24 hours,
          or ``[]`` if the query failed.
    """

    def first_row(result):
        # A failed query and an empty result set both mean "no data".
        return result["rows"][0] if result["success"] and result["rows"] else None

    def all_rows(result):
        # Failed queries degrade to an empty list so callers can iterate.
        return result["rows"] if result["success"] else []

    # Overall execution stats (last 24 hours)
    execution_stats = execute_query(
        """
        SELECT
            COUNT(*) as total_executions,
            COUNT(*) FILTER (WHERE status = 'success') as successful,
            COUNT(*) FILTER (WHERE status = 'error') as failed,
            COUNT(*) FILTER (WHERE status = 'in_progress') as in_progress,
            ROUND(AVG(duration_ms)::numeric, 0) as avg_duration_ms,
            MAX(started_at) as last_execution
        FROM agent_execution_logs
        WHERE started_at > NOW() - INTERVAL '24 hours'
        """,
        fetch=True,
    )

    # Unprocessed hook events.
    # The oldest pending event is the one with the smallest created_at,
    # so the alias "oldest_unprocessed" must be computed with MIN, not MAX.
    hook_stats = execute_query(
        """
        SELECT
            COUNT(*) as unprocessed_count,
            COUNT(*) FILTER (WHERE retry_count > 0) as retry_count,
            MIN(created_at) as oldest_unprocessed
        FROM hook_events
        WHERE processed = FALSE
        """,
        fetch=True,
    )

    # Recent errors (last hour), top three error types by frequency
    recent_errors = execute_query(
        """
        SELECT
            error_type,
            COUNT(*) as count
        FROM agent_execution_logs
        WHERE status = 'error'
        AND started_at > NOW() - INTERVAL '1 hour'
        GROUP BY error_type
        ORDER BY count DESC
        LIMIT 3
        """,
        fetch=True,
    )

    # Routing stats (last 24 hours)
    routing_stats = execute_query(
        """
        SELECT
            routing_strategy,
            COUNT(*) as count,
            ROUND(AVG(confidence_score)::numeric, 2) as avg_confidence
        FROM agent_routing_decisions
        WHERE created_at > NOW() - INTERVAL '24 hours'
        GROUP BY routing_strategy
        ORDER BY count DESC
        """,
        fetch=True,
    )

    return {
        "execution": first_row(execution_stats),
        "hooks": first_row(hook_stats),
        "recent_errors": all_rows(recent_errors),
        "routing": all_rows(routing_stats),
    }


def calculate_health_status(metrics):
    """Derive the overall health verdict from the collected metrics.

    Args:
        metrics: mapping with an ``"execution"`` entry (aggregate row or a
            falsy value when unavailable) and a ``"hooks"`` entry (aggregate
            row or a falsy value when unavailable).

    Returns:
        A ``(emoji, label, message)`` tuple where ``label`` is one of
        ``"CRITICAL"``, ``"WARNING"`` or ``"HEALTHY"``.
    """
    execution = metrics["execution"]
    if not execution:
        return "🔴", "CRITICAL", "No execution data available"

    total = execution["total_executions"]
    if total == 0:
        return "🟡", "WARNING", "No executions in last 24 hours"

    # Percentage of executions in the window that finished successfully.
    if total > 0:
        success_rate = (execution["successful"] / total) * 100
    else:
        success_rate = 0

    # Missing hook data is treated as an empty backlog.
    hooks = metrics["hooks"]
    unprocessed = hooks["unprocessed_count"] if hooks else 0

    # Checks run from most to least severe; the first match wins.
    if success_rate < 70:
        verdict = (
            "🔴",
            "CRITICAL",
            f"Success rate critically low: {success_rate:.1f}%",
        )
    elif unprocessed > 200:
        verdict = (
            "🔴",
            "CRITICAL",
            f"Unprocessed hook events critical: {unprocessed}",
        )
    elif success_rate < 80:
        verdict = (
            "🟡",
            "WARNING",
            f"Success rate below target: {success_rate:.1f}%",
        )
    elif unprocessed > 50:
        verdict = ("🟡", "WARNING", f"Elevated unprocessed events: {unprocessed}")
    else:
        verdict = (
            "🟢",
            "HEALTHY",
            f"System healthy: {success_rate:.1f}% success rate",
        )
    return verdict


def format_output(metrics):
    """Render the health metrics as a Markdown report.

    Args:
        metrics: mapping with the entries ``"execution"`` and ``"hooks"``
            (aggregate rows, or falsy when unavailable) and
            ``"recent_errors"`` and ``"routing"`` (lists of rows, possibly
            empty).

    Returns:
        The report as a single newline-joined string. Sections whose data
        is missing or empty are omitted; the "Recommended Actions" section
        appears only for CRITICAL/WARNING states that have at least one
        action to list.
    """
    status_emoji, status_text, status_message = calculate_health_status(metrics)

    output = [
        "# Agent System Health Check",
        "",
        f"## {status_emoji} Status: {status_text}",
        f"**{status_message}**",
        "",
        # Key Metrics
        "## Key Metrics (Last 24 Hours)",
        "",
    ]

    exec_m = metrics["execution"]
    if exec_m:
        total = exec_m["total_executions"]
        success_rate = (exec_m["successful"] / total * 100) if total > 0 else 0
        avg_dur = exec_m["avg_duration_ms"]

        output.append("### Execution Summary")
        output.append("| Metric | Value |")
        output.append("|--------|-------|")
        output.append(f"| Total Executions | {total} |")
        output.append(f"| Success Rate | {success_rate:.1f}% |")
        output.append(f"| Successful | {exec_m['successful']} |")
        output.append(f"| Failed | {exec_m['failed']} |")
        output.append(f"| In Progress | {exec_m['in_progress']} |")
        # The average is None when there is nothing to average over.
        output.append(
            f"| Avg Duration | {avg_dur:.0f}ms |"
            if avg_dur is not None
            else "| Avg Duration | N/A |"
        )
        output.append("")

    hook_m = metrics["hooks"]
    if hook_m:
        output.append("### Hook Events")
        output.append("| Metric | Value |")
        output.append("|--------|-------|")
        output.append(f"| Unprocessed | {hook_m['unprocessed_count']} |")
        output.append(f"| With Retries | {hook_m['retry_count']} |")
        output.append("")

    if metrics["routing"]:
        output.append("### Routing Intelligence")
        output.append("| Strategy | Count | Avg Confidence |")
        output.append("|----------|-------|----------------|")
        for row in metrics["routing"]:
            strategy = row["routing_strategy"] or "Unknown"
            # Compare against None explicitly: an average of exactly 0 is
            # falsy but is still a real value that must be displayed.
            avg_conf = row["avg_confidence"]
            conf_text = "N/A" if avg_conf is None else avg_conf
            output.append(f"| {strategy} | {row['count']} | {conf_text} |")
        output.append("")

    # Recent Errors
    if metrics["recent_errors"]:
        output.append("## 🔴 Recent Errors (Last Hour)")
        output.append("| Error Type | Count |")
        output.append("|------------|-------|")
        for row in metrics["recent_errors"]:
            output.append(f"| {row['error_type'] or 'Unknown'} | {row['count']} |")
        output.append("")

    # Recommendations
    if status_text in ("CRITICAL", "WARNING"):
        actions = []
        if status_text == "CRITICAL":
            actions.append(
                "- 🚨 **URGENT**: Run `/agent-observability/diagnose-errors` immediately"
            )
        if hook_m and hook_m["unprocessed_count"] > 50:
            actions.append("- Check hook processing system for failures")
        if metrics["recent_errors"]:
            actions.append("- Investigate top error types listed above")
        # Skip the section entirely rather than print a heading with no items.
        if actions:
            output.append("## 💡 Recommended Actions")
            output.extend(actions)
            output.append("")

    return "\n".join(output)


def main():
    """Run the health check and print the report to stdout.

    Returns:
        0 when the report was produced and printed, 1 when any exception
        was raised along the way (the error is reported on stderr).
    """
    try:
        print(format_output(get_health_metrics()))
    except Exception as exc:
        # Top-level boundary: turn any failure into a non-zero exit status.
        print(f"❌ Health check failed: {exc}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
