#!/usr/bin/env python3
"""
Agent Observability: Error Diagnostics
Target: <15 seconds execution

Deep dive into error patterns:
- Error type distribution
- Failed agent analysis
- Error timeline
- Root cause patterns
"""

import argparse
import sys
from pathlib import Path

# Make the sibling "_shared" directory importable. SKILLS_DIR is the parent of
# the directory containing this script; "_shared" is put first on sys.path so
# it wins over any same-named module elsewhere on the path.
SKILLS_DIR = Path(__file__).parent.parent
sys.path.insert(0, str(SKILLS_DIR / "_shared"))

# Must come after the sys.path tweak above.
# NOTE(review): db_helper is presumably located in SKILLS_DIR/_shared - confirm.
from db_helper import execute_query


def _run_windowed_query(label, select_sql, tail_sql, interval, agent_name):
    """Run one time-windowed query, optionally narrowed to a single agent.

    Args:
        label: Human-readable section name used in the failure warning.
        select_sql: Query text up to and including the time-window predicate;
            it must contain one ``%s`` placeholder for the interval.
        tail_sql: Trailing GROUP BY / ORDER BY / LIMIT clauses.
        interval: Interval text bound to the first placeholder (e.g. "24 hours").
        agent_name: When truthy, an ``AND agent_name = %s`` predicate is added.

    Returns:
        The ``rows`` entry of the ``execute_query`` result, or ``[]`` when the
        result's ``success`` entry is falsy.
    """
    params = [interval]
    sql = select_sql
    if agent_name:
        sql += " AND agent_name = %s"
        params.append(agent_name)
    sql += tail_sql

    result = execute_query(sql, tuple(params), fetch=True)
    if result["success"]:
        return result["rows"]

    # Keep the best-effort contract (an empty section), but make the failure
    # visible: otherwise a failed query is indistinguishable from "no errors".
    print(f"⚠️ {label} query failed; section will be empty", file=sys.stderr)
    return []


def get_error_metrics(time_range="24h", agent_name=None):
    """Get comprehensive error metrics.

    Args:
        time_range: One of "1h", "24h", "7d", "30d". Any other value falls
            back to a 24-hour window.
        agent_name: Optional agent name; when truthy every query is restricted
            to rows with that ``agent_name``.

    Returns:
        A dict with four lists of rows:
        ``error_types`` (top 10 error types by count),
        ``failed_agents`` (top 10 agents by error count),
        ``error_examples`` (5 most recent error rows) and
        ``error_timeline`` (up to 24 hourly buckets, newest first).
        A section whose query fails is returned as an empty list and a
        warning is written to stderr.
    """
    # Parse time range
    interval_map = {"1h": "1 hour", "24h": "24 hours", "7d": "7 days", "30d": "30 days"}
    interval = interval_map.get(time_range, "24 hours")

    # Error type distribution
    error_types = _run_windowed_query(
        "Error type distribution",
        """
        SELECT
            error_type,
            COUNT(*) as count,
            ROUND(AVG(duration_ms)::numeric, 0) as avg_duration_ms,
            MAX(started_at) as last_occurrence
        FROM agent_execution_logs
        WHERE status = 'error'
        AND started_at > NOW() - INTERVAL %s
    """,
        """
        GROUP BY error_type
        ORDER BY count DESC
        LIMIT 10
    """,
        interval,
        agent_name,
    )

    # Failed agents
    failed_agents = _run_windowed_query(
        "Failed agents",
        """
        SELECT
            agent_name,
            COUNT(*) as error_count,
            COUNT(*) FILTER (WHERE error_type IS NOT NULL) as typed_errors,
            ARRAY_AGG(DISTINCT error_type) FILTER (WHERE error_type IS NOT NULL) as error_types
        FROM agent_execution_logs
        WHERE status = 'error'
        AND started_at > NOW() - INTERVAL %s
    """,
        """
        GROUP BY agent_name
        ORDER BY error_count DESC
        LIMIT 10
    """,
        interval,
        agent_name,
    )

    # Recent error examples with messages
    error_examples = _run_windowed_query(
        "Recent error examples",
        """
        SELECT
            agent_name,
            error_type,
            error_message,
            started_at,
            duration_ms
        FROM agent_execution_logs
        WHERE status = 'error'
        AND started_at > NOW() - INTERVAL %s
    """,
        """
        ORDER BY started_at DESC
        LIMIT 5
    """,
        interval,
        agent_name,
    )

    # Error rate over time (hourly buckets); counts all statuses so the rate
    # has a denominator, hence no status predicate in the WHERE clause.
    error_timeline = _run_windowed_query(
        "Error timeline",
        """
        SELECT
            DATE_TRUNC('hour', started_at) as hour,
            COUNT(*) FILTER (WHERE status = 'error') as errors,
            COUNT(*) as total,
            ROUND((COUNT(*) FILTER (WHERE status = 'error')::numeric / COUNT(*) * 100), 1) as error_rate
        FROM agent_execution_logs
        WHERE started_at > NOW() - INTERVAL %s
    """,
        """
        GROUP BY hour
        ORDER BY hour DESC
        LIMIT 24
    """,
        interval,
        agent_name,
    )

    return {
        "error_types": error_types,
        "failed_agents": failed_agents,
        "error_examples": error_examples,
        "error_timeline": error_timeline,
    }


def format_output(metrics, time_range, agent_name):
    """Format error diagnostics as a Markdown report for Claude readability.

    Args:
        metrics: Mapping with the keys ``error_types``, ``failed_agents``,
            ``error_examples`` and ``error_timeline``; each value is a list of
            row mappings shaped like those built by ``get_error_metrics``.
        time_range: Time-range label echoed in the header (e.g. "24h").
        agent_name: Agent filter echoed in the header when truthy.

    Returns:
        The report as one newline-joined string. When the ``count`` values in
        ``error_types`` sum to zero, only a short "no errors" report is
        returned.
    """
    output = [
        "# Agent Error Diagnostics",
        "",
        f"**Time Range**: {time_range}",
    ]
    if agent_name:
        output.append(f"**Filtered Agent**: {agent_name}")
    output.append("")

    total_errors = sum(row["count"] for row in metrics["error_types"])

    if total_errors == 0:
        output.append("## 🟢 No Errors Detected")
        output.append("")
        output.append(f"No errors found in the last {time_range}. System is healthy!")
        return "\n".join(output)

    output.append(f"## 🔴 Total Errors: {total_errors}")
    output.append("")

    # Error Type Distribution
    if metrics["error_types"]:
        output.append("### Error Type Distribution")
        output.append("| Error Type | Count | % of Errors | Avg Duration | Last Seen |")
        output.append("|------------|-------|-------------|--------------|-----------|")
        for row in metrics["error_types"]:
            error_type = row["error_type"] or "Unknown"
            count = row["count"]
            # total_errors is non-zero here (the zero case returned above).
            pct = count / total_errors * 100
            avg_duration = row["avg_duration_ms"]
            # Compare against None so a genuine 0 ms average is shown as "0ms".
            duration = f"{avg_duration:.0f}ms" if avg_duration is not None else "N/A"
            last = (
                row["last_occurrence"].strftime("%H:%M")
                if row["last_occurrence"]
                else "N/A"
            )
            output.append(
                f"| {error_type[:40]} | {count} | {pct:.1f}% | {duration} | {last} |"
            )
        output.append("")

    # Failed Agents
    if metrics["failed_agents"]:
        output.append("### Failed Agents")
        output.append("| Agent | Error Count | Error Types |")
        output.append("|-------|-------------|-------------|")
        for row in metrics["failed_agents"]:
            agent = row["agent_name"] or "Unknown"
            count = row["error_count"]
            # Show at most three error types, then cap the cell at 50 chars.
            types = ", ".join(row["error_types"][:3]) if row["error_types"] else "N/A"
            output.append(f"| {agent} | {count} | {types[:50]} |")
        output.append("")

    # Error Timeline
    if metrics["error_timeline"]:
        output.append("### Error Rate Timeline (Recent Hours)")
        output.append("| Hour | Errors | Total | Error Rate |")
        output.append("|------|--------|-------|------------|")
        for row in metrics["error_timeline"][:12]:  # First 12 buckets only
            hour = row["hour"].strftime("%m-%d %H:%M")
            rate = row["error_rate"]
            indicator = "🔴" if rate > 30 else "🟡" if rate > 20 else "🟢"
            output.append(
                f"| {hour} | {row['errors']} | {row['total']} | {indicator} {rate}% |"
            )
        output.append("")

    # Recent Error Examples
    if metrics["error_examples"]:
        output.append("### Recent Error Examples")
        for i, row in enumerate(metrics["error_examples"][:3], 1):
            output.append(f"#### Example {i}")
            output.append(f"- **Agent**: {row['agent_name'] or 'Unknown'}")
            output.append(f"- **Type**: {row['error_type'] or 'Unknown'}")
            output.append(
                f"- **Time**: {row['started_at'].strftime('%Y-%m-%d %H:%M:%S')}"
            )
            if row["error_message"]:
                msg = row["error_message"][:200]
                output.append(f"- **Message**: `{msg}`")
            output.append("")

    # Recommendations: collect first, number afterwards, so the list stays
    # sequential (1, 2, ...) even when one of the items does not apply.
    output.append("## 💡 Recommended Actions")
    recommendations = []
    if metrics["error_types"]:
        top_error = metrics["error_types"][0]
        recommendations.append(
            f"**Investigate top error**: {top_error['error_type'] or 'Unknown'} "
            f"({top_error['count']} occurrences)"
        )

    if metrics["failed_agents"]:
        top_agent = metrics["failed_agents"][0]
        recommendations.append(
            f"**Focus on agent**: {top_agent['agent_name'] or 'Unknown'} "
            f"({top_agent['error_count']} failures)"
        )

    # Only the first three timeline buckets are checked for the critical flag.
    if any(row["error_rate"] > 30 for row in metrics["error_timeline"][:3]):
        recommendations.append(
            "**Critical**: Error rate >30% detected in recent hours - immediate attention required"
        )

    output.extend(
        f"{number}. {text}" for number, text in enumerate(recommendations, 1)
    )
    output.append("")

    return "\n".join(output)


def main():
    """Parse CLI arguments, run the diagnostics, and print the report.

    Returns:
        0 after the report has been printed to stdout; 1 if any exception was
        raised, in which case the message and traceback go to stderr.
    """
    cli = argparse.ArgumentParser(description="Diagnose agent execution errors")
    cli.add_argument(
        "--time-range",
        choices=["1h", "24h", "7d", "30d"],
        default="24h",
        help="Time range for analysis",
    )
    cli.add_argument("--agent", help="Filter by specific agent name")
    options = cli.parse_args()

    try:
        report = format_output(
            get_error_metrics(options.time_range, options.agent),
            options.time_range,
            options.agent,
        )
        print(report)
    except Exception as exc:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"❌ Error diagnostics failed: {exc}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        return 1
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
