#!/usr/bin/env python3
"""Generate ARCHITECTURE.md using hypergumbo to analyze itself.

This script uses hypergumbo's own analysis capabilities to generate architecture
documentation that stays in sync with the code.

Usage:
    ./scripts/generate-architecture              # Generate docs/ARCHITECTURE.md
    ./scripts/generate-architecture --check      # Verify docs are up-to-date (for CI)

The generated documentation includes:
- Sketch overview (from hypergumbo sketch)
- Most-connected symbols (from hypergumbo run)
- Module reference with docstrings
"""

from __future__ import annotations

import argparse
import ast
import json
import os
import platform
import re
import subprocess
import sys
import tokenize
from collections import defaultdict
from pathlib import Path

# Try to import a TOML parser for reading pyproject.toml
try:
    import tomllib  # Python 3.11+
except ImportError:
    try:
        import tomli as tomllib  # Fallback for Python 3.9-3.10
    except ImportError:
        tomllib = None  # type: ignore[assignment]

REPO_ROOT = Path(__file__).parent.parent


def get_git_commit_sha() -> str | None:
    """Get the current git commit SHA."""
    try:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            capture_output=True,
            text=True,
            cwd=REPO_ROOT,
        )
        if result.returncode == 0:
            return result.stdout.strip()[:12]  # Short SHA
    except Exception:
        pass
    return None


def get_hypergumbo_version() -> str:
    """Get the hypergumbo package version from pyproject.toml.

    Reads directly from pyproject.toml rather than importing the package,
    ensuring we get the repo version rather than a potentially different
    installed version.
    """
    pyproject_path = REPO_ROOT / "pyproject.toml"
    if not pyproject_path.exists():
        return "unknown"

    try:
        content = pyproject_path.read_text(encoding="utf-8")

        # Try TOML parser first (more reliable)
        if tomllib is not None:
            data = tomllib.loads(content)
            return data.get("project", {}).get("version", "unknown")

        # Fallback: simple regex for version = "x.y.z" line
        # This is reliable enough for the simple pyproject.toml format
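        # Illustrative target (version value is made up): the regex matches a line such as
        #   version = "0.4.2"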
        match = re.search(r'^version\s*=\s*["\']([^"\']+)["\']', content, re.MULTILINE)
        if match:
            return match.group(1)
    except Exception:
        pass
    return "unknown"


def run_command(args: list[str], description: str) -> subprocess.CompletedProcess:
    """Run a command with proper error handling.

    Args:
        args: Command and arguments to run
        description: Human-readable description for error messages

    Returns:
        CompletedProcess on success

    Raises:
        SystemExit: If command fails
    """
    # Set PYTHONPATH to include src/ for module imports
    env = os.environ.copy()
    src_path = str(REPO_ROOT / "src")
    env["PYTHONPATH"] = src_path + os.pathsep + env.get("PYTHONPATH", "")

    result = subprocess.run(
        args,
        capture_output=True,
        text=True,
        cwd=REPO_ROOT,
        env=env,
    )

    if result.returncode != 0:
        print(f"ERROR: {description} failed with exit code {result.returncode}",
              file=sys.stderr)
        print(f"Command: {' '.join(args)}", file=sys.stderr)
        if result.stderr:
            print(f"stderr: {result.stderr}", file=sys.stderr)
        sys.exit(1)

    return result


def run_hypergumbo_sketch() -> str:
    """Run hypergumbo sketch on itself."""
    result = run_command(
        [sys.executable, "-m", "hypergumbo", "sketch", str(REPO_ROOT / "src"),
         "-t", "1500", "-x"],
        "hypergumbo sketch",
    )
    return result.stdout


def run_hypergumbo_analysis() -> dict:
    """Run hypergumbo full analysis on itself."""
    output_file = REPO_ROOT / ".hypergumbo" / "self-analysis.json"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Delete stale analysis file to avoid reusing old data on failure
    if output_file.exists():
        output_file.unlink()

    run_command(
        [sys.executable, "-m", "hypergumbo", "run", str(REPO_ROOT / "src"),
         "--out", str(output_file), "--first-party-only"],
        "hypergumbo run",
    )

    if not output_file.exists():
        print(f"ERROR: hypergumbo run succeeded but output file not created: {output_file}",
              file=sys.stderr)
        sys.exit(1)

    return json.loads(output_file.read_text(encoding="utf-8"))


def extract_module_docstring(file_path: Path) -> str | None:
    """Extract the module docstring from a Python file.

    Uses tokenize.open() to respect PEP 263 encoding declarations,
    ensuring correct handling of files with non-UTF8 encodings.
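
    For example, a file beginning with a ``# -*- coding: latin-1 -*-``
    declaration is decoded as latin-1 rather than assumed to be UTF-8.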
    """
    try:
        with tokenize.open(file_path) as f:
            content = f.read()
        tree = ast.parse(content)
        return ast.get_docstring(tree)
    except Exception:
        return None


def categorize_modules() -> dict[str, list[tuple[str, str | None]]]:
    """Categorize modules into logical groups with docstrings."""
    src_root = REPO_ROOT / "src" / "hypergumbo"
    categories: dict[str, list[tuple[str, str | None]]] = {
        "Core": [],
        "Analyzers": [],
        "Linkers": [],
        "CLI & I/O": [],
    }

    for py_file in sorted(src_root.rglob("*.py")):
        # Skip private modules except __init__.py and __main__.py
        if py_file.name.startswith("_") and py_file.name not in {"__init__.py", "__main__.py"}:
            continue

        rel_path = py_file.relative_to(src_root)
        parts = list(rel_path.parts)
        if parts[-1] == "__init__.py":
            continue  # Skip __init__ files in listing

        # Use with_suffix to properly remove only the .py extension
        # (avoids bug where analyze/py.py became "analyze" via .replace(".py", ""))
        module_path = rel_path.with_suffix("")
        module_name = ".".join(module_path.parts)
        docstring = extract_module_docstring(py_file)

        # Use parts[0] check to avoid false positives from files containing
        # "analyze" or "linkers" elsewhere in their path
        if parts and parts[0] == "analyze":
            categories["Analyzers"].append((module_name, docstring))
        elif parts and parts[0] == "linkers":
            categories["Linkers"].append((module_name, docstring))
        elif module_path.name in {"cli", "export", "schema", "plan", "sketch", "__main__"}:
            categories["CLI & I/O"].append((module_name, docstring))
        else:
            categories["Core"].append((module_name, docstring))

    return categories


def get_top_symbols(analysis: dict, limit: int = 20) -> list[dict]:
    """Get the top symbols by in-degree centrality."""
    # Count incoming edges per symbol
    in_degree: dict[str, int] = defaultdict(int)
    for edge in analysis.get("edges", []):
        dst = edge.get("dst", "")
        if dst:
            in_degree[dst] += 1

    # Get symbol details
    symbols_by_id = {s["id"]: s for s in analysis.get("nodes", [])}

    # Sort by in-degree with deterministic tie-breaking (by symbol ID)
    sorted_symbols = sorted(in_degree.items(), key=lambda x: (-x[1], x[0]))[:limit]

    result = []
    for sym_id, count in sorted_symbols:
        if sym_id in symbols_by_id:
            sym = symbols_by_id[sym_id]
            result.append({
                "name": sym.get("name", ""),
                "kind": sym.get("kind", ""),
                "path": sym.get("path", ""),
                "in_degree": count,
            })

    return result


def generate_architecture_md() -> str:
    """Generate the ARCHITECTURE.md content."""
    print("Running hypergumbo sketch on itself...", file=sys.stderr)
    sketch = run_hypergumbo_sketch()

    print("Running hypergumbo analysis on itself...", file=sys.stderr)
    analysis = run_hypergumbo_analysis()

    categories = categorize_modules()
    top_symbols = get_top_symbols(analysis)

    # Count stats
    analyzer_count = len(categories.get("Analyzers", []))
    linker_count = len(categories.get("Linkers", []))
    total_modules = sum(len(v) for v in categories.values())
    node_count = len(analysis.get("nodes", []))
    edge_count = len(analysis.get("edges", []))

    # Get metadata for drift detection
    commit_sha = get_git_commit_sha()
    hypergumbo_version = get_hypergumbo_version()
    python_version = platform.python_version()

    lines = [
        "# Architecture",
        "",
        "> **Auto-generated** by running hypergumbo on itself.",
        "> Run `./scripts/generate-architecture` to update.",
        "",
        "<!--",
        "GENERATION METADATA (for drift detection):",
        f"  commit: {commit_sha or 'unknown'}",
        f"  hypergumbo: {hypergumbo_version}",
        f"  python: {python_version}",
        "-->",
        "",
        "## Self-Analysis Summary",
        "",
        f"hypergumbo analyzed its own source code and found:",
        f"- **{total_modules}** Python modules ({analyzer_count} analyzers, {linker_count} linkers)",
        f"- **{node_count}** symbols (functions, classes, methods)",
        f"- **{edge_count}** edges (calls, imports, instantiates)",
        "",
        "## Sketch (hypergumbo on hypergumbo)",
        "",
        "```markdown",
        sketch.strip(),
        "```",
        "",
        "## Data Flow",
        "",
        "```",
        "Source Files",
        "     │",
        "     ▼",
        "┌─────────────┐     ┌─────────────┐",
        "│  discovery  │────▶│   profile   │  Detect languages, frameworks",
        "└─────────────┘     └─────────────┘",
        "     │                    │",
        "     ▼                    ▼",
        "┌─────────────┐     ┌─────────────┐",
        f"│  analyzers  │────▶│     IR      │  {node_count} Symbols + {edge_count} Edges",
        "└─────────────┘     └─────────────┘",
        "     │                    │",
        "     ▼                    ▼",
        "┌─────────────┐     ┌─────────────┐",
        "│   linkers   │────▶│   merged    │  Cross-language edges",
        "└─────────────┘     └─────────────┘",
        "                          │",
        "          ┌───────────────┼───────────────┐",
        "          ▼               ▼               ▼",
        "    ┌──────────┐   ┌──────────┐   ┌──────────┐",
        "    │  sketch  │   │   run    │   │  slice   │",
        "    │ Markdown │   │   JSON   │   │ subgraph │",
        "    └──────────┘   └──────────┘   └──────────┘",
        "```",
        "",
    ]

    # Top symbols section
    if top_symbols:
        lines.extend([
            "## Most-Connected Symbols",
            "",
            "These symbols have the highest in-degree (most referenced by other symbols):",
            "",
            "| Symbol | Kind | In-Degree | Location |",
            "|--------|------|-----------|----------|",
        ])
        for sym in top_symbols[:15]:
            path = Path(sym["path"]).name if sym["path"] else ""
            lines.append(f"| `{sym['name']}` | {sym['kind']} | {sym['in_degree']} | {path} |")
        lines.append("")

    # Module reference
    lines.extend([
        "## Module Reference",
        "",
    ])

    for category, mods in categories.items():
        if not mods:
            continue
        lines.append(f"### {category}")
        lines.append("")

        for name, docstring in mods:
            # Extract first non-empty line of docstring
            if docstring:
                first_line = next(
                    (ln.strip() for ln in docstring.splitlines() if ln.strip()),
                    "(no docstring)"
                )
                if len(first_line) > 70:
                    first_line = first_line[:67] + "..."
            else:
                first_line = "(no docstring)"

            lines.append(f"- **`{name}`**: {first_line}")

        lines.append("")

    lines.extend([
        "## Key Abstractions",
        "",
        "> **Note:** This section is manually maintained. Update if IR classes change.",
        "",
        "### Symbol (`ir.py`)",
        "Represents a code entity (function, class, method, etc.) with:",
        "- `id`: Unique identifier within the analysis",
        "- `name`: Human-readable name",
        "- `kind`: Type of symbol (function, class, method, etc.)",
        "- `path`: File path",
        "- `span`: Location in source (start/end line/column)",
        "- `stable_id`: Cross-run stable identifier",
        "- `supply_chain`: Object with `tier` (1-4), `tier_name`, and `reason`",
        "",
        "### Edge (`ir.py`)",
        "Represents a relationship between symbols:",
        "- `src`, `dst`: Source and destination symbol IDs",
        "- `type`: Relationship type (calls, imports, instantiates, etc.)",
        "- `confidence`: 0.0-1.0 confidence score",
        "- `meta.evidence_type`: How the edge was detected",
        "",
        "### AnalysisRun (`ir.py`)",
        "Provenance tracking for reproducibility:",
        "- `pass`: Which analyzer produced this data",
        "- `execution_id`: Unique run identifier",
        "- `duration_ms`: Analysis time",
        "- `files_analyzed`: Count of processed files",
        "",
        "## Adding a New Analyzer",
        "",
        "1. Create `src/hypergumbo/analyze/<language>.py`",
        "2. Implement `analyze(root: Path) -> AnalysisResult`",
        "3. Return symbols and edges following IR conventions",
        "4. Add tests in `tests/test_<language>_analyzer.py`",
        "5. Register in `catalog.py` if needed",
        "",
        "## Adding a New Linker",
        "",
        "1. Create `src/hypergumbo/linkers/<name>.py`",
        "2. Implement `link_<name>(root: Path) -> LinkResult`",
        "3. Match patterns across existing symbols",
        "4. Create cross-language edges",
        "5. Add tests in `tests/test_<name>_linker.py`",
        "",
        "---",
        "",
        "*Generated by `./scripts/generate-architecture` using hypergumbo self-analysis.*",
    ])

    return "\n".join(lines)


def main() -> int:
    parser = argparse.ArgumentParser(description="Generate ARCHITECTURE.md")
    parser.add_argument("--check", action="store_true", help="Check if docs are up-to-date")
    args = parser.parse_args()

    output_path = REPO_ROOT / "docs" / "ARCHITECTURE.md"

    if args.check:
        if not output_path.exists():
            print("ERROR: docs/ARCHITECTURE.md does not exist")
            print("Run: ./scripts/generate-architecture")
            return 1

        # Light check: verify file exists and has auto-generated markers
        # Full content comparison is not reliable across environments because
        # hypergumbo output can vary by Python version, tree-sitter builds, etc.
        content = output_path.read_text(encoding="utf-8")
        if "Auto-generated" not in content:
            print("ERROR: docs/ARCHITECTURE.md doesn't appear to be auto-generated")
            return 1
        if "hypergumbo self-analysis" not in content:
            print("ERROR: docs/ARCHITECTURE.md is missing expected content")
            return 1

        # Check for commit SHA drift (warn-only, not a hard failure)
        # This helps detect when docs may be stale relative to code changes
        current_sha = get_git_commit_sha()
        if current_sha:
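            # The regex targets the metadata comment written by generate_architecture_md(),
            # which contains a line like "  commit: <short sha>".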
            sha_match = re.search(r"commit:\s*(\w+)", content)
            if sha_match:
                doc_sha = sha_match.group(1)
                if doc_sha != current_sha:
                    print(f"WARNING: docs generated at commit {doc_sha}, current HEAD is {current_sha}")
                    print("Consider regenerating: ./scripts/generate-architecture")
            else:
                print("WARNING: could not extract commit SHA from ARCHITECTURE.md")

        print("OK: docs/ARCHITECTURE.md exists and appears valid")
        return 0

    generated = generate_architecture_md()
    output_path.write_text(generated, encoding="utf-8")
    print(f"Generated {output_path}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
