#!/usr/bin/env python3
"""Generate JSON Schema from hypergumbo's Python dataclasses.

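This script emits a formal JSON Schema for hypergumbo's behavior_map output,
modelled on the IR dataclasses (Symbol, Edge, Span, AnalysisRun). The schema
definitions are currently written by hand to mirror the serialized output; the
dataclass-introspection helpers defined in this file are not called by the
generator, so the definitions must be updated manually when the IR changes.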

Usage:
    ./scripts/generate-schema              # Generate docs/schema.json
    ./scripts/generate-schema --check      # Verify schema is up-to-date (for CI)

The generated schema can be used for:
- Validating hypergumbo output files
- IDE autocompletion for consumers
- Documentation in a standard format
"""

from __future__ import annotations

import argparse
import dataclasses
import json
import sys
import types
from pathlib import Path
from typing import (
    Any,
    Dict,
    List,
    Optional,
    Union,
    get_args,
    get_origin,
    get_type_hints,
)

# Add src to the import path so the hypergumbo package is importable from a checkout.
# resolve() makes the root absolute (and follows symlinks), so it stays correct
# even when __file__ is a relative path such as "generate-schema".
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT / "src"))

from hypergumbo.ir import AnalysisRun, Edge, Span, Symbol
from hypergumbo.schema import SCHEMA_VERSION


def python_type_to_json_schema(py_type: Any) -> Dict[str, Any]:
    """Convert a Python type annotation to a JSON Schema fragment.

    Supported annotations:

    - ``None`` / ``NoneType``: ``{"type": "null"}``
    - ``Optional[X]``: ``{"oneOf": [<X>, {"type": "null"}]}``
    - other unions (``Union[A, B]``, ``A | B``, with or without ``None``):
      ``{"anyOf": [...]}`` with one entry per member, ``null`` listed last
    - ``List[X]`` / ``list``: ``{"type": "array"}``, with ``items`` when the
      element type is given
    - ``Dict[K, V]`` / ``dict``: ``{"type": "object"}``
    - ``str``, ``int``, ``float``, ``bool``: the matching JSON primitive
    - dataclass types: a ``$ref`` into ``#/$defs`` keyed by the class name

    Args:
        py_type: The annotation object to convert.

    Returns:
        A new dict holding the schema fragment. Anything not listed above
        falls back to ``{"type": "object"}``.
    """
    none_type = type(None)
    if py_type is None or py_type is none_type:
        return {"type": "null"}

    origin = get_origin(py_type)
    args = get_args(py_type)

    # Only real unions may be treated as Optional. Checking for NoneType among
    # the args of *any* generic would misread e.g. Dict[str, None] as Optional[str].
    # PEP 604 unions (``X | None``) report types.UnionType, which needs Python 3.10+.
    pep604_union = getattr(types, "UnionType", None)
    if origin is Union or (pep604_union is not None and origin is pep604_union):
        members = [
            python_type_to_json_schema(arg) for arg in args if arg is not none_type
        ]
        includes_none = len(members) < len(args)
        if includes_none:
            members.append({"type": "null"})
        # Optional[X] keeps the established "oneOf" form (X and null never overlap).
        # Wider unions use "anyOf" because members may overlap: an integer value
        # satisfies both "integer" and "number", which "oneOf" would reject.
        keyword = "oneOf" if includes_none and len(members) == 2 else "anyOf"
        return {keyword: members}

    if origin is list or py_type is list:
        if args:
            return {"type": "array", "items": python_type_to_json_schema(args[0])}
        return {"type": "array"}

    if origin is dict or py_type is dict:
        return {"type": "object"}

    # Identity comparison keeps bool and int distinct even though bool subclasses int.
    for primitive, json_name in (
        (str, "string"),
        (int, "integer"),
        (float, "number"),
        (bool, "boolean"),
    ):
        if py_type is primitive:
            return {"type": json_name}

    # is_dataclass() is also true for instances, which have no __name__; only
    # classes can be referenced by name.
    if isinstance(py_type, type) and dataclasses.is_dataclass(py_type):
        return {"$ref": f"#/$defs/{py_type.__name__}"}

    # Unknown or unsupported annotation.
    return {"type": "object"}


def dataclass_to_json_schema(cls: type) -> Dict[str, Any]:
    """Convert a dataclass to a JSON Schema object definition.

    Each dataclass field becomes a property whose schema comes from
    ``python_type_to_json_schema``. A field is listed under ``required`` when
    it has neither a default nor a default factory and its annotation is not a
    union that includes ``None``.

    Args:
        cls: The dataclass to describe.

    Returns:
        A dict with ``"type": "object"`` and ``"properties"``; ``"required"``
        is included only when at least one field is required.
    """
    # When the defining module uses ``from __future__ import annotations``,
    # ``field.type`` is a plain string. Resolve the annotations to real types so
    # they are not all reported as generic objects; if they cannot be resolved
    # (e.g. names imported only for type checking), use the raw annotations.
    try:
        resolved_types = get_type_hints(cls)
    except (NameError, TypeError):
        resolved_types = {}

    # PEP 604 unions (``X | None``) report types.UnionType, which needs Python 3.10+.
    pep604_union = getattr(types, "UnionType", None)

    properties: Dict[str, Any] = {}
    required: List[str] = []

    for field in dataclasses.fields(cls):
        field_type = resolved_types.get(field.name, field.type)
        properties[field.name] = python_type_to_json_schema(field_type)

        # Only a genuine union containing None counts as optional; NoneType
        # appearing among the args of another generic (Dict[str, None]) does not.
        origin = get_origin(field_type)
        is_union = origin is Union or (
            pep604_union is not None and origin is pep604_union
        )
        is_optional = is_union and type(None) in get_args(field_type)

        has_default = (
            field.default is not dataclasses.MISSING
            or field.default_factory is not dataclasses.MISSING
        )
        if not has_default and not is_optional:
            required.append(field.name)

    schema: Dict[str, Any] = {
        "type": "object",
        "properties": properties,
    }
    if required:
        schema["required"] = required

    return schema


def generate_behavior_map_schema() -> Dict[str, Any]:
    """Assemble the JSON Schema (draft 2020-12) for the behavior_map view.

    The document is built from hand-written pieces: the top-level output
    properties plus ``$defs`` entries for Span, Symbol, Edge and AnalysisRun.
    Dict insertion order is deliberate, because the result is serialized in
    that order when written to (or compared against) docs/schema.json.

    Returns:
        A freshly built schema dict; no fragment is shared between positions.
    """

    # Small fragment builders. Each call returns a new dict so that no two
    # places in the schema alias the same object.
    def integer() -> Dict[str, Any]:
        return {"type": "integer"}

    def string() -> Dict[str, Any]:
        return {"type": "string"}

    def described(json_type: str, text: str) -> Dict[str, Any]:
        return {"type": json_type, "description": text}

    def nullable(json_type: str, text: str) -> Dict[str, Any]:
        return {
            "oneOf": [{"type": json_type}, {"type": "null"}],
            "description": text,
        }

    def bounded_integer(minimum: int, text: str) -> Dict[str, Any]:
        return {"type": "integer", "minimum": minimum, "description": text}

    def array_of(definition: str, text: str) -> Dict[str, Any]:
        return {
            "type": "array",
            "items": {"$ref": f"#/$defs/{definition}"},
            "description": text,
        }

    # ---- Top-level properties of the output document ----
    top_level_properties: Dict[str, Any] = {
        "schema_version": {
            "type": "string",
            "description": "Schema version (semver)",
            "const": SCHEMA_VERSION,
        },
        "confidence_model": described(
            "string", "Identifier for confidence scoring algorithm"
        ),
        "stable_id_scheme": described(
            "string", "Identifier for stable_id generation algorithm"
        ),
        "shape_id_scheme": described(
            "string", "Identifier for shape_id generation algorithm"
        ),
        "repo_fingerprint_scheme": described(
            "string", "Identifier for repo fingerprinting algorithm"
        ),
        "view": {
            "type": "string",
            "const": "behavior_map",
            "description": "Output view type",
        },
        "generated_at": {
            "type": "string",
            "format": "date-time",
            "description": "ISO-8601 timestamp when analysis was generated",
        },
        "analysis_incomplete": described(
            "boolean", "True if analysis was truncated or limited"
        ),
        "analysis_runs": array_of(
            "AnalysisRun", "Provenance tracking for each analysis pass"
        ),
        "profile": {
            "type": "object",
            "description": "Repository profile (languages, frameworks, etc.)",
            "properties": {
                "languages": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "object",
                        "properties": {
                            "files": integer(),
                            "lines": integer(),
                            "percentage": {"type": "number"},
                        },
                    },
                },
                "frameworks": {"type": "array", "items": string()},
                "total_files": integer(),
                "total_lines": integer(),
            },
        },
        "nodes": array_of("Symbol", "Code symbols (functions, classes, etc.)"),
        "edges": array_of("Edge", "Relationships between symbols"),
        "features": {
            "type": "array",
            "items": {"type": "object"},
            "description": "Extracted feature slices",
        },
        "metrics": described("object", "Aggregate analysis metrics"),
        "limits": described("object", "Known gaps and limitations"),
        "supply_chain_summary": {
            "type": "object",
            "description": "Summary of supply chain classification",
            "properties": {
                "by_tier": {
                    "type": "object",
                    "additionalProperties": {
                        "type": "object",
                        "properties": {
                            "files": integer(),
                            "symbols": integer(),
                        },
                    },
                },
            },
        },
    }

    # ---- $defs: Span (written by hand; the output shape differs from the dataclass) ----
    span_definition: Dict[str, Any] = {
        "type": "object",
        "description": "Source code location",
        "properties": {
            "start_line": bounded_integer(1, "Starting line number (1-indexed)"),
            "end_line": bounded_integer(1, "Ending line number (1-indexed)"),
            "start_col": bounded_integer(0, "Starting column (0-indexed)"),
            "end_col": bounded_integer(0, "Ending column (0-indexed)"),
        },
        "required": ["start_line", "end_line", "start_col", "end_col"],
    }

    # ---- $defs: Symbol (based on to_dict output) ----
    symbol_kinds = [
        "function", "class", "method", "constructor", "property",
        "interface", "type", "enum", "struct", "trait", "module",
        "route", "getter", "setter", "macro", "data", "instance",
        "contract", "event", "modifier", "library",
        "table", "view", "trigger", "index", "procedure",
        "resource", "variable", "output", "provider",
        "kernel", "device_function", "host_device_function",
        "entity", "architecture", "package", "component",
        "uniform", "input", "storage", "keyframes", "media", "font_face",
        "playbook", "task", "handler",
        "event_publisher", "event_subscriber",
        "ipc_send", "ipc_receive", "websocket_endpoint",
        "grpc_service", "grpc_servicer", "grpc_stub", "grpc_client", "grpc_server",
        "http_client", "graphql_client", "graphql_resolver",
        "mq_publisher", "mq_subscriber", "db_query",
    ]
    symbol_definition: Dict[str, Any] = {
        "type": "object",
        "description": "A code symbol (function, class, method, etc.)",
        "properties": {
            "id": described("string", "Unique identifier within analysis"),
            "name": described("string", "Symbol name"),
            "kind": {
                "type": "string",
                "description": "Symbol type",
                "enum": symbol_kinds,
            },
            "language": described("string", "Programming language"),
            "path": described("string", "File path"),
            "span": {"$ref": "#/$defs/Span"},
            "origin": described("string", "Analysis pass that created this symbol"),
            "origin_run_id": described("string", "Unique execution ID"),
            "origin_run_signature": nullable(
                "string", "Run signature for cache keying"
            ),
            "stable_id": nullable(
                "string", "Semantic identity hash (survives renames)"
            ),
            "shape_id": nullable("string", "Structural implementation fingerprint"),
            "canonical_name": nullable("string", "Fully qualified name"),
            "fingerprint": nullable("string", "Content hash of source"),
            "quality": nullable("object", "Quality assessment"),
            "meta": nullable("object", "Language-specific metadata"),
            "supply_chain": {
                "type": "object",
                "description": "Supply chain classification",
                "properties": {
                    "tier": {
                        "type": "integer",
                        "enum": [1, 2, 3, 4],
                        "description": "1=first_party, 2=internal_dep, 3=external_dep, 4=derived",
                    },
                    "tier_name": {
                        "type": "string",
                        "enum": ["first_party", "internal_dep", "external_dep", "derived"],
                    },
                    "reason": described("string", "Why this tier was assigned"),
                },
                "required": ["tier", "tier_name", "reason"],
            },
        },
        "required": ["id", "name", "kind", "language", "path", "span"],
    }

    # ---- $defs: Edge (based on to_dict output) ----
    edge_types = [
        "calls", "imports", "instantiates", "extends", "implements",
        "references", "depends_on", "links", "sources",
        "script_src", "base_image", "kernel_launch",
        "native_bridge", "message_send", "message_receive",
        "websocket_message", "websocket_connection",
        "grpc_calls", "http_calls", "graphql_calls",
        "message_queue", "query_references", "event_publishes",
        "resolver_implements", "resolver_for_type",
    ]
    edge_definition: Dict[str, Any] = {
        "type": "object",
        "description": "A relationship between two symbols",
        "properties": {
            "id": described("string", "Unique edge identifier"),
            "edge_key": nullable("string", "Canonical key for deduplication"),
            "src": described("string", "Source symbol ID"),
            "dst": described("string", "Destination symbol ID"),
            "type": {
                "type": "string",
                "description": "Relationship type",
                "enum": edge_types,
            },
            "line": bounded_integer(1, "Line number where relationship occurs"),
            "confidence": {
                "type": "number",
                "minimum": 0.0,
                "maximum": 1.0,
                "description": "Confidence score",
            },
            "origin": described("string", "Analysis pass that created this edge"),
            "origin_run_id": described("string", "Unique execution ID"),
            "origin_run_signature": nullable(
                "string", "Run signature for cache keying"
            ),
            "quality": nullable("object", "Quality assessment"),
            "meta": {
                "type": "object",
                "description": "Edge metadata including evidence",
                "properties": {
                    "evidence_type": string(),
                    "evidence_lang": string(),
                    "evidence_spans": {"type": "array"},
                },
            },
        },
        "required": ["id", "src", "dst", "type", "line", "confidence"],
    }

    # ---- $defs: AnalysisRun ----
    analysis_run_definition: Dict[str, Any] = {
        "type": "object",
        "description": "Provenance tracking for an analysis pass",
        "properties": {
            "execution_id": described("string", "Unique run identifier (uuid)"),
            "run_signature": described("string", "Deterministic config hash"),
            "repo_fingerprint": nullable(
                "string", "Git state hash for cache invalidation"
            ),
            "pass": described("string", "Analysis pass identifier"),
            "version": described("string", "Hypergumbo version"),
            "toolchain": {
                "type": "object",
                "properties": {
                    "name": string(),
                    "version": string(),
                },
            },
            "config_fingerprint": string(),
            "files_analyzed": integer(),
            "files_skipped": integer(),
            "skipped_passes": {"type": "array", "items": {"type": "object"}},
            "warnings": {"type": "array", "items": string()},
            "started_at": {"type": "string", "format": "date-time"},
            "duration_ms": integer(),
        },
        "required": ["execution_id", "pass", "version"],
    }

    # ---- Final document; key order here is the order of the serialized file ----
    return {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "$id": "https://codeberg.org/iterabloom/hypergumbo/docs/schema.json",
        "title": "Hypergumbo Behavior Map",
        "description": "Output schema for hypergumbo analysis. Auto-generated from Python dataclasses.",
        "type": "object",
        "properties": top_level_properties,
        "required": [
            "schema_version",
            "view",
            "generated_at",
            "nodes",
            "edges",
        ],
        "$defs": {
            "Span": span_definition,
            "Symbol": symbol_definition,
            "Edge": edge_definition,
            "AnalysisRun": analysis_run_definition,
        },
    }


def main() -> int:
    """Command-line entry point: write docs/schema.json or verify it.

    Without flags, the freshly generated schema is written to
    ``REPO_ROOT / "docs" / "schema.json"``. With ``--check``, nothing is
    written; the existing file is compared against the generated text.

    Returns:
        0 on success; 1 when ``--check`` finds the file missing or different.
    """
    cli = argparse.ArgumentParser(description="Generate JSON Schema")
    cli.add_argument("--check", action="store_true", help="Check if schema is up-to-date")
    options = cli.parse_args()

    schema_file = REPO_ROOT / "docs" / "schema.json"
    # The trailing newline is part of the expected file content.
    rendered = json.dumps(generate_behavior_map_schema(), indent=2) + "\n"

    if not options.check:
        schema_file.write_text(rendered, encoding="utf-8")
        print(f"Generated {schema_file}")
        return 0

    # --check mode: report the first problem found, otherwise confirm.
    if not schema_file.exists():
        failure = "ERROR: docs/schema.json does not exist"
    elif schema_file.read_text(encoding="utf-8") != rendered:
        failure = "ERROR: docs/schema.json is out of date"
    else:
        print("OK: docs/schema.json is up-to-date")
        return 0

    print(failure)
    print("Run: ./scripts/generate-schema")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
