#!/usr/bin/env python3
"""Verify which ~/.claude-lost/projects/*/<UUID>.jsonl files are safe to delete.

Classifies each recovered jsonl by comparing it to any matching <UUID>.jsonl
in the real profiles (~/.claude-personal, ~/.claude-work, ~/.claude-the-third):

  - IDENTICAL  : same SHA-256, real file fully covers recovered. Safe to delete.
  - SUBSET     : every record in recovered is also present (matched by a
                 stable record key) in a matching real file -- e.g. an older
                 CoW snapshot from earlier in the session. Real has all the
                 data, possibly plus more turns. Safe to delete.
  - DIVERGENT  : recovered has at least one record that no matching real file
                 has. UNSAFE to delete -- might be unique data.
  - ORPHAN     : no matching <UUID>.jsonl in any real profile. Untouchable
                 here; these are the 13 origin-uncertain sessions.

Outputs a JSON manifest at data/verify-manifests/verify-<ts>.json + a human
summary on stdout. Does NOT delete anything by default. Pass --apply to delete
the IDENTICAL + SUBSET files (DIVERGENT/ORPHAN always preserved).
"""
from __future__ import annotations

import argparse
import datetime as _dt
import json
import sys
from pathlib import Path

from claude_timeline.recovery import (
    discover_real_profiles,
    record_key,
    sha256_file,
)

# Root that is scanned for recovered session files (<project-dir>/<UUID>.jsonl).
LOST_ROOT = Path.home().joinpath(".claude-lost", "projects")
# Profile directories searched for a matching <UUID>.jsonl; each is expected
# to hold a "projects" subdirectory.
REAL_PROFILES = discover_real_profiles()
# Two levels above this script's resolved location.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Directory that receives the JSON manifest written on every run.
MANIFEST_DIR = PROJECT_ROOT.joinpath("data", "verify-manifests")


def read_lines(p: Path) -> list[bytes]:
    """Return the raw bytes of *p* split into lines, line endings preserved."""
    return p.read_bytes().splitlines(keepends=True)


def parse_records(lines: list[bytes]) -> tuple[list[dict], int]:
    """Decode JSONL lines into Python values.

    Blank (whitespace-only) lines are ignored. A line that cannot be decoded
    or parsed as JSON is counted rather than raised.

    Returns:
        A ``(records, num_unparseable)`` pair: the parsed values in input
        order, and the number of non-blank lines that failed to parse.
    """
    parsed: list[dict] = []
    unparseable = 0
    for raw in lines:
        payload = raw.strip()
        if payload:
            try:
                value = json.loads(payload)
            except (UnicodeDecodeError, json.JSONDecodeError):
                unparseable += 1
            else:
                parsed.append(value)
    return parsed, unparseable


def find_uuid_in_real(uuid: str) -> list[Path]:
    """Collect every ``<uuid>.jsonl`` file found in the real profiles.

    Each profile in ``REAL_PROFILES`` is searched one level below its
    ``projects`` directory (i.e. ``projects/<encoded-project>/<uuid>.jsonl``).
    Profiles without a ``projects`` directory are skipped.
    """
    pattern = f"*/{uuid}.jsonl"
    found: list[Path] = []
    for profile in REAL_PROFILES:
        projects = profile / "projects"
        if projects.is_dir():
            # Keep regular files only; the glob may also match other entries.
            found.extend(hit for hit in projects.glob(pattern) if hit.is_file())
    return found


def _index_lines(lines: list[bytes]) -> tuple[int, set, list[bytes]]:
    """Index JSONL lines as (record_count, keys, unkeyed_lines).

    ``keys`` holds every non-None ``record_key`` of a parsed record.
    ``unkeyed_lines`` holds the stripped bytes of each non-blank line that
    either failed to parse or parsed to a record without a key -- i.e. content
    that a key-based comparison cannot see.
    """
    record_count = 0
    keys: set = set()
    unkeyed: list[bytes] = []
    for line in lines:
        # Reuse parse_records one line at a time so the parsing rules stay in
        # a single place; it skips blank lines on its own.
        records, bad = parse_records([line])
        record_count += len(records)
        line_keys = [k for r in records if (k := record_key(r)) is not None]
        keys.update(line_keys)
        if bad or len(line_keys) < len(records):
            unkeyed.append(line.strip())
    return record_count, keys, unkeyed


def _uncovered_lines(unkeyed: list[bytes], real_lines: list[bytes]) -> list[bytes]:
    """Return the unkeyed recovered lines whose bytes the real file lacks.

    A line counts as covered when some real line starts with it: an exact
    match, or a truncated line (e.g. a snapshot cut mid-write) whose bytes are
    all contained in the longer real line.
    """
    if not unkeyed:
        return []
    real_stripped = [ln.strip() for ln in real_lines]
    real_exact = set(real_stripped)  # fast path for exact matches
    return [
        ln
        for ln in unkeyed
        if ln not in real_exact
        and not any(r.startswith(ln) for r in real_stripped)
    ]


def classify(rec: Path) -> dict:
    """Classify one recovered jsonl against the matching real-profile files.

    The recovered file's stem is used as the session UUID; it is compared with
    every real-profile ``<UUID>.jsonl`` returned by ``find_uuid_in_real``.

    Classes:
      IDENTICAL : exact sha256 match against a real file. Safe to delete.
      SUBSET    : every record in recovered is present in the real match
                  (by stable record key), and every recovered line that has
                  no key (unparseable or keyless) is byte-covered by a line
                  of the real file. Real has >= as much data; safe to delete.
      DIVERGENT : recovered has at least one record key, or one unkeyed line,
                  not present in any real match. Could be unique data.
                  UNSAFE to delete.
      ORPHAN    : no real-profile <UUID>.jsonl exists. Untouchable.

    Returns:
        A JSON-serializable dict with at least ``uuid``, ``recovered``,
        ``recovered_sha``, ``recovered_lines``, ``recovered_size``, ``class``,
        ``real_match`` and ``reason``. Non-ORPHAN results also carry
        ``recovered_records`` and the ``real_*`` fields of the file compared
        against. With several real matches, the first IDENTICAL or SUBSET
        match wins; otherwise the first DIVERGENT comparison is returned.
    """
    uuid = rec.stem
    rec_lines = read_lines(rec)
    rec_sha = sha256_file(rec)
    rec_size = rec.stat().st_size

    real_matches = find_uuid_in_real(uuid)
    if not real_matches:
        return {
            "uuid": uuid,
            "recovered": str(rec),
            "recovered_sha": rec_sha,
            "recovered_lines": len(rec_lines),
            "recovered_size": rec_size,
            "class": "ORPHAN",
            "real_match": None,
            "reason": "no <UUID>.jsonl in any real profile",
        }

    rec_count, rec_keys, rec_unkeyed = _index_lines(rec_lines)

    best = None
    for real in real_matches:
        real_lines = read_lines(real)
        real_sha = sha256_file(real)
        real_size = real.stat().st_size

        if rec_sha == real_sha:
            return {
                "uuid": uuid,
                "recovered": str(rec),
                "recovered_sha": rec_sha,
                "recovered_lines": len(rec_lines),
                "recovered_records": rec_count,
                "recovered_size": rec_size,
                "class": "IDENTICAL",
                "real_match": str(real),
                "real_sha": real_sha,
                "real_lines": len(real_lines),
                "real_size": real_size,
                "reason": "exact sha256 match",
            }

        real_count, real_keys, _ = _index_lines(real_lines)

        missing = rec_keys - real_keys
        # Lines the key comparison cannot see must still exist in real;
        # otherwise deleting the recovered file could lose unique bytes.
        uncovered = _uncovered_lines(rec_unkeyed, real_lines)

        if missing or uncovered:
            cand_class = "DIVERGENT"
            parts = []
            if missing:
                sample = sorted(missing)[:3]
                parts.append(
                    f"{len(missing)} of {len(rec_keys)} recovered record(s) absent "
                    f"from real (sample: {sample})"
                )
            if uncovered:
                parts.append(
                    f"{len(uncovered)} unparseable/keyless recovered line(s) "
                    f"not found in real"
                )
            reason = "; ".join(parts)
        elif not rec_keys and real_keys:
            # Degenerate case: nothing keyed in recovered, and anything else
            # it holds is already covered by real.
            cand_class = "SUBSET"
            reason = "recovered has no parseable records; real has data"
        else:
            cand_class = "SUBSET"
            reason = (
                f"{len(rec_keys)} record key(s) all present in real "
                f"({len(real_keys)} total in real)"
            )

        cand = {
            "uuid": uuid,
            "recovered": str(rec),
            "recovered_sha": rec_sha,
            "recovered_lines": len(rec_lines),
            "recovered_records": rec_count,
            "recovered_size": rec_size,
            "class": cand_class,
            "real_match": str(real),
            "real_sha": real_sha,
            "real_lines": len(real_lines),
            "real_records": real_count,
            "real_size": real_size,
            "reason": reason,
            "missing_keys": sorted(missing)[:10] if missing else [],
            "recovered_unkeyed_lines": len(rec_unkeyed),
            "uncovered_lines": len(uncovered),
        }
        # Prefer SUBSET; only fall back to DIVERGENT if no real match yields SUBSET.
        if cand_class == "SUBSET":
            return cand
        if best is None:
            best = cand

    # real_matches is non-empty here, so the loop set ``best`` or returned.
    return best  # type: ignore[return-value]


def main() -> int:
    """Classify every recovered jsonl, write a manifest, optionally delete.

    Without ``--apply`` this is a dry run: it prints the classification
    summary and writes the manifest only. With ``--apply`` it additionally
    deletes the files classified IDENTICAL or SUBSET and removes any
    now-empty per-project directories under ``LOST_ROOT``.

    Returns:
        Process exit code: 0 on success (including dry runs and "nothing to
        delete"), 1 if ``LOST_ROOT`` does not exist, 2 if at least one
        deletion failed.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is a pre-formatted list; keep its layout
        # instead of letting argparse reflow it into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument(
        "--apply",
        action="store_true",
        help="Actually delete IDENTICAL + SUBSET files. Without this flag, only reports.",
    )
    args = ap.parse_args()

    if not LOST_ROOT.is_dir():
        print(f"ERROR: {LOST_ROOT} does not exist", file=sys.stderr)
        return 1

    # Regular files only: a directory that happens to be named *.jsonl would
    # otherwise make classify() fail while reading it.
    rec_files = sorted(p for p in LOST_ROOT.glob("*/*.jsonl") if p.is_file())
    print(f"[verify] {len(rec_files)} recovered jsonl files under {LOST_ROOT}", file=sys.stderr)

    results = [classify(p) for p in rec_files]

    counts: dict[str, int] = {}
    for r in results:
        counts[r["class"]] = counts.get(r["class"], 0) + 1

    print()
    print("Classification:")
    for k in ("IDENTICAL", "SUBSET", "DIVERGENT", "ORPHAN"):
        print(f"  {k:<10} {counts.get(k, 0):>4}")
    print(f"  {'TOTAL':<10} {len(results):>4}")
    print()

    safe = [r for r in results if r["class"] in ("IDENTICAL", "SUBSET")]
    unsafe = [r for r in results if r["class"] == "DIVERGENT"]
    orphans = [r for r in results if r["class"] == "ORPHAN"]

    if unsafe:
        print(f"UNSAFE TO DELETE ({len(unsafe)} divergent files):")
        for r in unsafe[:10]:
            print(f"  {r['uuid']}: rec={r['recovered_lines']}L "
                  f"vs real={r['real_lines']}L  reason: {r['reason']}")
        if len(unsafe) > 10:
            print(f"  ... and {len(unsafe) - 10} more")
        print()

    if orphans:
        print(f"ORPHANS preserved ({len(orphans)}):")
        for r in orphans[:5]:
            print(f"  {r['uuid']}  ({r['recovered_size']} bytes, {r['recovered_lines']} lines)")
        if len(orphans) > 5:
            print(f"  ... and {len(orphans) - 5} more")
        print()

    # Write the manifest before any deletion so there is always a record of
    # what was classified, even if the apply step fails part-way.
    MANIFEST_DIR.mkdir(parents=True, exist_ok=True)
    ts = _dt.datetime.now(_dt.timezone.utc).strftime("%Y%m%dT%H%M%S_%fZ")
    manifest_path = MANIFEST_DIR / f"verify-{ts}.json"
    with manifest_path.open("w") as f:
        json.dump(
            {
                "generated_at": ts,
                "lost_root": str(LOST_ROOT),
                "real_profiles": [str(p) for p in REAL_PROFILES],
                "counts": counts,
                "results": results,
            },
            f,
            indent=2,
            default=str,
        )
    print(f"Manifest: {manifest_path}", file=sys.stderr)

    if not args.apply:
        print()
        print(f"Dry run complete. {len(safe)} files would be deleted (IDENTICAL + SUBSET).")
        print("Re-run with --apply to actually delete.")
        return 0

    # Apply: delete IDENTICAL + SUBSET
    if not safe:
        print("Nothing safe to delete.")
        return 0

    print()
    print(f"Deleting {len(safe)} verified-duplicate files...")
    deleted = 0
    failed = 0
    for r in safe:
        p = Path(r["recovered"])
        try:
            p.unlink()
            deleted += 1
        except OSError as e:
            print(f"  FAIL: {p}: {e}", file=sys.stderr)
            failed += 1

    # Clean up now-empty parent dirs (per-project subdirs under projects/)
    pruned_dirs = 0
    for sub in sorted(LOST_ROOT.iterdir(), reverse=True):
        if sub.is_dir() and not any(sub.iterdir()):
            try:
                sub.rmdir()
                pruned_dirs += 1
            except OSError:
                # Best effort: a directory that cannot be removed is left alone.
                pass

    print(f"Deleted {deleted} files, {failed} failures, pruned {pruned_dirs} empty dirs.")
    return 0 if failed == 0 else 2


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
