#!/usr/bin/env python3
"""Move recovered session jsonls from ~/.claude-lost/ back into the real
profile that originally owned each session.

Owner attribution rule:
    For each recovered <UUID>.jsonl in ~/.claude-lost/projects/<encoded>/,
    the owner is whichever real profile (~/.claude-personal,
    ~/.claude-work, ~/.claude-the-third) currently holds a directory
    named <UUID>/ under projects/<some-encoded>/. The encoded path of
    that directory is expected to match the encoded path of the
    recovered file; on a mismatch a NOTE is printed and the on-disk
    directory's encoding is used. If multiple profiles host the dir,
    that is a fatal inconsistency and the file is not moved.

Three classifications applied to each recovered file:
    - MOVE:        UUID has <UUID>/ dir in exactly one real profile and
                   no <UUID>.jsonl file in any real profile. Move it.
    - SKIP_INBOTH: a <UUID>.jsonl ALREADY exists on disk in a real
                   profile. The recovered file is an older CoW snapshot
                   of a still-live file -- writing it would replace
                   live content with stale. Skip.
    - SKIP_ORPHAN: UUID has neither <UUID>/ dir nor <UUID>.jsonl in any
                   real profile. Likely from the wiped legacy ~/.claude/.
                   Skip; leave in ~/.claude-lost/.

Phases:
    1. PRE-AUDIT  -- hash every existing <UUID>.jsonl in real profiles;
                     validate per-line JSON; record state to manifest.
    2. PLAN       -- classify each recovered file; print summary.
    3. APPLY      -- (only with --apply) os.link + os.unlink for each
                     MOVE candidate; link() fails if the target exists,
                     so nothing is ever overwritten.
    4. POST-AUDIT -- re-hash everything; verify pre-existing files
                     unchanged; verify newly-placed files are JSON-valid
                     line-by-line; report mismatches as errors.

Read-only against ~/.claude*/ unless --apply is passed.

Usage:
    uv run scripts/restore-to-profiles            # dry-run, prints plan
    uv run scripts/restore-to-profiles --apply    # actually move files
"""

from __future__ import annotations

import argparse
import glob
import json
import os
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import NamedTuple

from claude_timeline.recovery import (
    discover_real_profiles,
    is_uuid_name,
    sha256_file,
)

# Base locations. Everything below is resolved once, at import time.
HOME = Path.home()
# Staging area that holds the recovered session files.
LOST = HOME / ".claude-lost"
# Candidate owner profiles, as reported by claude_timeline.recovery.
REAL_PROFILES = discover_real_profiles()
# Manifests live under the project's data/ dir (not /tmp/) so forensic
# provenance survives reboot per project CLAUDE.md.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
MANIFEST_DIR = PROJECT_ROOT / "data" / "restore-manifests"
# Real-profile files are mode 0600; we restore that mode on every moved
# file. A move is a hard link followed by an unlink, so the destination
# shares the source's inode and therefore its mode bits (0644 on the lost
# side) -- a privacy regression if not chmod'd.
TARGET_FILE_MODE = 0o600
# Pathological garbage in lost files could be a single multi-MB "line".
# Cap per-line read so validation can't OOM.
MAX_LINE_BYTES = 16 * 1024 * 1024  # 16 MiB


def validate_jsonl(p: Path) -> tuple[int, int]:
    """Returns (line_count, error_count). Counts lines that don't parse as
    JSON. Used to flag files whose shape is corrupt before / after moves.

    Line size is capped at MAX_LINE_BYTES; pathological no-newline blobs
    are flagged as errors rather than read into memory.
    """
    lines = 0
    errors = 0
    with p.open("rb") as f:
        while True:
            raw = f.readline(MAX_LINE_BYTES + 1)
            if not raw:
                break
            lines += 1
            if len(raw) > MAX_LINE_BYTES:
                errors += 1
                # Drain the rest of the over-long line without buffering it.
                while raw and not raw.endswith(b"\n"):
                    raw = f.read(MAX_LINE_BYTES)
                continue
            try:
                text = raw.decode("utf-8")
            except UnicodeDecodeError:
                errors += 1
                continue
            try:
                json.loads(text)
            except json.JSONDecodeError:
                errors += 1
    return lines, errors


# ─────────────────────────────────────────────────────────────────────────
# Audit
# ─────────────────────────────────────────────────────────────────────────


class Audit(NamedTuple):
    """One snapshot of the real profiles and of ~/.claude-lost.

    Every mapping is keyed by session UUID.
    """

    # When the snapshot was produced.
    timestamp: str
    # uuid -> {profile, encoded, path, size, mtime, sha256, lines, json_errors}
    real_files: dict[str, dict]
    # uuid -> {profile, encoded, path}
    real_dirs: dict[str, dict]
    # uuid -> {encoded, path, size, mtime, sha256, lines, json_errors}
    lost_files: dict[str, dict]


def audit_state() -> Audit:
    """Scan the real profiles and ~/.claude-lost and snapshot what is there.

    For each ``projects/<encoded>/`` directory of each real profile this
    records every ``<UUID>.jsonl`` file (size, mtime, sha256, line count,
    JSON error count) and every ``<UUID>/`` directory. The same per-file
    record is collected for ``~/.claude-lost/projects/<encoded>/``.

    The first occurrence of a UUID wins; a later occurrence only sets
    ``"DUPLICATE": True`` on the entry that is already recorded.

    Returns:
        An ``Audit`` stamped with the current UTC time in ISO format.
    """
    suffix = ".jsonl"

    def encoded_dirs(root: Path):
        # Yield the sub-directories of <root>/projects, or nothing at all
        # when that directory does not exist.
        projects = root / "projects"
        if not projects.is_dir():
            return
        for entry in projects.iterdir():
            if entry.is_dir():
                yield entry

    def describe(jsonl: Path) -> dict:
        # Per-file facts shared by the real-side and lost-side records
        # (stat first, then line validation, then hashing).
        st = jsonl.stat()
        line_count, bad_lines = validate_jsonl(jsonl)
        return {
            "path": str(jsonl),
            "size": st.st_size,
            "mtime": st.st_mtime,
            "sha256": sha256_file(jsonl),
            "lines": line_count,
            "json_errors": bad_lines,
        }

    real_files: dict[str, dict] = {}
    real_dirs: dict[str, dict] = {}
    for prof in REAL_PROFILES:
        if not prof.is_dir():
            continue
        for enc_dir in encoded_dirs(prof):
            for child in enc_dir.iterdir():
                name = child.name
                if child.is_file() and name.endswith(suffix):
                    uuid = name[:-len(suffix)]
                    if not is_uuid_name(uuid):
                        continue
                    seen = real_files.get(uuid)
                    if seen is not None:
                        # Same UUID jsonl found a second time -> alarm.
                        seen["DUPLICATE"] = True
                        continue
                    real_files[uuid] = {
                        "profile": prof.name,
                        "encoded": enc_dir.name,
                        **describe(child),
                    }
                    continue
                if child.is_dir() and is_uuid_name(name):
                    known = real_dirs.get(name)
                    if known is not None:
                        known["DUPLICATE"] = True
                        continue
                    real_dirs[name] = {
                        "profile": prof.name,
                        "encoded": enc_dir.name,
                        "path": str(child),
                    }

    lost_files: dict[str, dict] = {}
    if LOST.is_dir():
        for enc_dir in encoded_dirs(LOST):
            for child in enc_dir.iterdir():
                if not child.is_file() or not child.name.endswith(suffix):
                    continue
                uuid = child.name[:-len(suffix)]
                if not is_uuid_name(uuid):
                    continue
                seen = lost_files.get(uuid)
                if seen is not None:
                    seen["DUPLICATE"] = True
                    continue
                lost_files[uuid] = {"encoded": enc_dir.name, **describe(child)}

    stamp = datetime.now(tz=timezone.utc).isoformat()
    return Audit(
        timestamp=stamp,
        real_files=real_files,
        real_dirs=real_dirs,
        lost_files=lost_files,
    )


def write_manifest(audit: Audit, path: Path) -> None:
    """Serialise *audit* as indented JSON to *path*.

    Missing parent directories are created. Values the JSON encoder cannot
    handle natively are written as their ``str()`` form.
    """
    payload = json.dumps(audit._asdict(), indent=2, default=str)
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(payload)


# ─────────────────────────────────────────────────────────────────────────
# Plan
# ─────────────────────────────────────────────────────────────────────────


class Move(NamedTuple):
    """One planned relocation of a recovered session file."""

    # Session UUID (the file name without ".jsonl").
    uuid: str
    # Current location of the recovered file.
    src: Path
    # Where the file should end up inside the owning profile.
    dst: Path
    # Name of the profile that owns the session.
    profile: str


def classify(audit: Audit) -> tuple[list[Move], dict[str, list[str]]]:
    """Decide, for every recovered file, whether it is moved or skipped.

    Checks are applied in this order; the first one that matches wins:

    * ``in_both``        -- a ``<UUID>.jsonl`` already exists in a real
                            profile.
    * ``orphan``         -- no real profile has a ``<UUID>/`` directory.
    * ``duplicate_dir``  -- the ``<UUID>/`` directory was seen more than
                            once, so ownership is ambiguous.
    * ``duplicate_lost`` -- the UUID was recovered more than once under
                            ~/.claude-lost, so it is ambiguous which copy
                            is the right one; none is moved.

    Everything else becomes a ``Move`` whose destination sits next to the
    existing ``<UUID>/`` directory. A differing encoded path between the
    recovered file and that directory is reported on stderr but does not
    block the move.

    Args:
        audit: Snapshot to plan from.

    Returns:
        ``(moves, skipped)`` where ``skipped`` maps category -> [uuid...].
        The ``encoded_mismatch`` category is always present but stays
        empty, because encoded-path drift is tolerated rather than skipped.
    """
    moves: list[Move] = []
    skipped: dict[str, list[str]] = {
        "in_both": [],
        "orphan": [],
        "encoded_mismatch": [],
        "duplicate_dir": [],
        "duplicate_lost": [],
    }

    for uuid, info in audit.lost_files.items():
        if info.get("json_errors", 0):
            print(f"WARN: lost file {uuid} has {info['json_errors']} unparseable lines",
                  file=sys.stderr)
        if uuid in audit.real_files:
            skipped["in_both"].append(uuid)
            continue
        rd = audit.real_dirs.get(uuid)
        if rd is None:
            skipped["orphan"].append(uuid)
            continue
        if rd.get("DUPLICATE"):
            skipped["duplicate_dir"].append(uuid)
            continue
        if info.get("DUPLICATE"):
            # Moving the first-seen copy would leave the other copy behind
            # under the same UUID and pick a winner arbitrarily.
            print(f"WARN: {uuid} was recovered more than once under ~/.claude-lost; "
                  f"not moving any copy",
                  file=sys.stderr)
            skipped["duplicate_lost"].append(uuid)
            continue
        if rd["encoded"] != info["encoded"]:
            # Recovered cwd encoding differs from on-disk dir's encoding.
            # Trust the on-disk dir as the canonical home for this UUID,
            # but flag for visibility.
            print(f"NOTE: encoded path drift for {uuid}: lost={info['encoded']!r} "
                  f"real={rd['encoded']!r}; using real-dir's encoding",
                  file=sys.stderr)
        # Land the file next to its <UUID>/ dir. The parent is taken from
        # the audited path itself instead of being rebuilt from HOME and the
        # profile name, so it is right wherever the profile actually lives.
        target_dir = Path(rd["path"]).parent
        moves.append(Move(
            uuid=uuid,
            src=Path(info["path"]),
            dst=target_dir / f"{uuid}.jsonl",
            profile=rd["profile"],
        ))
    return moves, skipped


# ─────────────────────────────────────────────────────────────────────────
# Apply
# ─────────────────────────────────────────────────────────────────────────


def apply_moves(moves: list[Move]) -> tuple[list[Move], list[str]]:
    """Move each planned file into place without ever overwriting.

    Per move: chmod(src, 0600), os.link(src, dst), os.unlink(src).

    * link() raises FileExistsError if dst exists, which closes the TOCTOU
      window that os.rename() leaves open (rename silently overwrites).
    * The mode is fixed on the source *before* linking. src and dst are
      the same inode, so dst is never visible in the real profile with the
      lost-side 0644 mode, not even if the process dies between the steps.

    Args:
        moves: Planned moves, typically from ``classify``.

    Returns:
        ``(successful_moves, errors)``. A move is successful once the
        source has been unlinked. A failed chmod is reported in ``errors``
        but does not stop the move. A move whose link succeeded but whose
        unlink failed is reported in ``errors`` and left out of
        ``successful_moves``.
    """
    errors: list[str] = []
    successful: list[Move] = []
    for m in moves:
        try:
            m.dst.parent.mkdir(parents=True, exist_ok=True, mode=0o700)
        except OSError as e:
            errors.append(f"mkdir {m.dst.parent}: {e}")
            continue
        try:
            os.chmod(m.src, TARGET_FILE_MODE)
        except OSError as e:
            errors.append(f"chmod {m.src}: {e}")
            # Keep going: the file can still be put in place, just with
            # the wrong mode, which the user can fix manually.
        try:
            os.link(m.src, m.dst)
        except FileExistsError:
            errors.append(f"refuse to overwrite existing {m.dst}")
            continue
        except OSError as e:
            errors.append(f"link {m.src} -> {m.dst}: {e}")
            continue
        try:
            os.unlink(m.src)
        except OSError as e:
            errors.append(f"unlink {m.src}: {e}")
            # dst exists, src exists -- both link to same inode. The
            # subsequent re-run will see dst in real_files (in_both) and
            # skip; user must manually delete the lost-side leftover.
            continue
        successful.append(m)
    # One os.sync() after the loop (not a per-file fsync) to flush the
    # directory and inode updates; best-effort only.
    try:
        os.sync()
    except OSError:
        pass
    return successful, errors


# ─────────────────────────────────────────────────────────────────────────
# Verify
# ─────────────────────────────────────────────────────────────────────────


def verify(pre: Audit, post: Audit, successful_moves: list[Move]) -> list[str]:
    """Compare the pre- and post-move snapshots and list every discrepancy.

    Only moves that ``apply_moves`` reported as successful are examined;
    failed moves leave their source intact and are out of scope here.

    Args:
        pre: Snapshot taken before any file was moved.
        post: Snapshot taken after the moves.
        successful_moves: Moves that completed.

    Returns:
        Human-readable integrity errors; an empty list means everything
        checked out.
    """
    problems: list[str] = []

    # 1. Pre-existing real files must still be there, byte for byte.
    for uuid, before in pre.real_files.items():
        after = post.real_files.get(uuid)
        if after is None:
            problems.append(f"pre-existing file {before['path']} disappeared post")
        elif after["sha256"] != before["sha256"]:
            problems.append(f"pre-existing file {before['path']} sha256 changed")

    # 2. Each moved file must exist at its target with the content of its
    #    lost-side source, parse as JSON, and carry TARGET_FILE_MODE.
    for m in successful_moves:
        placed = post.real_files.get(m.uuid)
        if placed is None:
            problems.append(f"moved file {m.dst} missing post-audit")
            continue
        source = pre.lost_files.get(m.uuid)
        if source and placed["sha256"] != source["sha256"]:
            problems.append(f"moved file {m.dst} content changed during move")
        if placed["json_errors"]:
            problems.append(f"moved file {m.dst} has {placed['json_errors']} JSON errors")
        try:
            mode = m.dst.stat().st_mode & 0o777
        except OSError as e:
            problems.append(f"stat {m.dst}: {e}")
        else:
            if mode != TARGET_FILE_MODE:
                problems.append(
                    f"moved file {m.dst} mode is {oct(mode)}, "
                    f"expected {oct(TARGET_FILE_MODE)}"
                )

    # 3. The lost-side source of every moved file must be gone.
    problems.extend(
        f"source for moved {m.uuid} still in ~/.claude-lost"
        for m in successful_moves
        if m.uuid in post.lost_files
    )

    # 4. The only new UUIDs in the real profiles are the moved ones.
    moved_uuids = {m.uuid for m in successful_moves}
    unexpected = post.real_files.keys() - pre.real_files.keys() - moved_uuids
    if unexpected:
        problems.append(f"unexpected new UUIDs in real profiles: {sorted(unexpected)[:10]}")

    # 5. The real-profile jsonl count grows by exactly the number of moves.
    n_pre, n_moved, n_post = len(pre.real_files), len(successful_moves), len(post.real_files)
    if n_pre + n_moved != n_post:
        problems.append(
            f"real-profile <UUID>.jsonl count delta wrong: "
            f"pre={n_pre} + moved={n_moved} "
            f"!= post={n_post}"
        )

    # 6. <UUID>/ directories are untouched.
    for uuid, before in pre.real_dirs.items():
        after = post.real_dirs.get(uuid)
        if after is None:
            problems.append(f"pre-existing dir {before['path']} disappeared post")
        elif after["path"] != before["path"]:
            problems.append(f"dir {uuid} moved unexpectedly")

    return problems


# ─────────────────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────────────────


def summarize_plan(audit: Audit, moves: list[Move], skipped: dict[str, list[str]]) -> None:
    """Print the plan summary to stdout.

    Shows how many files were found, how many will be moved (broken down
    per target profile, busiest first) and how many are skipped per
    category. The "UUID dir in multiple profiles" row only appears when
    that category is non-empty.
    """
    per_profile = Counter(m.profile for m in moves)

    report = [
        "",
        "=== plan summary ===",
        f"recovered files in ~/.claude-lost/:        {len(audit.lost_files):>5}",
        f"existing top-level <UUID>.jsonl in real:   {len(audit.real_files):>5}",
        f"existing <UUID>/ dirs in real:             {len(audit.real_dirs):>5}",
        "",
        f"will MOVE:                                 {len(moves):>5}",
    ]
    report.extend(f"    -> {prof:<22} {n:>5}" for prof, n in per_profile.most_common())
    report.append(f"will SKIP (in_both, older snapshot):       {len(skipped['in_both']):>5}")
    report.append(f"will SKIP (orphan, no owner profile):      {len(skipped['orphan']):>5}")
    if skipped["duplicate_dir"]:
        report.append(f"will SKIP (UUID dir in multiple profiles): {len(skipped['duplicate_dir']):>5}")
    # The trailing empty entry reproduces the closing blank line.
    report.append("")
    print("\n".join(report))


def main() -> int:
    """Run the restore phases and return the process exit status.

    Phases: PRE-AUDIT and PLAN always run; APPLY and POST-AUDIT run only
    with ``--apply`` (after an interactive confirmation unless ``--yes``
    is given).

    Returns:
        0 on a clean run, on a dry-run, and when the confirmation is
        declined; 1 if no real profiles were discovered or any move failed
        during APPLY; 2 if post-audit verification reported integrity
        errors (this takes precedence over 1).
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring is pre-formatted (indented lists, usage
        # block); the default formatter would re-wrap it into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--apply", action="store_true",
                    help="Actually move files (default is dry-run)")
    ap.add_argument("--yes", action="store_true",
                    help="Skip the interactive confirmation prompt under --apply")
    ap.add_argument("--manifest-dir", type=Path, default=MANIFEST_DIR,
                    help=f"Where to write pre/post manifests (default: {MANIFEST_DIR})")
    args = ap.parse_args()

    # Profile paths are now derived from discover_real_profiles() so the
    # old `HOME == /home/m` guard is no longer needed. Refuse to run if no
    # real profiles were found at all (would otherwise silently no-op).
    if not REAL_PROFILES:
        print("ERROR: discover_real_profiles() found no profiles to restore into.",
              file=sys.stderr)
        return 1

    args.manifest_dir.mkdir(parents=True, exist_ok=True)
    # Microsecond resolution prevents timestamp collision on rapid re-runs.
    ts = datetime.now(tz=timezone.utc).strftime("%Y%m%dT%H%M%S_%fZ")

    def record(audit: Audit, phase: str) -> Path:
        # Write <phase>-<ts>.json and repoint the stable <phase>-latest.json
        # symlink at it for quick inspection.
        manifest = args.manifest_dir / f"{phase}-{ts}.json"
        write_manifest(audit, manifest)
        latest = args.manifest_dir / f"{phase}-latest.json"
        latest.unlink(missing_ok=True)
        latest.symlink_to(manifest.name)
        return manifest

    print("[1/4] PRE-AUDIT  (this can take ~30s; hashing every <UUID>.jsonl)")
    pre = audit_state()
    pre_path = record(pre, "pre")
    print(f"      manifest: {pre_path}")
    pre_errors = sum(v.get("json_errors", 0) for v in pre.real_files.values())
    if pre_errors:
        print(f"      WARN: pre-existing real files have {pre_errors} unparseable lines total")

    print("\n[2/4] PLAN")
    moves, skipped = classify(pre)
    summarize_plan(pre, moves, skipped)

    if not args.apply:
        print("Dry-run: no changes made. Re-run with --apply to execute.")
        if moves:
            print("\nFirst 5 planned moves:")
            for m in moves[:5]:
                print(f"  {m.src.name}  ->  {m.dst}")
        return 0

    # Confirmation gate (skippable with --yes for scripted runs).
    if not args.yes:
        print("This will move files into live ~/.claude-* profile dirs.")
        print("Recommended pre-flight: take a btrfs snapshot first, e.g.")
        print("  sudo btrfs sub snap -r ~/.claude-personal ~/.claude-personal.pre-restore")
        print("  sudo btrfs sub snap -r ~/.claude-work     ~/.claude-work.pre-restore")
        print()
        try:
            answer = input(f"Type 'yes' to move {len(moves)} files: ").strip().lower()
        except EOFError:
            # No stdin (e.g. piped run without --yes) counts as "no".
            answer = ""
        if answer != "yes":
            print("Aborted (no files moved).")
            return 0

    print(f"\n[3/4] APPLY  ({len(moves)} atomic link+unlink calls)")
    successful, apply_errs = apply_moves(moves)
    print(f"      moved: {len(successful)}/{len(moves)}")
    for e in apply_errs[:20]:
        print(f"      ERROR: {e}")
    if len(apply_errs) > 20:
        print(f"      ... and {len(apply_errs) - 20} more errors")

    print("\n[4/4] POST-AUDIT")
    post = audit_state()
    post_path = record(post, "post")
    print(f"      manifest: {post_path}")

    integrity_errs = verify(pre, post, successful)
    if integrity_errs:
        print(f"\nINTEGRITY ERRORS ({len(integrity_errs)}):")
        for e in integrity_errs[:30]:
            print(f"  - {e}")
        if len(integrity_errs) > 30:
            print(f"  ... and {len(integrity_errs) - 30} more")
        return 2

    print("\nALL CHECKS PASSED")
    print(f"  pre-existing files unchanged: {len(pre.real_files)}")
    print(f"  newly-placed files JSON-valid + 0600: {len(successful)}")
    print(f"  ~/.claude-lost/ now contains: {len(post.lost_files)} files "
          f"({len(skipped['orphan'])} orphans + {len(skipped['in_both'])} in-both)")

    # Loud post-flight reminder while the staging dir is still around.
    if LOST.exists():
        print()
        print("=" * 70)
        print("IMPORTANT: ~/.claude-lost/ still exists and matches discover_profiles()")
        print("           glob ~/.claude*. Running build-db NOW will index its")
        print(f"           remaining {len(post.lost_files)} files as a phantom")
        print("           account=.claude-lost in the dashboard.")
        print()
        print("           To handle the residue, pick one:")
        print("             A) Rename so it's no longer a profile:")
        print("                mv ~/.claude-lost ~/claude-lost-residue")
        print("             B) Keep it as a deliberate fourth account for the")
        print("                orphans + in_both diagnostics.")
        print("             C) Delete entirely:")
        print("                rm -rf ~/.claude-lost   # permanent")
        print("=" * 70)

    print("\nNext: handle ~/.claude-lost/ per above, then `uv run scripts/build-db`")
    # Integrity passed; a non-zero status here only reflects failed moves.
    return 0 if not apply_errs else 1


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
