#!/usr/bin/env python3
"""Restore forensically-recovered Claude session data into ~/.claude-lost/.

Layout written:
    ~/.claude-lost/
        README.md                        explanation of what's here
        history.jsonl                    user-typed messages synthesized
                                         from the per-session jsonls so the
                                         pipeline's messages-table ingestion
                                         picks them up
        projects/<encoded-cwd>/<UUID>.jsonl
                                         per-session top-level main + sidechain
                                         turns (the file Claude Code itself
                                         used to write before deleting it)

The pipeline auto-discovers any ~/.claude* dir that has projects/ or
history.jsonl, so a subsequent `uv run scripts/build-db` will ingest this
profile naturally and the data shows up under account=/home/m/.claude-lost
in the dashboard, fully searchable in the Finder.

Read-only against /var/tmp/btrfs-recover/. Writes only under ~/.claude-lost/.
"""

from __future__ import annotations

import argparse
import glob
import json
import os
import shutil
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

from claude_timeline.recovery import encode_cwd, first_line_meta


# Home directory of the invoking user; the default destination lives under it.
HOME = Path.home()
# Default output root for the restored profile (overridable via --dest).
DEST_ROOT = HOME / ".claude-lost"
# Glob pattern for candidate processed/ directories; newest_processed_dir()
# chooses among whatever this matches.
DEFAULT_PROCESSED = "/var/tmp/btrfs-recover/carved-*/processed"


def newest_processed_dir() -> Path | None:
    """Return the lexicographically last path matching DEFAULT_PROCESSED.

    "Newest" here means "sorts last by name"; no modification times are
    consulted (presumably the carved-* directory names embed a sortable
    timestamp -- confirm against scripts/recover-btrfs).

    Returns:
        The chosen path, or ``None`` when the glob matches nothing.
    """
    candidates = glob.glob(DEFAULT_PROCESSED)
    if not candidates:
        return None
    # max() over strings picks the same element as sorted(...)[-1].
    return Path(max(candidates))


def parse_iso_ts(t: Any) -> datetime | None:
    """Parse a timestamp that may be an ISO-8601 string or an epoch-ms int.

    Args:
        t: Raw ``timestamp`` value taken from a JSON line.

    Returns:
        A timezone-aware ``datetime``, or ``None`` when *t* is missing, of an
        unsupported type, or unparseable. ISO strings that carry no UTC
        offset are assumed to be UTC so that every result can be compared
        with every other (mixing naive and aware datetimes raises TypeError).
    """
    # bool is a subclass of int; True/False are never real timestamps.
    if isinstance(t, bool):
        return None
    if isinstance(t, int):
        try:
            return datetime.fromtimestamp(t / 1000, tz=timezone.utc)
        except (OverflowError, OSError, ValueError):
            # Out-of-range epoch values: treat as "no timestamp".
            return None
    if isinstance(t, str):
        try:
            # fromisoformat() only accepts a trailing "Z" from Python 3.11 on.
            parsed = datetime.fromisoformat(t.strip().replace("Z", "+00:00"))
        except ValueError:
            return None
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed
    return None


def find_uuid_to_profile() -> dict[str, str]:
    """Map each UUID-named session subdirectory currently on disk to its
    profile directory name (e.g. '.claude-personal').

    Scans ``<home>/.claude*/projects/<encoded-cwd>/<UUID>/`` under the
    invoking user's home directory. Used to attribute recovered sessions
    back to a profile.

    Returns:
        ``{uuid: profile_dir_name}``. If the same UUID directory exists under
        several profiles, whichever the glob yields last wins.
    """
    # Resolve the home directory at call time and escape it so any glob
    # metacharacters in the path are taken literally.
    pattern = os.path.join(
        glob.escape(str(Path.home())), ".claude*", "projects", "*", "*"
    )
    out: dict[str, str] = {}
    for d in glob.glob(pattern):
        base = os.path.basename(d)
        # Cheap UUID shape check: 36 characters with exactly four dashes.
        if len(base) == 36 and base.count("-") == 4 and os.path.isdir(d):
            # <home>/<profile>/projects/<encoded>/<UUID>: the profile is
            # three levels above the UUID directory, whatever the home depth.
            out[base] = Path(d).parents[2].name
    return out


def _aware_ts(raw: Any) -> datetime | None:
    """Parse *raw* via parse_iso_ts and force the result to be tz-aware.

    Offset-less values are treated as UTC so all timestamps are comparable.
    """
    ts = parse_iso_ts(raw)
    if ts is not None and ts.tzinfo is None:
        ts = ts.replace(tzinfo=timezone.utc)
    return ts


def _epoch_ms(ts: datetime) -> int:
    """Convert an aware datetime to integer epoch milliseconds.

    Whole seconds and the millisecond part are combined as integers so
    float rounding cannot drop a millisecond.
    """
    whole_seconds = int(ts.replace(microsecond=0).timestamp())
    return whole_seconds * 1000 + ts.microsecond // 1000


def _iter_json_objects(path: Path):
    """Yield every line of *path* that parses as a JSON object (dict).

    Unparseable lines and non-object JSON values are skipped. The file is
    closed once iteration finishes.
    """
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            if isinstance(obj, dict):
                yield obj


def _user_text(turn: dict[str, Any]) -> str:
    """Extract the typed text from a user turn.

    ``message.content`` is either a plain string or a list of content
    blocks; for a list, the non-empty ``text`` blocks are joined with
    newlines. Anything else yields an empty string.
    """
    msg = turn.get("message")
    if not isinstance(msg, dict):
        return ""
    content = msg.get("content")
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        return "\n".join(
            block["text"]
            for block in content
            if isinstance(block, dict)
            and block.get("type") == "text"
            and isinstance(block.get("text"), str)
            and block["text"]
        )
    return ""


def restore(
    processed_dir: Path,
    dest: Path = DEST_ROOT,
    overwrite: bool = False,
) -> dict:
    """Copy recovered sessions into *dest* and synthesize history.jsonl.

    Writes ``projects/<encoded-cwd>/<UUID>.jsonl`` for every
    ``sessions/*.jsonl`` in *processed_dir*, a ``history.jsonl`` built from
    ``recovered-history.jsonl`` (if present) plus the main-thread user turns
    of each session, and a ``README.md`` summarising the run.

    Args:
        processed_dir: Directory containing ``sessions/`` and optionally
            ``recovered-history.jsonl``.
        dest: Output root. Deleted and recreated when it already exists and
            *overwrite* is true.
        overwrite: Allow replacing an existing *dest*.

    Returns:
        A stats dict (session/turn/history counts, per-profile counts,
        number of distinct cwds, ISO min/max timestamps).

    Raises:
        SystemExit: *dest* exists without *overwrite*, *processed_dir* has no
            ``sessions/``, or replacing *dest* would delete the source.
    """
    sessions_dir = processed_dir / "sessions"
    history_in = processed_dir / "recovered-history.jsonl"

    if dest.exists() and not overwrite:
        raise SystemExit(
            f"{dest} already exists. Pass --overwrite to replace it, or "
            f"`mv` it first if you want to preserve the prior state."
        )
    # Validate the source BEFORE touching dest, so a bad source path never
    # destroys a previous restore.
    if not sessions_dir.is_dir():
        raise SystemExit(f"no sessions/ in {processed_dir}")
    if dest.exists():
        dest_real = dest.resolve()
        src_real = sessions_dir.resolve()
        if dest_real == src_real or dest_real in src_real.parents:
            raise SystemExit(
                f"refusing to replace {dest}: it contains the source {processed_dir}"
            )
        shutil.rmtree(dest)

    dest.mkdir(parents=True, exist_ok=True)
    (dest / "projects").mkdir(exist_ok=True)

    uuid_to_profile = find_uuid_to_profile()
    session_files = [
        p for p in sorted(sessions_dir.iterdir()) if p.name.endswith(".jsonl")
    ]

    # ── Per-session restoration ───────────────────────────────────────────
    profile_counts: Counter[str] = Counter()
    project_counts: Counter[str] = Counter()
    ts_min: datetime | None = None
    ts_max: datetime | None = None
    ts_per_profile: defaultdict[str, list[datetime]] = defaultdict(list)
    sidechain_count = 0
    main_count = 0
    sessions_restored = 0
    sessions_unattributed: list[str] = []
    # (source file, cwd) pairs, reused by the history pass below so each
    # file's metadata is only looked up once.
    session_cwds: list[tuple[Path, str]] = []

    for src in session_files:
        session_id = src.stem
        # Without a cwd we can't place it in a project dir; park under
        # _unknown_cwd so it's still ingestible.
        cwd = first_line_meta(src).get("cwd") or "/_unknown_cwd"
        session_cwds.append((src, cwd))

        prof = uuid_to_profile.get(session_id, "<unattributed>")
        profile_counts[prof] += 1
        project_counts[cwd] += 1
        if prof == "<unattributed>":
            sessions_unattributed.append(session_id)

        # Walk lines for stats AND copy to destination.
        proj_dir = dest / "projects" / encode_cwd(cwd)
        proj_dir.mkdir(parents=True, exist_ok=True)
        out_path = proj_dir / f"{session_id}.jsonl"

        with src.open(encoding="utf-8") as fin, out_path.open(
            "w", encoding="utf-8"
        ) as fout:
            for line in fin:
                # Every line is copied, parseable or not; only the stats
                # below require valid JSON objects.
                fout.write(line if line.endswith("\n") else line + "\n")
                try:
                    d = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(d, dict):
                    continue
                if d.get("isSidechain") is True:
                    sidechain_count += 1
                elif d.get("isSidechain") is False:
                    main_count += 1
                t = _aware_ts(d.get("timestamp"))
                if t is None:
                    continue
                if ts_min is None or t < ts_min:
                    ts_min = t
                if ts_max is None or t > ts_max:
                    ts_max = t
                ts_per_profile[prof].append(t)
        sessions_restored += 1

    # ── Synthesize history.jsonl from user-typed turns ────────────────────
    # Format mirrors what Claude Code writes natively to ~/.claude*/history.jsonl
    # so the existing messages-table ingestion picks it up untouched.
    hist_out = dest / "history.jsonl"
    # Dedup key: (sessionId, timestamp as written to history, first 80 chars).
    written_history: set[tuple[str, str, str]] = set()
    history_user_count = 0
    with hist_out.open("w", encoding="utf-8") as fh:
        # First, include the already-classified history-shape lines if present.
        if history_in.is_file():
            for d in _iter_json_objects(history_in):
                key = (
                    str(d.get("sessionId", "")),
                    str(d.get("timestamp", "")),
                    str(d.get("display", ""))[:80],
                )
                if key in written_history:
                    continue
                written_history.add(key)
                fh.write(json.dumps(d, ensure_ascii=False) + "\n")
                history_user_count += 1

        # Then synthesize from per-session user turns.
        for src, cwd in session_cwds:
            session_id = src.stem
            for d in _iter_json_objects(src):
                if d.get("type") != "user":
                    continue
                if d.get("isSidechain") is True:
                    # Skip subagent-targeted user messages; they're not the
                    # human's typed prompts.
                    continue
                text = _user_text(d)
                if not text.strip():
                    continue
                raw_ts = d.get("timestamp")
                # Translate ISO timestamp to epoch-ms to match the
                # history.jsonl convention; 0 marks "unknown".
                ts_dt = _aware_ts(raw_ts)
                ts_ms = _epoch_ms(ts_dt) if ts_dt else 0
                # Key on the epoch-ms form so synthesized entries can match
                # the carved history lines above; fall back to the raw value
                # when the timestamp could not be parsed.
                key = (
                    session_id,
                    str(ts_ms) if ts_dt else str(raw_ts),
                    text[:80],
                )
                if key in written_history:
                    continue
                written_history.add(key)
                synth = {
                    "display": text,
                    "pastedContents": [],
                    "timestamp": ts_ms,
                    "project": cwd,
                    "sessionId": session_id,
                }
                fh.write(json.dumps(synth, ensure_ascii=False) + "\n")
                history_user_count += 1

    # ── README ────────────────────────────────────────────────────────────
    readme = render_readme(
        sessions_restored=sessions_restored,
        main_count=main_count,
        sidechain_count=sidechain_count,
        history_user_count=history_user_count,
        profile_counts=profile_counts,
        project_counts=project_counts,
        ts_min=ts_min,
        ts_max=ts_max,
        ts_per_profile=ts_per_profile,
        unattributed_uuids=sessions_unattributed,
        processed_dir=processed_dir,
    )
    # cwd paths embedded in the README may be non-ASCII; pin the encoding.
    (dest / "README.md").write_text(readme, encoding="utf-8")

    return {
        "sessions_restored": sessions_restored,
        "main_turns": main_count,
        "sidechain_turns": sidechain_count,
        "history_lines": history_user_count,
        "profiles": dict(profile_counts),
        "projects": len(project_counts),
        "ts_min": ts_min.isoformat() if ts_min else None,
        "ts_max": ts_max.isoformat() if ts_max else None,
    }


def render_readme(
    sessions_restored: int,
    main_count: int,
    sidechain_count: int,
    history_user_count: int,
    profile_counts: Counter[str],
    project_counts: Counter[str],
    ts_min: datetime | None,
    ts_max: datetime | None,
    ts_per_profile: defaultdict[str, list[datetime]],
    unattributed_uuids: list[str],
    processed_dir: Path,
) -> str:
    """Build the Markdown README describing a restore run.

    Args:
        sessions_restored: Number of session files copied.
        main_count: Count of turns with ``isSidechain`` false.
        sidechain_count: Count of turns with ``isSidechain`` true.
        history_user_count: Lines written to history.jsonl.
        profile_counts: Sessions per profile name.
        project_counts: Sessions per cwd.
        ts_min: Earliest timestamp seen, if any.
        ts_max: Latest timestamp seen, if any.
        ts_per_profile: Timestamps seen per profile (only min/max are used).
        unattributed_uuids: Session UUIDs with no matching profile.
        processed_dir: Source directory, quoted in the header.

    Returns:
        The README text, lines joined with newlines.
    """
    lines = [
        "# `~/.claude-lost/` -- forensically recovered Claude Code data",
        "",
        f"Generated: {datetime.now(tz=timezone.utc).isoformat()}",
        f"Source: `{processed_dir}`",
        "",
        "## What this is",
        "",
        "Claude Code writes per-session conversation logs to:",
        "",
        "    ~/.claude-<profile>/projects/<encoded-cwd>/<UUID>.jsonl  <-- the main user<->Claude conversation",
        "    ~/.claude-<profile>/projects/<encoded-cwd>/<UUID>/        <-- subdirectory with subagents/, tool-results/",
        "",
        "On this machine the top-level `<UUID>.jsonl` files have largely",
        "disappeared from disk (only ~285 remain across all profiles, while",
        "1578 session directories exist). The subdirectories with subagent",
        "logs survive; the main conversation log does not.",
        "",
        "This directory contains those missing main session logs, recovered",
        "by reading the raw bytes of `/dev/nvme0n1p6` and reconstructing JSON",
        "fragments from free-but-not-yet-overwritten extents. See",
        "`scripts/recover-btrfs` and `scripts/process-recovery` for the",
        "pipeline that produced them.",
        "",
        "## Why are these files not on disk anymore?",
        "",
        "We cannot prove the deletion mechanism, but the evidence points at:",
        "",
        "1. **Repeated `rm -rf ~/.claude/`** by the user (admitted, multiple",
        "   times). Anything that lived in the legacy `~/.claude/` profile",
        "   before it was wiped is gone from the live FS but may still be",
        "   in btrfs CoW free extents.",
        "2. **Claude Code's own log rotation / cleanup**. Every recovered",
        "   turn has a `version` field; the recovered set spans 2.1.58",
        "   through the currently-running 2.1.112 -- so the deletion is",
        "   not a one-time migration cutoff. It happens continuously.",
        "3. **CoW churn against a 71%-full disk** is overwriting older",
        "   extents fast. Older data (Nov/Dec 2025) was already gone before",
        "   recovery ran -- nothing carved older than 2026-01-21.",
        "",
        "Recovered timestamps run right up to the moment the recovery itself",
        "executed -- meaning Claude Code is still creating-then-deleting",
        "(or moving) main session logs. The blocks just hadn't been",
        "overwritten yet when we scanned.",
        "",
        "## Stats",
        "",
        f"- Sessions restored:        {sessions_restored:,}",
        f"- Main turns (human/Claude): {main_count:,}",
        f"- Sidechain turns (subagent): {sidechain_count:,} (mostly already in subagents/)",
        f"- Synthesized history.jsonl entries: {history_user_count:,}",
        f"- Distinct project cwds:    {len(project_counts):,}",
    ]
    if ts_min and ts_max:
        lines.append(f"- Time range:               {ts_min.isoformat()} -> {ts_max.isoformat()}")
    lines += [
        "",
        "## Profile attribution",
        "",
        "Each recovered session UUID was matched against currently-existing",
        "session directories (`~/.claude*/projects/.../<UUID>/`). The profile",
        "hosting that directory is the original owner.",
        "",
    ]
    for prof, n in profile_counts.most_common():
        stamps = ts_per_profile.get(prof, [])
        if stamps:
            # Only the extremes are reported; no need to sort the whole list.
            line = f"  - **{prof}**: {n} sessions, {min(stamps).date()} -> {max(stamps).date()}"
        else:
            line = f"  - **{prof}**: {n} sessions"
        lines.append(line)
    if unattributed_uuids:
        lines.append("")
        lines.append(f"  ({len(unattributed_uuids)} sessions could not be attributed --")
        lines.append("   their UUID is not in any current `~/.claude*/projects/` tree.")
        lines.append("   They likely belonged to the wiped legacy `~/.claude/`.)")
    lines += [
        "",
        "## Top projects (by session count)",
        "",
    ]
    for cwd, n in project_counts.most_common(20):
        lines.append(f"  - {n:>4}  `{cwd}`")

    # Report the orphan count of THIS run rather than a number frozen from
    # one historical run, so it always agrees with the attribution section.
    if unattributed_uuids:
        orphan_subject = f"The {len(unattributed_uuids)} truly-orphan sessions"
    else:
        orphan_subject = "Any truly-orphan sessions"

    lines += [
        "",
        "## How the ClaudeTimeline pipeline picks this up",
        "",
        "`src/claude_timeline/config.py::discover_profiles()` enumerates any",
        "`~/.claude*` directory that has `history.jsonl` or `projects/`. This",
        "directory has both. The next `uv run scripts/build-db` run ingests:",
        "",
        "- `history.jsonl` -> `messages` table (user-typed prompts)",
        "- `projects/<encoded>/<UUID>.jsonl` -> `assistant_messages`,",
        "  `bash_commands`, `tool_calls`, `tool_results` tables.",
        "",
        "All recovered data ends up under `account = /home/m/.claude-lost`",
        "in the dashboard, so you can filter to it (or exclude it) cleanly",
        "in the Finder.",
        "",
        "## What was NOT recovered",
        "",
        "- Anything older than 2026-01-21. The btrfs free-extent traces for",
        "  Nov/Dec 2025 (which the user wanted) have been overwritten by",
        "  later writes. They are gone.",
        "- Subagent message content not present in the main log -- those",
        "  remain in `~/.claude*/projects/.../<UUID>/subagents/agent-*.jsonl`",
        "  on disk and are already ingested by ClaudeTimeline.",
        "- Tool-result content beyond what was inlined into main turns.",
        "",
        "## Limitations and caveats",
        "",
        "- These files were reconstructed from raw byte fragments.  Each",
        "  individual JSON line was parsed and validated, but the assembled",
        "  per-session jsonls may have ordering oddities (the carve emits",
        "  in disk order, which is not always temporal). The restorer copies",
        "  lines verbatim in that order and does not re-sort them, so order",
        "  by `timestamp` when chronology matters.",
        f"- {orphan_subject} (no current session dir) are still",
        "  placed in `projects/<encoded-cwd>/` based on the cwd field of",
        "  their first turn -- this is the best guess.",
        "- Synthesized history.jsonl entries deduplicate against each other",
        "  but not against existing `~/.claude*/history.jsonl` entries.",
        "  After ingestion you may want to run a one-time dedup query if",
        "  any messages appear twice in the dashboard.",
        "",
        "## To re-run",
        "",
        "    uv run scripts/recover-btrfs --device /dev/nvme0n1p6 --force-live --skip-trees",
        "    uv run scripts/process-recovery",
        "    uv run scripts/restore-recovered --overwrite",
        "    uv run scripts/build-db",
        "",
    ]
    return "\n".join(lines)


def main() -> int:
    """CLI entry point: parse arguments, run restore(), print a summary.

    All progress and summary output goes to stderr.

    Returns:
        0 on success, 1 when no usable processed/ directory is found.
        restore() may additionally terminate the process via SystemExit.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains an indented layout diagram; the
        # default formatter would reflow it into an unreadable paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument(
        "processed_dir",
        nargs="?",
        type=Path,
        help="processed/ dir from scripts/process-recovery (default: newest)",
    )
    ap.add_argument(
        "--dest",
        type=Path,
        default=DEST_ROOT,
        help=f"Destination directory (default: {DEST_ROOT})",
    )
    ap.add_argument(
        "--overwrite",
        action="store_true",
        help="Replace any existing destination directory",
    )
    args = ap.parse_args()

    # Fall back to auto-discovery when no directory was given explicitly.
    src = args.processed_dir or newest_processed_dir()
    if src is None or not src.is_dir():
        print(
            "ERROR: no processed/ dir found. Run scripts/process-recovery first.",
            file=sys.stderr,
        )
        return 1

    print(f"[restore] from: {src}", file=sys.stderr)
    print(f"[restore] to:   {args.dest}", file=sys.stderr)

    stats = restore(src, args.dest, overwrite=args.overwrite)

    print("\n=== restore summary ===", file=sys.stderr)
    print(json.dumps(stats, indent=2, default=str), file=sys.stderr)
    print(f"\nWrote {args.dest}", file=sys.stderr)
    print("  README.md", file=sys.stderr)
    print(f"  history.jsonl ({stats['history_lines']} entries)", file=sys.stderr)
    print(
        f"  projects/ ({stats['projects']} project dirs, "
        f"{stats['sessions_restored']} sessions)",
        file=sys.stderr,
    )
    print("\nNext: `uv run scripts/build-db` to ingest into the dashboard.", file=sys.stderr)
    return 0


# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    sys.exit(main())
