#!/usr/bin/env python3
"""Quantify how much unique data is in DIVERGENT recovered files.

Reads the latest verify-manifest, sums recovered-only records, and prints
a per-session breakdown of the worst offenders.
"""
from __future__ import annotations

import json
from pathlib import Path

# Directory scanned for verify manifests: <two levels above this file>/data/verify-manifests.
MANIFEST_DIR = Path(__file__).resolve().parent.parent / "data" / "verify-manifests"


def latest_manifest() -> Path:
    """Return the verify manifest in ``MANIFEST_DIR`` whose path sorts last.

    Files whose stem ends in ``-quantified`` are ignored: they are derived
    outputs written next to the source manifest and also match the
    ``verify-*.json`` pattern, so without the filter one of them could be
    picked up as input on a later run.

    NOTE(review): "latest" relies on manifest names sorting chronologically
    (presumably they embed a timestamp) -- confirm against the writer.

    Raises:
        SystemExit: if no source manifest is found.
    """
    cands = sorted(
        p
        for p in MANIFEST_DIR.glob("verify-*.json")
        if not p.stem.endswith("-quantified")
    )
    if not cands:
        raise SystemExit(f"no manifests in {MANIFEST_DIR}")
    return cands[-1]


def main() -> None:
    """Quantify recovered-only records for every DIVERGENT manifest entry.

    Loads the latest manifest, re-parses the recovered and real files of each
    DIVERGENT result to count the record keys unique to either side, prints
    the total and a top-20 table, and writes a copy of the manifest with a
    ``divergent_detail`` dict attached to each DIVERGENT result into a
    sibling ``*-quantified.json`` file.
    """
    m_path = latest_manifest()
    print(f"Reading manifest: {m_path}")
    # Decode explicitly as UTF-8; the platform default encoding is not
    # guaranteed to be UTF-8 and JSON files are.
    data = json.loads(m_path.read_text(encoding="utf-8"))

    divergent = [r for r in data["results"] if r.get("class") == "DIVERGENT"]
    if not divergent:
        print("No DIVERGENT entries.")
        return

    total_recovered_only = 0
    rows = []
    for r in divergent:
        # Recompute precise count of unique-to-recovered records by re-parsing
        # the files (manifest only stored sample + count).
        rec = Path(r["recovered"])
        real = Path(r["real_match"])

        rec_keys = _keys(rec)
        real_keys = _keys(real)
        only_rec = rec_keys - real_keys
        only_real = real_keys - rec_keys
        detail = {
            "uuid": r["uuid"],
            "rec_records": len(rec_keys),
            "real_records": len(real_keys),
            "only_in_rec": len(only_rec),
            "only_in_real": len(only_real),
            "recovered_path": str(rec),
        }
        rows.append(detail)
        # Attach the detail to the exact entry it was computed from. Looking
        # entries up by uuid afterwards would mislabel non-DIVERGENT results
        # that share a uuid, and would give duplicate DIVERGENT uuids the
        # numbers of whichever entry was processed last.
        r["divergent_detail"] = detail
        total_recovered_only += len(only_rec)

    rows.sort(key=lambda x: x["only_in_rec"], reverse=True)
    print()
    print(f"Total records ONLY in recovered (would be lost on delete): {total_recovered_only}")
    print(f"Across {len(divergent)} DIVERGENT files")
    print()
    print("Top 20 by unique recovered-only records:")
    print(f"{'uuid':<40} {'rec':>6} {'real':>6} {'only_rec':>9} {'only_real':>10}")
    for r in rows[:20]:
        print(f"{r['uuid']:<40} {r['rec_records']:>6} {r['real_records']:>6} "
              f"{r['only_in_rec']:>9} {r['only_in_real']:>10}")

    # Persist the annotated manifest next to the source one for future reference.
    out_path = m_path.with_name(m_path.stem + "-quantified.json")
    out_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
    print(f"\nQuantified manifest: {out_path}")


def _keys(p: Path) -> set[str]:
    """Return stable record keys (uuid / messageId / content-hash) from a jsonl."""
    import hashlib

    keys: set[str] = set()
    with p.open("rb") as f:
        for raw in f:
            s = raw.strip()
            if not s:
                continue
            try:
                obj = json.loads(s)
            except (UnicodeDecodeError, json.JSONDecodeError):
                continue
            if not isinstance(obj, dict):
                continue
            uid = obj.get("uuid")
            if isinstance(uid, str) and uid:
                keys.add(f"uuid:{uid}")
                continue
            snap = obj.get("snapshot")
            if isinstance(snap, dict):
                sid = snap.get("messageId")
                if isinstance(sid, str) and sid:
                    keys.add(f"snap:{sid}")
                    continue
            mid = obj.get("messageId")
            if isinstance(mid, str) and mid:
                keys.add(f"mid:{mid}")
                continue
            canon = json.dumps(obj, sort_keys=True, separators=(",", ":")).encode("utf-8")
            keys.add("h:" + hashlib.sha1(canon).hexdigest())
    return keys


# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
