#!/usr/bin/env python3
"""Flatten Entity avro files into newline-delimited JSON of EAVT records.

The exported .avro files contain deeply nested Entity records (tracks -> eavts)
which exceed DuckDB's default JSON object size limit. This script extracts the
EAVT (entity-attribute-value-time) data into a flat, DuckDB-friendly format.

Usage:
    ./avro-to-eavts INPUT_DIR [OUTPUT_FILE]

    INPUT_DIR   Directory containing .avro files (e.g. ./processed_cases/CASE_ID/data_files/)
    OUTPUT_FILE Output path for flattened JSON (default: INPUT_DIR/eavts_flat.json)

Output schema (newline-delimited JSON):
    entity_id            - UUID of the entity
    object_class         - UUID of the object class
    track_id             - UUID of the track
    data_source_uuid     - UUID of the data source
    entityAttributeId    - UUID identifying the attribute type
    entityAttributeEnumId - enum variant (empty string if N/A)
    value                - attribute value (numeric or string)
    confidence           - source confidence score
    time                 - ISO 8601 timestamp
"""
import json
import sys
from datetime import date, datetime
from pathlib import Path

import fastavro


def _json_default(obj):
    """Fallback serializer handed to ``json.dumps`` through ``default=``.

    Returns the ISO 8601 string for ``date`` and ``datetime`` values and
    raises ``TypeError`` for every other type.
    """
    # datetime subclasses date, so a single check covers both types.
    if not isinstance(obj, date):
        kind = type(obj).__name__
        raise TypeError(f"Object of type {kind} is not JSON serializable")
    return obj.isoformat()


def _iter_eavt_records(entity):
    """Yield one flat record dict for every eavt nested under *entity*.

    *entity* is a mapping with optional ``tracks``; each track is a mapping
    with optional ``eavts``. Entity- and track-level identifiers are copied
    onto every record produced from them.
    """
    entity_id = entity.get("id", "")
    object_class = entity.get("object_class", "")
    # `or []` / `or {}` instead of a dict.get default: a key that is present
    # with a None value is NOT replaced by the default, and iterating None
    # (or calling .get on it) would raise.
    for track in entity.get("tracks") or []:
        track_id = track.get("track_id", "")
        data_source_uuid = track.get("data_source_uuid", "")
        for eavt in track.get("eavts") or []:
            source = eavt.get("entityDatumSource") or {}
            yield {
                "entity_id": entity_id,
                "object_class": object_class,
                "track_id": track_id,
                "data_source_uuid": data_source_uuid,
                "entityAttributeId": eavt.get("entityAttributeId", ""),
                "entityAttributeEnumId": eavt.get("entityAttributeEnumId", ""),
                "value": eavt.get("value"),
                "confidence": source.get("confidence", 0),
                "time": eavt.get("time", ""),
            }


def flatten_avro_to_eavts(input_dir: Path, output_file: Path) -> int:
    """Flatten every ``*.avro`` file in *input_dir* into newline-delimited JSON.

    Files are processed in sorted path order and each nested eavt becomes one
    JSON object on its own line of *output_file*.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.avro`` files.
        output_file: Destination path; overwritten if it already exists.

    Returns:
        A process exit status: 0 on success, 1 when *input_dir* contains no
        ``.avro`` files (in which case *output_file* is not created).
    """
    avro_files = sorted(input_dir.glob("*.avro"))
    if not avro_files:
        print(f"Error: no .avro files found in {input_dir}", file=sys.stderr)
        return 1

    count = 0
    # Explicit encoding so the output does not depend on the platform locale.
    with open(output_file, "w", encoding="utf-8") as out:
        for avro_path in avro_files:
            with open(avro_path, "rb") as f:
                for entity in fastavro.reader(f):
                    for record in _iter_eavt_records(entity):
                        out.write(json.dumps(record, default=_json_default) + "\n")
                        count += 1

    print(f"Wrote {count} EAVT records to {output_file}")
    return 0


def main():
    """Parse ``sys.argv``, run the flattening, and exit with its status.

    Exit status:
        0  help was requested, or the flattening succeeded.
        1  INPUT_DIR is not a directory (or the flattening reported failure).
        2  the required INPUT_DIR argument is missing (usage error).
    """
    # __doc__ is None when docstrings are stripped (python -OO); fall back to
    # a one-line usage string instead of crashing on None.strip().
    usage = (__doc__ or "Usage: ./avro-to-eavts INPUT_DIR [OUTPUT_FILE]").strip()
    args = sys.argv[1:]

    if args and args[0] in ("-h", "--help"):
        print(usage)
        sys.exit(0)

    if not args:
        # A missing required argument is a usage error: report it on stderr
        # with a non-zero status so calling scripts can detect it.
        print(usage, file=sys.stderr)
        sys.exit(2)

    input_dir = Path(args[0])
    if not input_dir.is_dir():
        print(f"Error: {input_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    if len(args) >= 2:
        output_file = Path(args[1])
    else:
        output_file = input_dir / "eavts_flat.json"

    sys.exit(flatten_avro_to_eavts(input_dir, output_file))


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
