#!/usr/bin/env bash
#MISE description="Verify production health: zero Stathera anomalies on today + service status + monit summary (Issue #297)"
#
# Equivalent to verify-restart Steps 7-8 (verification only, no restart).
# Safe to run at any time — read-only ClickHouse queries + systemd status checks.
#
# Checks:
#   1. ClickHouse ping
#   2. Today's Stathera: overlaps=0, intraday_gaps=0
#   3. Sidecar health endpoint (/health)
#   4. Pending ClickHouse mutations
#   5. Service active states
#   6. Monit summary
#
# Usage:
#   mise run deploy:verify-clean
#   REMOTE=myhost mise run deploy:verify-clean

# Strict mode: stop on unhandled errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# SSH target for every check; an already-set REMOTE wins over the default.
: "${REMOTE:=bigblack}"
# ClickHouse HTTP endpoint; it is used by curl commands executed on the remote host.
CH_URL="http://localhost:8123/"
# Result tallies, incremented by the _pass / _warn / _fail helpers.
PASS=0 WARN=0 FAIL=0

# Report a successful check: print a PASS-tagged line built from all arguments
# and add one to the PASS tally.
_pass() {
    printf '  PASS  %s\n' "$*"
    PASS=$((PASS + 1))
}
# Report a non-fatal finding: print a WARN-tagged line built from all arguments
# and add one to the WARN tally.
_warn() {
    printf '  WARN  %s\n' "$*"
    WARN=$((WARN + 1))
}
# Report a failed check: print a FAIL-tagged line built from all arguments
# and add one to the FAIL tally.
_fail() {
    printf '  FAIL  %s\n' "$*"
    FAIL=$((FAIL + 1))
}

echo "=== deploy:verify-clean on $REMOTE ==="
echo ""

# ─────────────────────────────────────────────────────────────
# 1. ClickHouse ping
# ─────────────────────────────────────────────────────────────
# Gate for the rest of the run: when the ping does not succeed, record the
# failure, print the summary line and stop with a non-zero exit status.
echo "--- ClickHouse Connectivity ---"
if ! ssh "$REMOTE" "curl -sf '${CH_URL}ping'" >/dev/null 2>&1; then
    _fail "ClickHouse not reachable — all data checks skipped"
    echo ""
    echo "=== Summary: PASS=${PASS} WARN=${WARN} FAIL=${FAIL} ==="
    exit 1
fi
_pass "ClickHouse ping OK"

# ─────────────────────────────────────────────────────────────
# 2. Stathera: today's overlaps + intraday gaps
# ─────────────────────────────────────────────────────────────
echo ""
echo "--- Stathera Audit (today) ---"

# Split one TabSeparated result row ("overlaps<TAB>intraday_gaps<TAB>transitions")
# into the globals OVERLAPS, INTRADAY_GAPS and TRANSITIONS.
# Arguments: $1 - the raw row with line terminators already removed.
# Columns that are absent (empty or non-tabular response) are left empty.
_stathera_parse_counts() {
    OVERLAPS="" INTRADAY_GAPS="" TRANSITIONS=""
    IFS=$'\t' read -r OVERLAPS INTRADAY_GAPS TRANSITIONS <<<"${1-}" || true
}

# Counts, over today's bars per (symbol, threshold_decimal_bps, ouroboros_mode)
# ordered by trade id, how many consecutive bar pairs overlap (tid_delta < 0)
# or leave a same-day gap (tid_delta > 0 with day_gap = 0).
# NOTE(review): total_transitions is count() - 1 over rows already filtered to
# rn > 1, so it reads -1 when there are no such rows — confirm this is intended.
STATHERA_QUERY="
SELECT
    countIf(tid_delta < 0) AS overlaps,
    countIf(tid_delta > 0 AND day_gap = 0) AS intraday_gaps,
    count() - 1 AS total_transitions
FROM (
    SELECT
        first_agg_trade_id - lagInFrame(last_agg_trade_id, 1) OVER w - 1 AS tid_delta,
        toDate(close_time_ms / 1000) - toDate(lagInFrame(close_time_ms, 1) OVER w / 1000) AS day_gap,
        row_number() OVER w AS rn
    FROM opendeviationbar_cache.open_deviation_bars FINAL
    WHERE first_agg_trade_id > 0
      AND last_agg_trade_id > 0
      AND toDate(close_time_ms / 1000) = today()
    WINDOW w AS (
        PARTITION BY symbol, threshold_decimal_bps, ouroboros_mode
        ORDER BY first_agg_trade_id, close_time_ms
    )
)
WHERE rn > 1
FORMAT TabSeparated
"

# Strip only the line terminators: the tabs are the column separators and must
# survive for the split. (Deleting all whitespace fused "0<TAB>0<TAB>5" into
# "005", which then never compared equal to "0".)
# A failed ssh/curl leaves the row empty so the check is reported as FAIL
# instead of aborting the whole script under `set -e` / `pipefail`.
STATHERA_RAW=$(ssh "$REMOTE" "curl -s '${CH_URL}' --data-binary \"${STATHERA_QUERY}\"" | tr -d '\r\n') || STATHERA_RAW=""
_stathera_parse_counts "$STATHERA_RAW"

echo "  Today's bars: transitions=${TRANSITIONS:-0}, overlaps=${OVERLAPS:-?}, intraday_gaps=${INTRADAY_GAPS:-?}"

# Missing values default to "1" here so an empty or unparsable response can
# never be mistaken for a clean result.
if [ "${OVERLAPS:-1}" = "0" ] && [ "${INTRADAY_GAPS:-1}" = "0" ]; then
    _pass "Stathera clean (overlaps=0, intraday_gaps=0)"
else
    _fail "Stathera anomalies: overlaps=${OVERLAPS:-?}, intraday_gaps=${INTRADAY_GAPS:-?}"
    echo ""
    echo "  Detail:"
    # Best-effort listing of the offending transitions (errors are ignored).
    # Uses the same row filter as the summary query so the rows shown are the
    # ones that were counted.
    ssh "$REMOTE" "curl -s '${CH_URL}' --data-binary \"
SELECT symbol, threshold_decimal_bps,
       if(tid_delta < 0, 'OVERLAP', 'GAP') AS anomaly_type,
       tid_delta,
       toDate(close_time_ms / 1000) AS bar_date
FROM (
    SELECT symbol, threshold_decimal_bps,
           first_agg_trade_id - lagInFrame(last_agg_trade_id, 1) OVER w - 1 AS tid_delta,
           close_time_ms,
           toDate(close_time_ms / 1000) - toDate(lagInFrame(close_time_ms, 1) OVER w / 1000) AS day_gap,
           row_number() OVER w AS rn
    FROM opendeviationbar_cache.open_deviation_bars FINAL
    WHERE first_agg_trade_id > 0
      AND last_agg_trade_id > 0
      AND toDate(close_time_ms / 1000) = today()
    WINDOW w AS (
        PARTITION BY symbol, threshold_decimal_bps, ouroboros_mode
        ORDER BY first_agg_trade_id, close_time_ms
    )
)
WHERE rn > 1 AND (tid_delta < 0 OR (tid_delta > 0 AND day_gap = 0))
ORDER BY symbol, threshold_decimal_bps
LIMIT 20
FORMAT PrettyCompact
\"" 2>/dev/null || true
    echo ""
    echo "  Recovery: mise run deploy:verify-restart"
    echo "  Playbook: .claude/skills/sidecar-restart-zero-overlap-zero-gap-recovery-playbook/SKILL.md"
fi

# ─────────────────────────────────────────────────────────────
# 3. Sidecar health endpoint
# ─────────────────────────────────────────────────────────────
echo ""
echo "--- Sidecar Health Endpoint ---"
# Fetch the health document via curl on the remote host; a failed ssh/curl
# yields an empty string rather than an error.
HEALTH_RAW=$(ssh "$REMOTE" 'curl -sf http://localhost:8081/health 2>/dev/null' || true)
if [ -z "$HEALTH_RAW" ]; then
    _warn "Sidecar /health endpoint not responding (may be starting up)"
else
    # Extract the JSON "status" field with the local python3; "?" is used when
    # the key is absent or the extraction command fails.
    HEALTH_STATUS=$(echo "$HEALTH_RAW" | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('status','?'))" 2>/dev/null || echo "?")
    case "$HEALTH_STATUS" in
        healthy)
            _pass "Sidecar health: ${HEALTH_STATUS}"
            ;;
        *)
            _warn "Sidecar health: ${HEALTH_STATUS} (check /health for details)"
            ;;
    esac
    echo "  Full: $HEALTH_RAW"
fi

# ─────────────────────────────────────────────────────────────
# 4. Pending mutations
# ─────────────────────────────────────────────────────────────
echo ""
echo "--- ClickHouse Mutations ---"
# Count unfinished (is_done = 0) and total mutations recorded in
# system.mutations for the opendeviationbar_cache database.
# A failed ssh call leaves the value empty, which is reported as a WARN
# instead of aborting the script under `set -e` / `pipefail`.
MUTATIONS_COUNT_SQL="SELECT count() FROM system.mutations WHERE database = 'opendeviationbar_cache'"
PENDING=$(ssh "$REMOTE" "curl -s '${CH_URL}' --data-binary \"${MUTATIONS_COUNT_SQL} AND is_done = 0\"" | tr -d '[:space:]') || PENDING=""
ALL=$(ssh "$REMOTE" "curl -s '${CH_URL}' --data-binary \"${MUTATIONS_COUNT_SQL}\"" | tr -d '[:space:]') || ALL=""
if [ "${PENDING:-1}" = "0" ]; then
    _pass "Mutations: pending=${PENDING}, total=${ALL:-?}"
else
    if [ -z "$PENDING" ]; then
        _warn "Mutations: pending count unavailable (query returned nothing)"
    else
        _warn "Mutations: pending=${PENDING} (non-zero, may be in-flight)"
    fi
    # Copy-pasteable follow-up command. The database name is a single-quoted SQL
    # string literal: ClickHouse treats a double-quoted token as an identifier,
    # so the previous hint failed when pasted.
    echo "  Check: ssh $REMOTE \"curl -s ${CH_URL} --data-binary \\\"SELECT mutation_id, command, is_done FROM system.mutations WHERE database = 'opendeviationbar_cache' ORDER BY create_time DESC LIMIT 10\\\"\""
fi

# ─────────────────────────────────────────────────────────────
# 5. Service active states
# ─────────────────────────────────────────────────────────────
echo ""
echo "--- Service Status ---"
# Query every unit in a single ssh round-trip and emit one "<unit> <state>" line
# per unit, so the states displayed and the states judged are the same.
# `systemctl is-active` prints the state on stdout even when it exits non-zero,
# so its output is captured as-is; "inactive" is substituted only when it
# printed nothing at all. (Appending `|| echo inactive` produced a two-line
# value such as "failed" + "inactive".)
SERVICE_STATES=$(ssh "$REMOTE" '
for svc in opendeviationbar-sidecar opendeviationbar-kintsugi opendeviationbar-heartbeat.timer opendeviationbar-seeder.timer; do
    state=$(systemctl --user is-active "$svc" 2>/dev/null)
    echo "$svc ${state:-inactive}"
done
') || SERVICE_STATES=""

# "unknown" remains when the remote query produced no line for the unit
# (for example because ssh itself failed).
SIDECAR_STATE="unknown"
KINTSUGI_STATE="unknown"
if [ -z "$SERVICE_STATES" ]; then
    echo "  (could not query service states)"
fi
while read -r svc state; do
    [ -n "$svc" ] || continue
    echo "  $svc: $state"
    case "$svc" in
        opendeviationbar-sidecar)  SIDECAR_STATE="$state" ;;
        opendeviationbar-kintsugi) KINTSUGI_STATE="$state" ;;
    esac
done <<<"$SERVICE_STATES"

# A sidecar that is not active is a failure; a kintsugi unit that is not
# active is only a warning.
if [ "$SIDECAR_STATE" = "active" ]; then
    _pass "sidecar active"
else
    _fail "sidecar ${SIDECAR_STATE}"
fi
if [ "$KINTSUGI_STATE" = "active" ]; then
    _pass "kintsugi active"
else
    _warn "kintsugi ${KINTSUGI_STATE} (may be completed/sleeping)"
fi

# ─────────────────────────────────────────────────────────────
# 6. Monit summary
# ─────────────────────────────────────────────────────────────
echo ""
echo "--- Monit Summary ---"
# Informational only: the summary is printed as-is and does not affect the
# PASS/WARN/FAIL tallies. A placeholder is shown when the remote command fails.
if ! ssh "$REMOTE" 'sudo monit summary 2>/dev/null'; then
    echo "  (monit not available)"
fi

# ─────────────────────────────────────────────────────────────
# Summary verdict
# ─────────────────────────────────────────────────────────────
echo ""
echo "=== Verdict: PASS=${PASS} WARN=${WARN} FAIL=${FAIL} ==="
# Any failure makes the run unhealthy and the exit status non-zero; warnings
# alone are reported but still exit successfully.
if [ "$FAIL" -eq 0 ]; then
    if [ "$WARN" -eq 0 ]; then
        echo "    HEALTHY — production is clean."
    else
        echo "    OK WITH WARNINGS — review WARNs above."
    fi
else
    echo "    UNHEALTHY — ${FAIL} check(s) failed. Run: mise run deploy:verify-restart"
    exit 1
fi
