#!/usr/bin/env bash
#MISE description="8-step sidecar restart recovery: stop → kill mutations → OPTIMIZE → clear checkpoints → start sidecar → poll sync_flush → verify Stathera → start kintsugi + monit (Issue #297)"
#
# Implements: odb-ship Phase 4.1b–4.9 + sidecar-restart-zero-overlap-zero-gap-recovery-playbook Pattern 1
# Codifies the manual 8-step deploy recovery sequence to prevent Anti-Pattern 9 (#295).
#
# Steps:
#   1. Unmonitor from monit + stop sidecar & kintsugi
#   2. Kill ALL ClickHouse mutations (completed AND pending)
#   3. OPTIMIZE TABLE FINAL (force RMT dedup, resolve pre-existing overlaps)
#   4. Clear stale streaming checkpoints
#   5. Start sidecar ONLY
#   6. Poll sidecar logs for "sync_flush: drained and wrote" (5 min timeout)
#   7. Verify zero Stathera anomalies on today via ClickHouse
#   8. Start kintsugi + re-monitor in monit
#
# Usage:
#   mise run deploy:verify-restart
#   mise run deploy:verify-restart -- --skip-stop   # skip step 1 (sidecar already stopped)
#   REMOTE=myhost mise run deploy:verify-restart     # override SSH target

# Strict mode: abort on command failure, unset variables, and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Configuration.
#   REMOTE                SSH target; taken from the environment, defaults to "bigblack".
#   SKIP_STOP             First positional argument ("--skip-stop" skips Step 1), else empty.
#   SYNC_FLUSH_TIMEOUT_S  Seconds to wait for the sync_flush log marker (5 minutes).
#   CH_URL                ClickHouse HTTP endpoint, as seen from the remote host.
: "${REMOTE:=bigblack}"
SKIP_STOP=""
if [ "$#" -gt 0 ]; then
    SKIP_STOP="$1"
fi
SYNC_FLUSH_TIMEOUT_S=300
CH_URL="http://localhost:8123/"

# Banner.
printf '%s\n' \
    "=== deploy:verify-restart on $REMOTE ===" \
    "    Implements: odb-ship Phase 4.1b-4.9 (Issue #297)" \
    ""

# ─────────────────────────────────────────────────────────────
# STEP 1: Unmonitor from monit THEN stop services
# CRITICAL: unmonitor first — monit auto-restarts within 120s if still monitoring
#
# The sidecar's state is checked after the stop. If it is still "active" a
# second stop is issued and the state is checked AGAIN; the script aborts if
# the sidecar is still running, because the following steps (mutation kill,
# OPTIMIZE, checkpoint removal) must not run against a live writer.
# Skipped entirely when the first argument is --skip-stop.
# ─────────────────────────────────────────────────────────────
if [ "$SKIP_STOP" != "--skip-stop" ]; then
    echo "→ [Step 1] Unmonitoring from monit..."
    # Best-effort: monit errors are deliberately ignored on the remote side.
    ssh "$REMOTE" 'sudo monit unmonitor sidecar 2>/dev/null || true; sudo monit unmonitor kintsugi 2>/dev/null || true'
    echo "  Monit unmonitored."

    echo "→ [Step 1] Stopping sidecar + kintsugi..."
    ssh "$REMOTE" 'systemctl --user stop opendeviationbar-sidecar opendeviationbar-kintsugi opendeviationbar-kintsugi-catchup 2>/dev/null || true'

    # Verify stopped. is-active prints the state itself and exits non-zero for
    # anything but "active"; "|| true" only keeps ssh's exit status at 0.
    SIDECAR_STATE=$(ssh "$REMOTE" 'systemctl --user is-active opendeviationbar-sidecar 2>/dev/null || true')
    if [ "$SIDECAR_STATE" = "active" ]; then
        echo "  ERROR: Sidecar still active after stop. Trying again..."
        ssh "$REMOTE" 'systemctl --user stop opendeviationbar-sidecar --force 2>/dev/null || true'
        sleep 3

        # Re-check: never continue to the destructive steps with a live sidecar.
        SIDECAR_STATE=$(ssh "$REMOTE" 'systemctl --user is-active opendeviationbar-sidecar 2>/dev/null || true')
        if [ "$SIDECAR_STATE" = "active" ]; then
            echo "  ERROR: Sidecar is still active after the second stop attempt. Aborting."
            echo "  NOTE: monit is still unmonitored for sidecar/kintsugi."
            echo "  Inspect: ssh $REMOTE 'systemctl --user status opendeviationbar-sidecar --no-pager'"
            exit 1
        fi
    fi
    echo "  Services stopped."
else
    echo "→ [Step 1] Skipped (--skip-stop): assuming services already stopped."
fi

# ─────────────────────────────────────────────────────────────
# STEP 2: Kill ALL ClickHouse mutations (completed AND pending)
# CRITICAL (#295): per that issue, completed lightweight DELETE mutations keep
# their _row_exists=0 masks and re-apply them on every part merge, silently
# deleting newly inserted bars. Hence the KILL is not restricted to pending
# mutations. Afterwards the count of unfinished mutations must be 0; one
# recheck after 5s is allowed before aborting.
# ─────────────────────────────────────────────────────────────
KILL_ALL_MUTATIONS_SQL="KILL MUTATION WHERE database = 'opendeviationbar_cache'"
PENDING_MUTATIONS_SQL="SELECT count() FROM system.mutations WHERE database = 'opendeviationbar_cache' AND is_done = 0"

# Prints the number of unfinished mutations (all whitespace stripped) as
# reported by ClickHouse on the remote host.
count_pending_mutations() {
    ssh "$REMOTE" "curl -s '${CH_URL}' --data-binary \"${PENDING_MUTATIONS_SQL}\"" | tr -d '[:space:]'
}

printf '\n%s\n' "→ [Step 2] Killing ALL ClickHouse mutations (database=opendeviationbar_cache)..."
ssh "$REMOTE" "curl -s '${CH_URL}' --data-binary \"${KILL_ALL_MUTATIONS_SQL}\" && echo '  Mutation kill sent.'"

# Verify zero remaining mutations
pending_count=$(count_pending_mutations)
printf '  Pending mutations after kill: %s\n' "$pending_count"
if [[ "$pending_count" != "0" ]]; then
    printf '  WARNING: %s mutations still pending. Waiting 5s and rechecking...\n' "$pending_count"
    sleep 5
    pending_count=$(count_pending_mutations)
    printf '  Pending mutations (recheck): %s\n' "$pending_count"
    if [[ "$pending_count" != "0" ]]; then
        echo "  ERROR: Mutations still pending after retry. Aborting — investigate before proceeding."
        echo "  Run: ssh $REMOTE 'curl -s http://localhost:8123/ --data-binary \"SELECT mutation_id, command, is_done FROM system.mutations WHERE database = '"'"'opendeviationbar_cache'"'"' ORDER BY create_time DESC LIMIT 20\"'"
        exit 1
    fi
fi
printf '%s\n' "  All mutations cleared."

# ─────────────────────────────────────────────────────────────
# STEP 3: OPTIMIZE TABLE FINAL
# Forces ReplacingMergeTree dedup — resolves overlaps from prior deploys,
# mutation storms, or concurrent writer races. Much faster than ALTER DELETE
# (no new mutations created). Cost: 5-30s depending on parts count.
#
# curl exits 0 on an HTTP error response, so the HTTP status is appended to
# the output with -w and checked explicitly; a failed OPTIMIZE aborts the
# script instead of being reported as done.
# ─────────────────────────────────────────────────────────────
echo ""
echo "→ [Step 3] OPTIMIZE TABLE FINAL (force RMT dedup)..."
OPT_START=$(date +%s)
# Output layout: <response body> NEWLINE <http status code>
if ! OPT_RESPONSE=$(ssh "$REMOTE" "curl -sS -w '\n%{http_code}' '${CH_URL}?wait_end_of_query=1' --data-binary 'OPTIMIZE TABLE opendeviationbar_cache.open_deviation_bars FINAL'"); then
    echo "  ERROR: OPTIMIZE FINAL request failed (ssh/curl error). Aborting."
    exit 1
fi
OPT_HTTP_CODE="${OPT_RESPONSE##*$'\n'}"   # last line: status code
OPT_BODY="${OPT_RESPONSE%$'\n'*}"         # everything before it: response body
if [ "$OPT_HTTP_CODE" != "200" ]; then
    echo "  ERROR: OPTIMIZE FINAL failed (HTTP ${OPT_HTTP_CODE}). Aborting."
    if [ -n "$OPT_BODY" ]; then
        echo "  ClickHouse response: ${OPT_BODY}"
    fi
    exit 1
fi
echo "  OPTIMIZE FINAL done."
OPT_END=$(date +%s)
OPT_ELAPSED=$((OPT_END - OPT_START))
echo "  OPTIMIZE FINAL completed in ${OPT_ELAPSED}s."

# ─────────────────────────────────────────────────────────────
# STEP 4: Clear stale streaming checkpoints
# Checkpoints left by the previous session may point beyond the data that is
# actually in ClickHouse, especially after OPTIMIZE merges parts. Any gaps
# caused by removing them are covered by the sidecar's
# _inline_startup_backfill().
# ─────────────────────────────────────────────────────────────
# Kept unexpanded locally: "~" and "*" are resolved by the remote shell.
CHECKPOINT_GLOB='~/.cache/opendeviationbar/checkpoints/streaming_*.json'

printf '\n%s\n' "→ [Step 4] Clearing stale streaming checkpoints..."
ssh "$REMOTE" "rm -f ${CHECKPOINT_GLOB} && echo \"  Streaming checkpoints cleared.\""

# ─────────────────────────────────────────────────────────────
# STEP 5: Start sidecar ONLY (kintsugi waits until sync_flush verified)
# Anti-Pattern 9 (#295): Starting kintsugi before sync_flush completes allows
# kintsugi mutations to delete sidecar's freshly-written bars.
#
# The remote wall-clock time is captured immediately BEFORE the start and is
# used as the journal lower bound in Steps 5 and 6. A relative window
# ("90s ago" / "300s ago") can also match markers logged by the PREVIOUS
# sidecar session, which would falsely confirm startup or sync_flush.
# ─────────────────────────────────────────────────────────────
echo ""
echo "→ [Step 5] Starting sidecar (kintsugi held back until Step 7 verified)..."
# Taken on the remote host so local/remote clock skew cannot shift the window.
SIDECAR_START_TS=$(ssh "$REMOTE" "date '+%Y-%m-%d %H:%M:%S'")
ssh "$REMOTE" 'systemctl --user start opendeviationbar-sidecar'

# Wait for basic sidecar startup marker (12 attempts, 5s apart)
echo "  Waiting for sidecar startup marker (up to 60s)..."
STARTUP_ATTEMPTS=12
SIDECAR_UP=0
for ((i = 1; i <= STARTUP_ATTEMPTS; i++)); do
    if ssh "$REMOTE" "journalctl --user -u opendeviationbar-sidecar --since '${SIDECAR_START_TS}' --no-pager 2>/dev/null | grep -qE 'sidecar_startup|StreamManager started|engine started|_create_engine'" 2>/dev/null; then
        echo "  Sidecar confirmed running (attempt ${i}/${STARTUP_ATTEMPTS})."
        SIDECAR_UP=1
        break
    fi
    # No sleep after the final attempt.
    if [ "$i" -lt "$STARTUP_ATTEMPTS" ]; then
        sleep 5
    fi
done

if [ "$SIDECAR_UP" -eq 0 ]; then
    echo "  ERROR: Sidecar did not start after 60s. Check logs:"
    echo "    ssh $REMOTE 'journalctl --user -u opendeviationbar-sidecar -n 50 --no-pager'"
    exit 1
fi

# ─────────────────────────────────────────────────────────────
# STEP 6: Poll for sync_flush completion (5 min timeout)
# "sync_flush: drained and wrote" confirms sidecar has finished writing
# today's history from fill_from_rest. Only after this is it safe to
# start kintsugi — otherwise kintsugi mutations can delete pending bars.
# Only journal entries written since this run started the sidecar count.
# ─────────────────────────────────────────────────────────────
echo ""
echo "→ [Step 6] Polling for sync_flush completion (timeout: ${SYNC_FLUSH_TIMEOUT_S}s)..."
POLL_INTERVAL=10
MAX_POLLS=$((SYNC_FLUSH_TIMEOUT_S / POLL_INTERVAL))
SYNC_FLUSH_DONE=0

for ((i = 1; i <= MAX_POLLS; i++)); do
    ELAPSED=$((i * POLL_INTERVAL))
    if ssh "$REMOTE" "journalctl --user -u opendeviationbar-sidecar --since '${SIDECAR_START_TS}' --no-pager 2>/dev/null | grep -q 'sync_flush.*drained and wrote'" 2>/dev/null; then
        echo "  sync_flush complete (${ELAPSED}s elapsed)."
        SYNC_FLUSH_DONE=1
        break
    fi
    echo "  Waiting for sync_flush... (${ELAPSED}/${SYNC_FLUSH_TIMEOUT_S}s, poll ${i}/${MAX_POLLS})"
    sleep "$POLL_INTERVAL"
done

if [ "$SYNC_FLUSH_DONE" -eq 0 ]; then
    echo ""
    echo "  WARNING: sync_flush not detected after ${SYNC_FLUSH_TIMEOUT_S}s."
    echo "  This may indicate:"
    echo "    - Sidecar is still catching up from REST (high-volume day)"
    echo "    - Log marker changed in a recent version"
    echo "    - Sidecar encountered an error during fill_from_rest"
    echo ""
    echo "  Recent sidecar logs:"
    ssh "$REMOTE" 'journalctl --user -u opendeviationbar-sidecar --since "5 min ago" --no-pager 2>/dev/null | tail -20' || true
    echo ""
    # read fails at EOF (non-interactive run); treat that as "N" so the abort
    # message below is printed instead of errexit killing the script silently.
    read -r -p "  Continue anyway? Proceeding without sync_flush confirmation risks kintsugi deleting sidecar bars. [y/N] " CONT || CONT=""
    if [ "${CONT:-N}" != "y" ] && [ "${CONT:-N}" != "Y" ]; then
        echo "  Aborted. Investigate sidecar logs before proceeding."
        exit 1
    fi
    echo "  Continuing on manual override."
fi

# ─────────────────────────────────────────────────────────────
# STEP 7: Verify zero Stathera anomalies on today
# Must confirm today is clean BEFORE starting kintsugi.
# Query: today's gaps + overlaps via lagInFrame window function.
#
# The query returns ONE TabSeparated row: "<overlaps>\t<intraday_gaps>".
# Only CR/LF are stripped from the response — the TAB must survive, otherwise
# the two counters fuse together (e.g. "0\t0" → "00") and the check can never
# pass. Both fields are validated as non-negative integers so a ClickHouse
# error body is reported as such rather than as an anomaly count.
# ─────────────────────────────────────────────────────────────
echo ""
echo "→ [Step 7] Verifying zero Stathera anomalies on today's data..."

STATHERA_QUERY="
SELECT
    countIf(tid_delta < 0) AS overlaps,
    countIf(tid_delta > 0 AND day_gap = 0) AS intraday_gaps
FROM (
    SELECT
        first_agg_trade_id - lagInFrame(last_agg_trade_id, 1) OVER w - 1 AS tid_delta,
        toDate(close_time_ms / 1000) - toDate(lagInFrame(close_time_ms, 1) OVER w / 1000) AS day_gap,
        row_number() OVER w AS rn
    FROM opendeviationbar_cache.open_deviation_bars FINAL
    WHERE first_agg_trade_id > 0
      AND last_agg_trade_id > 0
      AND toDate(close_time_ms / 1000) = today()
    WINDOW w AS (
        PARTITION BY symbol, threshold_decimal_bps, ouroboros_mode
        ORDER BY first_agg_trade_id, close_time_ms
    )
)
WHERE rn > 1
FORMAT TabSeparated
"

# Strip line terminators only; keep the TAB that separates the two columns.
STATHERA_RESULT=$(ssh "$REMOTE" "curl -s '${CH_URL}' --data-binary \"${STATHERA_QUERY}\"" | tr -d '\r\n')
OVERLAPS=$(printf '%s\n' "$STATHERA_RESULT" | cut -f1)
INTRADAY_GAPS=$(printf '%s\n' "$STATHERA_RESULT" | cut -f2)

# Reject empty or non-numeric fields (e.g. a ClickHouse exception message).
for STATHERA_FIELD in "$OVERLAPS" "$INTRADAY_GAPS"; do
    case "$STATHERA_FIELD" in
        '' | *[!0-9]*)
            echo "  ERROR: Could not parse Stathera query result. Raw response:"
            echo "    ${STATHERA_RESULT}"
            echo "  Do NOT start kintsugi until today's data has been verified."
            exit 1
            ;;
    esac
done

echo "  Today's Stathera: overlaps=${OVERLAPS}, intraday_gaps=${INTRADAY_GAPS}"

if [ "${OVERLAPS}" != "0" ] || [ "${INTRADAY_GAPS}" != "0" ]; then
    echo ""
    echo "  ERROR: Today has Stathera anomalies!"
    echo "    overlaps=${OVERLAPS}  intraday_gaps=${INTRADAY_GAPS}"
    echo ""
    echo "  Do NOT start kintsugi — it will create mutations that may worsen the situation."
    echo ""
    echo "  If overlaps > 0: Run OPTIMIZE FINAL again (step 3). If persists, use Pattern 2:"
    echo "    Stop sidecar → DELETE today → backfill → restart."
    echo "    See: .claude/skills/sidecar-restart-zero-overlap-zero-gap-recovery-playbook/SKILL.md"
    echo ""
    echo "  Detail query:"
    echo "    ssh $REMOTE \"curl -s http://localhost:8123/ --data-binary 'SELECT symbol, threshold_decimal_bps, tid_delta, toDate(close_time_ms/1000) AS bar_date FROM (SELECT symbol, threshold_decimal_bps, first_agg_trade_id - lagInFrame(last_agg_trade_id,1) OVER w - 1 AS tid_delta, close_time_ms, row_number() OVER w AS rn FROM opendeviationbar_cache.open_deviation_bars FINAL WHERE toDate(close_time_ms/1000)=today() WINDOW w AS (PARTITION BY symbol, threshold_decimal_bps, ouroboros_mode ORDER BY first_agg_trade_id)) WHERE rn > 1 AND tid_delta != 0 ORDER BY symbol, threshold_decimal_bps LIMIT 20'\""
    exit 1
fi
echo "  Stathera clean on today. Safe to start kintsugi."

# ─────────────────────────────────────────────────────────────
# STEP 8: Start kintsugi + re-monitor in monit
# Reached only once today's data has passed the Stathera check.
# ─────────────────────────────────────────────────────────────
printf '\n%s\n' "→ [Step 8] Starting kintsugi (gap reconciliation for yesterday+)..."
ssh "$REMOTE" 'systemctl --user start opendeviationbar-kintsugi'
sleep 3

# Best-effort on the remote side: monit errors are ignored there.
REMONITOR_CMD='sudo monit monitor sidecar 2>/dev/null || true; sudo monit monitor kintsugi 2>/dev/null || true'

printf '%s\n' "→ [Step 8] Re-monitoring in monit..."
ssh "$REMOTE" "$REMONITOR_CMD"
printf '%s\n' "  Monit monitoring restored."

# ─────────────────────────────────────────────────────────────
# Final service status summary
# Prints the is-active state of each unit/timer on the remote host, then the
# follow-up commands for the operator.
# ─────────────────────────────────────────────────────────────
# Executed verbatim by the remote shell (single-quoted: nothing expands locally).
STATUS_SCRIPT='
for svc in opendeviationbar-sidecar opendeviationbar-kintsugi opendeviationbar-heartbeat.timer opendeviationbar-seeder.timer; do
    state=$(systemctl --user is-active "$svc" 2>/dev/null || echo "inactive")
    echo "  $svc: $state"
done
'

printf '\n%s\n' "=== Service Status ==="
ssh "$REMOTE" "$STATUS_SCRIPT"

cat <<EOF

=== verify-restart complete ===

Monitor:
  ssh $REMOTE 'journalctl --user -u opendeviationbar-sidecar -f --no-pager'
  ssh $REMOTE 'journalctl --user -u opendeviationbar-kintsugi -f --no-pager'

Run verify-clean anytime:
  mise run deploy:verify-clean
EOF
