#!/usr/bin/env bash
# Refresh the testnot.work staging mirror from the latest production backup.
#
# testnot is a read-only mirror of prod, gated app-side to Fan+/creator accounts
# (ACCESS_GATE). This job reloads its database from the prod backup that
# sandod-backup-fetch already pulls to fw13, so the mirror tracks live ~daily.
#
# Runs on fw13 (the Sando host, where the backup lives and which has tailnet
# root on testnot via Tailscale SSH). The restore runs as the postgres
# superuser on testnot — streamed over Tailscale SSH — so extension/owner lines
# in the dump apply cleanly without granting the app role superuser. The app
# applies any newer migrations on the next boot (MNW migrates on startup), so a
# prod dump a few migrations behind the deployed binary self-heals on restart.
#
# Idempotent and safe to re-run: it stops the app, resets the schema, restores,
# and starts the app. testnot holds no durable state of its own (it's a mirror),
# so a wiped/refreshed DB each run is the intended behavior.
set -euo pipefail

BACKUP="${TESTNOT_BACKUP:-/srv/sando/backups/latest.sql.gz}"
SSH_TARGET="${TESTNOT_SSH:-root@testnot}"
DB="${TESTNOT_DB:-makenotwork}"
SERVICE="makenotwork.service"
# Soak-pause: while this flag file exists, the refresh is a no-op so a
# multi-day soak's data + feature state stay stable. Pause/resume:
#   touch /srv/sando/testnot-refresh.paused   # pause
#   rm    /srv/sando/testnot-refresh.paused   # resume
PAUSE_FLAG="${TESTNOT_REFRESH_PAUSE:-/srv/sando/testnot-refresh.paused}"

log() { echo "[$(date -u +%H:%M:%S)] $*"; }
ts_ssh() { tailscale ssh "$SSH_TARGET" "$@"; }

if [ -e "$PAUSE_FLAG" ]; then
    log "refresh paused ($PAUSE_FLAG present) — skipping to keep the soak stable"
    exit 0
fi

[ -r "$BACKUP" ] || { echo "backup not readable: $BACKUP" >&2; exit 1; }
log "backup: $BACKUP ($(stat -c %s "$BACKUP") bytes)"

log "stopping $SERVICE on testnot"
ts_ssh "systemctl stop $SERVICE"

# Drop every non-system schema (mirrors sandod reset_scratch — migrations create
# custom schemas like tower_sessions that survive DROP SCHEMA public CASCADE).
# Recreate public OWNED BY the app role: on PG15+ a postgres-owned public grants
# no CREATE to other roles, so boot migrations would fail with "no schema has
# been selected to create in" (same gotcha as the sando scratch DB).
log "resetting schema"
ts_ssh "sudo -u postgres psql -v ON_ERROR_STOP=1 -d $DB" <<SQL
DO \$\$
DECLARE s text;
BEGIN
    FOR s IN
        SELECT nspname FROM pg_namespace
        WHERE nspname NOT LIKE 'pg_%' AND nspname <> 'information_schema'
    LOOP
        EXECUTE format('DROP SCHEMA IF EXISTS %I CASCADE', s);
    END LOOP;
    EXECUTE 'CREATE SCHEMA public AUTHORIZATION $DB';
END \$\$;
SQL

log "restoring prod dump"
gunzip -c "$BACKUP" | ts_ssh "sudo -u postgres psql -q -v ON_ERROR_STOP=1 -d $DB" >/dev/null

log "starting $SERVICE (applies any newer migrations on boot)"
ts_ssh "systemctl start $SERVICE"

# Boot smoke: the app must come back healthy after migrating.
for i in $(seq 1 20); do
    code=$(ts_ssh "curl -s -o /dev/null -w '%{http_code}' http://127.0.0.1:8080/health" || echo 000)
    [ "$code" = "200" ] && { log "health OK"; exit 0; }
    sleep 3
done
echo "testnot did not return healthy after refresh" >&2
exit 1
