Skip to main content

max / makenotwork

3.2 KB · 78 lines History Blame Raw
1 #!/usr/bin/env bash
2 # Refresh the testnot.work staging mirror from the latest production backup.
3 #
4 # testnot is a read-only mirror of prod, gated app-side to Fan+/creator accounts
5 # (ACCESS_GATE). This job reloads its database from the prod backup that
6 # sandod-backup-fetch already pulls to fw13, so the mirror tracks live ~daily.
7 #
8 # Runs on fw13 (the Sando host, where the backup lives and which has tailnet
9 # root on testnot via Tailscale SSH). The restore runs as the postgres
10 # superuser on testnot — streamed over Tailscale SSH — so extension/owner lines
11 # in the dump apply cleanly without granting the app role superuser. The app
12 # applies any newer migrations on the next boot (MNW migrates on startup), so a
13 # prod dump a few migrations behind the deployed binary self-heals on restart.
14 #
15 # Idempotent and safe to re-run: it stops the app, resets the schema, restores,
16 # and starts the app. testnot holds no durable state of its own (it's a mirror),
17 # so a wiped/refreshed DB each run is the intended behavior.
# Strict mode: abort on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Name of the unit that is stopped and started on the mirror host.
SERVICE="makenotwork.service"

# Each setting below takes the matching TESTNOT_* environment variable when it
# is set and non-empty, and falls back to the default otherwise.
DB="${TESTNOT_DB:-makenotwork}"
SSH_TARGET="${TESTNOT_SSH:-root@testnot}"
BACKUP="${TESTNOT_BACKUP:-/srv/sando/backups/latest.sql.gz}"

# Soak-pause flag: as long as this file exists the refresh does nothing, so the
# data and feature state of a multi-day soak stay put. Pause/resume:
#   touch /srv/sando/testnot-refresh.paused   # pause
#   rm /srv/sando/testnot-refresh.paused      # resume
PAUSE_FLAG="${TESTNOT_REFRESH_PAUSE:-/srv/sando/testnot-refresh.paused}"
29
# Print one progress line to stdout, prefixed with the current UTC time as
# [HH:MM:SS]. All arguments are joined into a single message via "$*" (i.e.
# separated by the first character of IFS, a space by default).
log() {
  printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*"
}
# Run a command on the mirror host over Tailscale SSH.
# Globals:   SSH_TARGET (read) - destination handed to `tailscale ssh`
# Arguments: the remote command line, forwarded unchanged
# Returns:   the exit status of `tailscale ssh`
# Nothing is redirected here, so callers can feed data in on stdin.
ts_ssh() {
  tailscale ssh "${SSH_TARGET}" "$@"
}
32
# Soak guard: when the pause flag exists, log why and finish successfully
# without touching the mirror.
if [[ -e "${PAUSE_FLAG}" ]]; then
  log "refresh paused ($PAUSE_FLAG present) — skipping to keep the soak stable"
  exit 0
fi
37
# Preflight: validate the dump BEFORE anything destructive happens. Besides
# being readable, the archive must pass a gzip integrity check (gzip -t, which
# also rejects empty and truncated files). Without it, a partially fetched
# backup is only discovered mid-restore, after the app has been stopped and
# every schema dropped, leaving the mirror down with a wiped database.
[ -r "$BACKUP" ] || { echo "backup not readable: $BACKUP" >&2; exit 1; }
gzip -t -- "$BACKUP" \
  || { echo "backup is not a valid gzip archive: $BACKUP" >&2; exit 1; }
log "backup: $BACKUP ($(stat -c %s "$BACKUP") bytes)"
40
# Stop the app on the mirror before its database is reset and reloaded.
log "stopping $SERVICE on testnot"
ts_ssh "systemctl stop ${SERVICE}"
43
# Drop every non-system schema (mirrors sandod reset_scratch — migrations create
# custom schemas like tower_sessions that survive DROP SCHEMA public CASCADE).
# Recreate public OWNED BY the app role: on PG15+ a postgres-owned public grants
# no CREATE to other roles, so boot migrations would fail with "no schema has
# been selected to create in" (same gotcha as the sando scratch DB).
#
# System schemas are excluded with an anchored regex rather than LIKE 'pg_%':
# in LIKE, "_" matches ANY single character, so user schemas such as "pgboss"
# or "pgmq" would be treated as system schemas, survive the reset, and then
# collide with the restore. '^pg_' matches only the literal reserved prefix.
#
# The heredoc delimiter is unquoted on purpose: $DB is expanded locally (the
# app role is taken to have the same name as the database) and \$\$ reaches
# psql as the $$ dollar-quote.
log "resetting schema"
ts_ssh "sudo -u postgres psql -v ON_ERROR_STOP=1 -d $DB" <<SQL
DO \$\$
DECLARE s text;
BEGIN
  FOR s IN
    SELECT nspname FROM pg_namespace
    WHERE nspname !~ '^pg_' AND nspname <> 'information_schema'
  LOOP
    EXECUTE format('DROP SCHEMA IF EXISTS %I CASCADE', s);
  END LOOP;
  EXECUTE 'CREATE SCHEMA public AUTHORIZATION $DB';
END \$\$;
SQL
63
# Decompress the dump locally and stream it into psql on the mirror, running as
# the postgres OS user. ON_ERROR_STOP makes psql abort on the first failing
# statement; psql's stdout is discarded.
log "restoring prod dump"
gunzip -c "$BACKUP" \
  | ts_ssh "sudo -u postgres psql -q -v ON_ERROR_STOP=1 -d $DB" >/dev/null
66
# Bring the app back up on the mirror now that the database is reloaded.
log "starting $SERVICE (applies any newer migrations on boot)"
ts_ssh "systemctl start ${SERVICE}"
69
# Boot smoke: the app must come back healthy after migrating. Probe the health
# endpoint from the mirror itself up to 20 times, 3 s apart; exit 0 on the
# first HTTP 200, exit 1 if none of the attempts gets one.
for ((attempt = 1; attempt <= 20; attempt++)); do
  # --max-time bounds each probe so an app that accepts the connection but
  # never answers cannot hang the job. If the probe or the SSH hop fails, the
  # fallback 000 is assigned outside the substitution; echoing it inside would
  # append it to the "000" curl already printed and yield "000000".
  code=$(ts_ssh "curl -s --max-time 10 -o /dev/null -w '%{http_code}' http://127.0.0.1:8080/health") \
    || code=000
  if [[ "$code" == "200" ]]; then
    log "health OK"
    exit 0
  fi
  sleep 3
done
echo "testnot did not return healthy after refresh" >&2
exit 1
78