max / makenotwork

3.2 KB · 78 lines History Blame Raw

#!/usr/bin/env bash
# Refresh the testnot.work staging mirror from the latest production backup.
#
# testnot is a read-only mirror of prod, gated app-side to Fan+/creator accounts
# (ACCESS_GATE). This job reloads its database from the prod backup that
# sandod-backup-fetch already pulls to fw13, so the mirror tracks live ~daily.
#
# Runs on fw13 (the Sando host, where the backup lives and which has tailnet
# root on testnot via Tailscale SSH). The restore runs as the postgres
# superuser on testnot — streamed over Tailscale SSH — so extension/owner lines
# in the dump apply cleanly without granting the app role superuser. The app
# applies any newer migrations on the next boot (MNW migrates on startup), so a
# prod dump a few migrations behind the deployed binary self-heals on restart.
#
# Idempotent and safe to re-run: it stops the app, resets the schema, restores,
# and starts the app. testnot holds no durable state of its own (it's a mirror),
# so a wiped/refreshed DB each run is the intended behavior.
# Strict mode: abort on errors, on unset variables, and on a failure in any
# stage of a pipeline.
set -euo pipefail

# Settings. The ${TESTNOT_*} environment variables override the defaults:
#   TESTNOT_BACKUP         gzipped SQL dump to restore
#   TESTNOT_SSH            Tailscale SSH target (user@host) of the mirror
#   TESTNOT_DB             database that is reset and reloaded
#   TESTNOT_REFRESH_PAUSE  soak-pause flag file (see below)
BACKUP="${TESTNOT_BACKUP:-/srv/sando/backups/latest.sql.gz}"
SSH_TARGET="${TESTNOT_SSH:-root@testnot}"
DB="${TESTNOT_DB:-makenotwork}"
SERVICE="makenotwork.service"
# Soak-pause: while this flag file exists, the refresh is a no-op so a
# multi-day soak's data + feature state stay stable. Pause/resume:
#   touch /srv/sando/testnot-refresh.paused   # pause
#   rm /srv/sando/testnot-refresh.paused      # resume
PAUSE_FLAG="${TESTNOT_REFRESH_PAUSE:-/srv/sando/testnot-refresh.paused}"
readonly BACKUP SSH_TARGET DB SERVICE PAUSE_FLAG

# Print a progress message on stdout, prefixed with the current UTC time.
# Arguments: $* - message words, joined with single spaces
# Outputs:   "[HH:MM:SS] message" followed by a newline
log() { printf '[%s] %s\n' "$(date -u +%H:%M:%S)" "$*"; }
# Run a command on the mirror host through `tailscale ssh`.
# Globals:   SSH_TARGET (read) - target passed to `tailscale ssh`
# Arguments: $@ - forwarded unchanged after the target
# stdin/stdout are not redirected here, so callers can stream data (e.g. SQL)
# into the remote command and capture its output.
ts_ssh() {
  tailscale ssh "$SSH_TARGET" "$@"
}

# Soak-pause guard: an existing flag file turns the whole run into a
# successful no-op.
if [[ -e "$PAUSE_FLAG" ]]; then
  log "refresh paused ($PAUSE_FLAG present) — skipping to keep the soak stable"
  exit 0
fi

# Validate the dump before anything destructive happens: every later step
# (stopping the app, dropping the schemas) assumes a usable backup.
if [[ ! -r "$BACKUP" ]]; then
  echo "backup not readable: $BACKUP" >&2
  exit 1
fi
# A truncated or corrupt download would otherwise only surface mid-restore,
# after the mirror's database has already been wiped.
if ! gzip -t -- "$BACKUP"; then
  echo "backup is not a valid gzip file: $BACKUP" >&2
  exit 1
fi
log "backup: $BACKUP ($(stat -c %s "$BACKUP") bytes)"

# Stop the app before its database is reset (presumably so it holds no
# connections to the schemas being dropped — confirm if this ordering changes).
log "stopping $SERVICE on testnot"
ts_ssh "systemctl stop $SERVICE"

# Role that owns the recreated public schema. Defaults to the database name
# (app role and database sharing one name is what this script has always
# assumed); set TESTNOT_DB_ROLE when the two differ.
DB_ROLE="${TESTNOT_DB_ROLE:-$DB}"

# Emit the SQL that empties the database.
#
# Drops every non-system schema (mirrors sandod reset_scratch — migrations
# create custom schemas like tower_sessions that survive DROP SCHEMA public
# CASCADE). System schemas are matched with the regex '^pg_' rather than
# LIKE 'pg_%': in LIKE the underscore is a single-character wildcard, so a
# user schema such as "pgboss" would be skipped and survive the reset.
#
# Recreates public OWNED BY the app role: on PG15+ a postgres-owned public
# grants no CREATE to other roles, so boot migrations would fail with "no
# schema has been selected to create in" (same gotcha as the sando scratch DB).
#
# Arguments: $1 - role that will own the new public schema
# Outputs:   the SQL script on stdout
reset_schema_sql() {
  local owner=$1
  # Unquoted heredoc on purpose: $owner must expand; \$\$ yields a literal $$.
  cat <<SQL
DO \$\$
DECLARE s text;
BEGIN
  FOR s IN
    SELECT nspname FROM pg_namespace
    WHERE nspname !~ '^pg_' AND nspname <> 'information_schema'
  LOOP
    EXECUTE format('DROP SCHEMA IF EXISTS %I CASCADE', s);
  END LOOP;
  EXECUTE format('CREATE SCHEMA public AUTHORIZATION %I', '$owner');
END \$\$;
SQL
}

log "resetting schema"
reset_schema_sql "$DB_ROLE" \
  | ts_ssh "sudo -u postgres psql -v ON_ERROR_STOP=1 -d $DB"

# Decompress locally and stream the SQL into psql on the mirror, run as the
# postgres superuser. ON_ERROR_STOP makes psql abort on the first failing
# statement; psql's stdout is discarded (-q already trims most of it).
log "restoring prod dump"
gunzip -c "$BACKUP" \
  | ts_ssh "sudo -u postgres psql -q -v ON_ERROR_STOP=1 -d $DB" >/dev/null

# Bring the app back up on the freshly restored database.
log "starting $SERVICE (applies any newer migrations on boot)"
ts_ssh "systemctl start $SERVICE"

# Boot smoke: the app must come back healthy after migrating.
# Polls /health on the mirror itself, up to 20 times, 3 s apart. Exits 0 on the
# first HTTP 200, or 1 if none of the attempts returns 200.
# --max-time bounds each probe so an app that accepts the connection but never
# answers cannot stall the job.
health_cmd="curl -s --max-time 5 -o /dev/null -w '%{http_code}' http://127.0.0.1:8080/health"
for ((attempt = 1; attempt <= 20; attempt++)); do
  # On failure curl already prints 000 via -w, so assign the fallback instead
  # of echoing a second value into the captured output.
  code=$(ts_ssh "$health_cmd") || code=000
  if [[ "$code" == "200" ]]; then
    log "health OK"
    exit 0
  fi
  sleep 3
done
echo "testnot did not return healthy after refresh" >&2
exit 1