Skip to main content

max / makenotwork

sando+harness: pull-based rebuild, shared-role test DBs, gate hygiene Removes the cross-user permission "hot potato" that stalled the 0.10.1 deploy. - Pull-based /rebuild: sandod fetches [repo].upstream's deploy branch before worktree-ing the sha, with a clear "push first" error if still absent (git.rs fetch_upstream/sha_present, build.rs, topology.rs upstream field). - Test-DB isolation: the integration harness SET ROLE mnw_test so the template and per-test clones are owned by one shared role any member can drop; the template is also namespaced by effective role as a fallback. The stale- template DROP error is propagated instead of swallowed (no more 856-failure cascade from a foreign-owned template). - cargo_test gate pre-cleans test DBs owned by any role it belongs to. - bootstrap-sandod-host.sh provisions the mnw_test role + grants. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Author: Max Johnson <me@maxj.phd> · 2026-06-12 17:34 UTC
Commit: 511f5677277c02999068387d99a3badfbf099b12
Parent: 609ab6a
9 files changed, +235 insertions, -18 deletions
M .gitignore +4
@@ -50,3 +50,7 @@ CLAUDE.md
50 50 # Private working files — live in _private/, synced via Syncthing
51 51 todo.md
52 52 audit_review.md
53 +
54 + # sandod local state (regenerable)
55 + sando/daemon/sando.db
56 + sando/daemon/sando.db-*
@@ -37,6 +37,23 @@ pub async fn run(
37 37 ) -> Result<BuildArtifact> {
38 38 let worktree = cfg.workdir.join(sha.as_str());
39 39 let bare = PathBuf::from(&topo.repo.bare_path);
40 +
41 + // Pull-based ingestion: if an upstream remote is configured, fetch the
42 + // deploy branch so a just-pushed sha is locally resolvable. A fetch
43 + // failure is non-fatal — the sha may already be present from a prior
44 + // fetch or a direct push; the presence check below is the real gate.
45 + if let Some(upstream) = topo.repo.upstream.as_deref() {
46 + if let Err(e) = git::fetch_upstream(&bare, upstream, &topo.repo.branch).await {
47 + tracing::warn!(error = %e, upstream, "upstream fetch failed; proceeding with current bare-repo state");
48 + }
49 + }
50 + anyhow::ensure!(
51 + git::sha_present(&bare, sha.as_str()).await?,
52 + "sha {} not present in bare repo {} after fetch — push the commit to the upstream remote first",
53 + sha.as_str(),
54 + bare.display(),
55 + );
56 +
40 57 git::checkout_worktree(&bare, sha.as_str(), &worktree).await?;
41 58
42 59 let server_dir = worktree.join("server");
@@ -143,6 +143,12 @@ async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> {
143 143 .map(|(base, _)| base)
144 144 .unwrap_or(scratch_url);
145 145 cmd.env("TEST_DATABASE_URL", test_url);
146 + // Best-effort: drop our own role's stale `mnw_test_*` databases (the
147 + // template + any per-test clones orphaned by a previously-killed run)
148 + // before the suite, so they can't accumulate or collide. Foreign-owned
149 + // leftovers are left alone — the harness now namespaces its template
150 + // per role, so they no longer wedge the gate.
151 + clean_stale_test_dbs(scratch_url).await;
146 152 }
147 153 let started = std::time::Instant::now();
148 154 let log_path = gate_log_path(ctx, GateKind::CargoTest);
@@ -266,6 +272,49 @@ pub(crate) async fn reset_scratch(db_url: &str) -> Result<()> {
266 272 Ok(())
267 273 }
268 274
275 + /// Best-effort cleanup of stale test databases left behind by a
276 + /// previously-killed `cargo_test` run (the per-test `mnw_test_<uuid>` clones
277 + /// and the role's template). Only drops databases owned by the connecting
278 + /// role — a foreign-owned leftover can't be dropped without superuser anyway,
279 + /// and the harness now namespaces its template per role so one can't wedge us.
280 + /// Never returns an error: a cleanup miss must not turn a deploy red.
281 + async fn clean_stale_test_dbs(db_url: &str) {
282 + use sqlx::postgres::PgPoolOptions;
283 + use sqlx::Executor;
284 + let pool = match PgPoolOptions::new().max_connections(1).connect(db_url).await {
285 + Ok(p) => p,
286 + Err(e) => {
287 + tracing::warn!(error = %e, "stale test-db cleanup: could not connect; skipping");
288 + return;
289 + }
290 + };
291 + // Owned by us OR by a role we're a member of (the shared `mnw_test`
292 + // role): pg_has_role(.., 'USAGE') covers both, and membership is enough
293 + // privilege to DROP. Foreign-owned leftovers are left untouched.
294 + let names: Vec<(String,)> = sqlx::query_as(
295 + "SELECT datname FROM pg_database
296 + WHERE datname LIKE 'mnw_test_%'
297 + AND pg_catalog.pg_has_role(current_user, datdba, 'USAGE')",
298 + )
299 + .fetch_all(&pool)
300 + .await
301 + .unwrap_or_default();
302 + let count = names.len();
303 + for (name,) in names {
304 + // `name` comes straight from pg_database; quoting it is sufficient.
305 + if let Err(e) = pool
306 + .execute(format!("DROP DATABASE IF EXISTS \"{name}\" WITH (FORCE)").as_str())
307 + .await
308 + {
309 + tracing::warn!(error = %e, db = %name, "stale test-db cleanup: drop failed");
310 + }
311 + }
312 + if count > 0 {
313 + tracing::info!(count, "stale test-db cleanup: dropped leftover mnw_test_* databases");
314 + }
315 + pool.close().await;
316 + }
317 +
269 318 async fn restore_dump(db_url: &str, dump: &str, log_buf: &mut Vec<u8>) -> Result<()> {
270 319 // Two pipelines we accept:
271 320 // *.sql -> psql $url < dump
@@ -56,6 +56,42 @@ pub async fn resolve_ref(bare: &Path, refname: &str) -> Result<String> {
56 56 Ok(String::from_utf8(out.stdout)?.trim().to_string())
57 57 }
58 58
59 + /// Fetch the deploy branch from `upstream` into the bare repo so a
60 + /// freshly-pushed sha becomes locally resolvable before `checkout_worktree`.
61 + /// Force-updates `refs/heads/<branch>` (worktrees are `--detach`, so the
62 + /// branch is never checked out). Pull-based deploys: the operator pushes to
63 + /// the canonical remote, Sando fetches it here.
64 + pub async fn fetch_upstream(bare: &Path, upstream: &str, branch: &str) -> Result<()> {
65 + let out = Command::new("git")
66 + .arg("--git-dir")
67 + .arg(bare)
68 + .args(["fetch", "--quiet", upstream])
69 + .arg(format!("+refs/heads/{branch}:refs/heads/{branch}"))
70 + .output()
71 + .await
72 + .context("spawning git fetch")?;
73 + anyhow::ensure!(
74 + out.status.success(),
75 + "git fetch {upstream} {branch} failed: {}",
76 + String::from_utf8_lossy(&out.stderr),
77 + );
78 + Ok(())
79 + }
80 +
81 + /// True if `sha` resolves to a commit object in the bare repo. Used to give a
82 + /// clear "push first" error instead of a cryptic `git worktree add` failure.
83 + pub async fn sha_present(bare: &Path, sha: &str) -> Result<bool> {
84 + let out = Command::new("git")
85 + .arg("--git-dir")
86 + .arg(bare)
87 + .args(["cat-file", "-e"])
88 + .arg(format!("{sha}^{{commit}}"))
89 + .output()
90 + .await
91 + .context("spawning git cat-file")?;
92 + Ok(out.status.success())
93 + }
94 +
59 95 pub async fn checkout_worktree(bare: &Path, sha: &str, dest: &Path) -> Result<()> {
60 96 if dest.exists() {
61 97 // Pre-existing worktree (re-trigger of same sha). Idempotent — nothing to do.
@@ -611,7 +611,7 @@ mod tests {
611 611 /// without involving real ssh / postgres.
612 612 fn test_topo() -> Topology {
613 613 Topology {
614 - repo: RepoConfig { bare_path: "/tmp/test.git".into(), branch: "main".into() },
614 + repo: RepoConfig { bare_path: "/tmp/test.git".into(), branch: "main".into(), upstream: None },
615 615 backup: BackupConfig {
616 616 source: "file:///tmp/test-backup.sql".into(),
617 617 local_path: "/tmp/local-backup.sql".into(),
@@ -127,7 +127,7 @@ mod tests {
127 127
128 128 fn topo(tiers: Vec<Tier>) -> Topology {
129 129 Topology {
130 - repo: RepoConfig { bare_path: "/tmp/x".into(), branch: "main".into() },
130 + repo: RepoConfig { bare_path: "/tmp/x".into(), branch: "main".into(), upstream: None },
131 131 backup: BackupConfig {
132 132 source: "file:///tmp/b".into(),
133 133 local_path: "/tmp/b".into(),
@@ -15,6 +15,13 @@ pub struct Topology {
15 15 pub struct RepoConfig {
16 16 pub bare_path: String,
17 17 pub branch: String,
18 + /// Optional canonical git remote. When set, `/rebuild` fetches the deploy
19 + /// branch from here into the bare repo before worktree-ing the target sha,
20 + /// so a freshly-pushed commit is resolvable without anyone pushing to the
21 + /// bare repo directly (pull-based deploys). When unset, Sando relies on the
22 + /// bare repo already containing the sha (push-based / hook-driven).
23 + #[serde(default)]
24 + pub upstream: Option<String>,
18 25 }
19 26
20 27 #[derive(Debug, Clone, Serialize, Deserialize)]
@@ -30,12 +30,14 @@
30 30 # (bootstrap-node.sh on the target consumes $SANDO_PUBKEY)
31 31 # - Populate /etc/sando/sando.env with anything beyond SANDO_DAEMON if
32 32 # additional secrets are needed
33 - # - Push the MNW working tree to /srv/sando/mnw.git
34 - # - Fix `mnw_test_template` ownership — that template gets re-created by
35 - # each `cargo test` run; ownership resets when a *different* user runs
36 - # tests on the host. Out of scope for one-shot bootstrap; manage by
37 - # keeping the host single-user or by re-`ALTER TABLE ... OWNER TO sando`
38 - # before sandod's cargo_test gate.
33 + # - Push the MNW working tree to /srv/sando/mnw.git (only needed for
34 + # push-based hosts; pull-based hosts set [repo].upstream in sando.toml and
35 + # sandod fetches on /rebuild)
36 + #
37 + # Note: the old `mnw_test_template` ownership footgun (a template left owned by
38 + # a different login wedged the cargo_test gate) is handled by step 6b — the
39 + # harness `SET ROLE mnw_test` so all test DBs share one owner that every member
40 + # can drop. No more per-user ownership resets.
39 41
40 42 set -euo pipefail
41 43
@@ -56,6 +58,11 @@ BUILD_SANDOD="${BUILD_SANDOD:-1}"
56 58 # add/remove. Each entry is "name[:port]"; port defaults to 22.
57 59 SEED_HOSTS="${SEED_HOSTS:-testnot alpha-west-1:2200}"
58 60
61 + # Roles to add to the shared `mnw_test` role (owns all test databases). A pure
62 + # sando host needs only `sando`; a shared dev box should also list the human
63 + # role, e.g. TEST_DB_MEMBERS="sando max".
64 + TEST_DB_MEMBERS="${TEST_DB_MEMBERS:-$SANDO_USER}"
65 +
59 66 # Resolve the script's directory so it can copy sibling unit/config files
60 67 # without depending on cwd. Layout: `<SANDO_REPO>/deploy/this-script.sh`,
61 68 # so SANDO_REPO is one level up.
@@ -120,6 +127,33 @@ log "6/13 sando_scratch public schema owner = $SANDO_USER"
120 127 sudo -u postgres psql -v ON_ERROR_STOP=1 sando_scratch -c \
121 128 "ALTER SCHEMA public OWNER TO $SANDO_USER" >/dev/null
122 129
130 + log "6b/13 shared mnw_test role (owns all test DBs; members can drop them)"
131 + # The server integration harness (server/tests/harness/db.rs) does
132 + # `SET ROLE mnw_test` before creating its template + per-test clones, so they
133 + # are owned by this shared role no matter which login ran the suite. Every
134 + # member can then drop any `mnw_test_*` DB — no superuser, no cross-user
135 + # ownership wedge (the failure mode that stalled the 0.10.1 deploy).
136 + sudo -u postgres psql -v ON_ERROR_STOP=1 <<SQL
137 + DO \$\$
138 + BEGIN
139 + IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'mnw_test') THEN
140 + CREATE ROLE mnw_test NOLOGIN CREATEDB;
141 + ELSE
142 + ALTER ROLE mnw_test NOLOGIN CREATEDB;
143 + END IF;
144 + END
145 + \$\$;
146 + SQL
147 + for member in $TEST_DB_MEMBERS; do
148 + if sudo -u postgres psql -tAc \
149 + "SELECT 1 FROM pg_roles WHERE rolname = '$member'" | grep -q '^1$'; then
150 + sudo -u postgres psql -v ON_ERROR_STOP=1 \
151 + -c "GRANT mnw_test TO \"$member\"" >/dev/null
152 + else
153 + log " warn: role $member missing; skipping its mnw_test grant"
154 + fi
155 + done
156 +
123 157 log "7/13 sando ssh key (ed25519)"
124 158 install -d -o "$SANDO_USER" -g "$SANDO_USER" -m 0700 "$SANDO_HOME/.ssh"
125 159 if [[ ! -f "$SANDO_HOME/.ssh/id_ed25519" ]]; then
@@ -6,12 +6,48 @@
6 6
7 7 use sqlx::postgres::PgPoolOptions;
8 8 use sqlx::{Connection, Executor, PgConnection, PgPool};
9 - use std::sync::Once;
9 + use std::sync::{Once, OnceLock};
10 10 use std::time::Duration;
11 11 use uuid::Uuid;
12 12
13 - /// Name of the shared template database, created once per test suite run.
14 - const TEMPLATE_DB_NAME: &str = "mnw_test_template";
13 + /// Base name of the shared template database. The live name is suffixed with
14 + /// the connecting role (see `template_name`).
15 + const TEMPLATE_DB_BASE: &str = "mnw_test_template";
16 +
17 + /// The live template name for this run, namespaced by the connecting postgres
18 + /// role. Two suites run by different roles on a shared cluster (e.g. a
19 + /// developer's `max` and the Sando gate's `sando`) get distinct templates, so
20 + /// they never collide AND each role owns — and can therefore drop — its own.
21 + /// This is what prevents a stale, foreign-owned `mnw_test_template` from
22 + /// wedging a deploy's `cargo_test` gate. Set once by `ensure_template`.
23 + static TEMPLATE_NAME: OnceLock<String> = OnceLock::new();
24 +
25 + fn template_name() -> &'static str {
26 + TEMPLATE_NAME.get().map(String::as_str).unwrap_or(TEMPLATE_DB_BASE)
27 + }
28 +
29 + /// Reduce a postgres role to a safe identifier suffix (`[a-z0-9_]`).
30 + fn sanitize_role(role: &str) -> String {
31 + role.chars()
32 + .map(|c| if c.is_ascii_alphanumeric() || c == '_' { c.to_ascii_lowercase() } else { '_' })
33 + .collect()
34 + }
35 +
36 + /// Shared, deploy-stable role that owns all test databases when present. Both
37 + /// the interactive (`max`) and Sando-gate (`sando`) logins are members, so
38 + /// whoever runs the suite, every `mnw_test_*` DB ends up owned by this role and
39 + /// any member can drop it — no superuser, no cross-user ownership clashes.
40 + const SHARED_ROLE: &str = "mnw_test";
41 +
42 + /// Best-effort `SET ROLE mnw_test` so subsequent `CREATE DATABASE`s are owned
43 + /// by the shared role. Silently no-ops when the role is absent or the login
44 + /// isn't a member (e.g. a fresh dev box without bootstrap) — the per-role
45 + /// template namespacing is the fallback in that case.
46 + async fn assume_shared_role(conn: &mut PgConnection) {
47 + let _ = conn
48 + .execute(format!("SET ROLE \"{SHARED_ROLE}\"").as_str())
49 + .await;
50 + }
15 51
16 52 /// Ensures template creation runs exactly once, across all threads and runtimes.
17 53 static TEMPLATE_INIT: Once = Once::new();
@@ -38,19 +74,44 @@ fn ensure_template() {
38 74 .await
39 75 .expect("connect to admin DB for template setup");
40 76
41 - // Drop stale template if it exists (migrations may have changed)
42 - let _ = conn.execute(format!(
43 - "DROP DATABASE IF EXISTS \"{TEMPLATE_DB_NAME}\" WITH (FORCE)"
44 - ).as_str()).await;
77 + // Adopt the shared role so the template (and every clone) is owned
78 + // by `mnw_test`, droppable by any member. No-ops on hosts without
79 + // the role, falling back to per-role namespacing below.
80 + assume_shared_role(&mut conn).await;
81 +
82 + // Namespace the template by the *effective* role: with the shared
83 + // role assumed this collapses to one template for everyone; without
84 + // it, each login role gets its own so a developer's `max` run and
85 + // the Sando gate's `sando` run never collide on a shared cluster.
86 + let (role,): (String,) = sqlx::query_as("SELECT current_user")
87 + .fetch_one(&mut conn)
88 + .await
89 + .expect("query current_user for template namespacing");
90 + let template = format!("{TEMPLATE_DB_BASE}_{}", sanitize_role(&role));
91 + let _ = TEMPLATE_NAME.set(template.clone());
92 +
93 + // Drop stale template (migrations may have changed) before
94 + // recreating. We own it, so this succeeds — and we PROPAGATE the
95 + // error rather than swallowing it, so an ownership/lock failure
96 + // surfaces here instead of cascading into a CREATE "already exists"
97 + // that poisons this `Once` and fails the entire suite opaquely.
98 + conn.execute(format!(
99 + "DROP DATABASE IF EXISTS \"{template}\" WITH (FORCE)"
100 + ).as_str())
101 + .await
102 + .unwrap_or_else(|e| panic!(
103 + "drop stale template {template}: {e} \
104 + (if owned by a different role, drop it as the postgres superuser)"
105 + ));
45 106
46 107 conn.execute(format!(
47 - "CREATE DATABASE \"{TEMPLATE_DB_NAME}\""
108 + "CREATE DATABASE \"{template}\""
48 109 ).as_str())
49 110 .await
50 111 .expect("create template database");
51 112
52 113 // Connect to the template and run all migrations
53 - let tpl_url = replace_db_name(&admin, TEMPLATE_DB_NAME);
114 + let tpl_url = replace_db_name(&admin, &template);
54 115 let tpl_pool = PgPoolOptions::new()
55 116 .max_connections(2)
56 117 .acquire_timeout(Duration::from_secs(10))
@@ -109,10 +170,15 @@ impl TestDb {
109 170 .await
110 171 .expect("Failed to connect to admin database");
111 172
173 + // Own the clone via the shared role too, so cleanup (and any member)
174 + // can drop it. Matches the template's ownership.
175 + assume_shared_role(&mut admin_conn).await;
176 +
112 177 admin_conn
113 178 .execute(
114 179 format!(
115 - "CREATE DATABASE \"{db_name}\" TEMPLATE \"{TEMPLATE_DB_NAME}\""
180 + "CREATE DATABASE \"{db_name}\" TEMPLATE \"{}\"",
181 + template_name()
116 182 )
117 183 .as_str(),
118 184 )
@@ -167,6 +233,10 @@ impl Drop for TestDb {
167 233
168 234 rt.block_on(async {
169 235 if let Ok(mut conn) = PgConnection::connect(&admin_url).await {
236 + // Adopt the shared role so we can drop a clone it owns even
237 + // when our login role differs from the creator's.
238 + assume_shared_role(&mut conn).await;
239 +
170 240 let _ = conn
171 241 .execute(
172 242 format!(