max / makenotwork
9 files changed,
+235 insertions,
-18 deletions
| @@ -50,3 +50,7 @@ CLAUDE.md | |||
| 50 | 50 | # Private working files — live in _private/, synced via Syncthing | |
| 51 | 51 | todo.md | |
| 52 | 52 | audit_review.md | |
| 53 | + | ||
| 54 | + | # sandod local state (regenerable) | |
| 55 | + | sando/daemon/sando.db | |
| 56 | + | sando/daemon/sando.db-* |
| @@ -37,6 +37,23 @@ pub async fn run( | |||
| 37 | 37 | ) -> Result<BuildArtifact> { | |
| 38 | 38 | let worktree = cfg.workdir.join(sha.as_str()); | |
| 39 | 39 | let bare = PathBuf::from(&topo.repo.bare_path); | |
| 40 | + | ||
| 41 | + | // Pull-based ingestion: if an upstream remote is configured, fetch the | |
| 42 | + | // deploy branch so a just-pushed sha is locally resolvable. A fetch | |
| 43 | + | // failure is non-fatal — the sha may already be present from a prior | |
| 44 | + | // fetch or a direct push; the presence check below is the real gate. | |
| 45 | + | if let Some(upstream) = topo.repo.upstream.as_deref() { | |
| 46 | + | if let Err(e) = git::fetch_upstream(&bare, upstream, &topo.repo.branch).await { | |
| 47 | + | tracing::warn!(error = %e, upstream, "upstream fetch failed; proceeding with current bare-repo state"); | |
| 48 | + | } | |
| 49 | + | } | |
| 50 | + | anyhow::ensure!( | |
| 51 | + | git::sha_present(&bare, sha.as_str()).await?, | |
| 52 | + | "sha {} not present in bare repo {} after fetch — push the commit to the upstream remote first", | |
| 53 | + | sha.as_str(), | |
| 54 | + | bare.display(), | |
| 55 | + | ); | |
| 56 | + | ||
| 40 | 57 | git::checkout_worktree(&bare, sha.as_str(), &worktree).await?; | |
| 41 | 58 | ||
| 42 | 59 | let server_dir = worktree.join("server"); |
| @@ -143,6 +143,12 @@ async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> { | |||
| 143 | 143 | .map(|(base, _)| base) | |
| 144 | 144 | .unwrap_or(scratch_url); | |
| 145 | 145 | cmd.env("TEST_DATABASE_URL", test_url); | |
| 146 | + | // Best-effort: drop our own role's stale `mnw_test_*` databases (the | |
| 147 | + | // template + any per-test clones orphaned by a previously-killed run) | |
| 148 | + | // before the suite, so they can't accumulate or collide. Foreign-owned | |
| 149 | + | // leftovers are left alone — the harness now namespaces its template | |
| 150 | + | // per role, so they no longer wedge the gate. | |
| 151 | + | clean_stale_test_dbs(scratch_url).await; | |
| 146 | 152 | } | |
| 147 | 153 | let started = std::time::Instant::now(); | |
| 148 | 154 | let log_path = gate_log_path(ctx, GateKind::CargoTest); | |
| @@ -266,6 +272,49 @@ pub(crate) async fn reset_scratch(db_url: &str) -> Result<()> { | |||
| 266 | 272 | Ok(()) | |
| 267 | 273 | } | |
| 268 | 274 | ||
| 275 | + | /// Best-effort cleanup of stale test databases left behind by a | |
| 276 | + | /// previously-killed `cargo_test` run (the per-test `mnw_test_<uuid>` clones | |
| 277 | + | /// and the role's template). Only drops databases owned by the connecting | |
| 278 | + | /// role — a foreign-owned leftover can't be dropped without superuser anyway, | |
| 279 | + | /// and the harness now namespaces its template per role so one can't wedge us. | |
| 280 | + | /// Never returns an error: a cleanup miss must not turn a deploy red. | |
| 281 | + | async fn clean_stale_test_dbs(db_url: &str) { | |
| 282 | + | use sqlx::postgres::PgPoolOptions; | |
| 283 | + | use sqlx::Executor; | |
| 284 | + | let pool = match PgPoolOptions::new().max_connections(1).connect(db_url).await { | |
| 285 | + | Ok(p) => p, | |
| 286 | + | Err(e) => { | |
| 287 | + | tracing::warn!(error = %e, "stale test-db cleanup: could not connect; skipping"); | |
| 288 | + | return; | |
| 289 | + | } | |
| 290 | + | }; | |
| 291 | + | // Owned by us OR by a role we're a member of (the shared `mnw_test` | |
| 292 | + | // role): pg_has_role(.., 'USAGE') covers both, and membership is enough | |
| 293 | + | // privilege to DROP. Foreign-owned leftovers are left untouched. | |
| 294 | + | let names: Vec<(String,)> = sqlx::query_as( | |
| 295 | + | "SELECT datname FROM pg_database | |
| 296 | + | WHERE datname LIKE 'mnw_test_%' | |
| 297 | + | AND pg_catalog.pg_has_role(current_user, datdba, 'USAGE')", | |
| 298 | + | ) | |
| 299 | + | .fetch_all(&pool) | |
| 300 | + | .await | |
| 301 | + | .unwrap_or_default(); | |
| 302 | + | let count = names.len(); | |
| 303 | + | for (name,) in names { | |
| 304 | + | // `name` comes straight from pg_database; quoting it is sufficient. | |
| 305 | + | if let Err(e) = pool | |
| 306 | + | .execute(format!("DROP DATABASE IF EXISTS \"{name}\" WITH (FORCE)").as_str()) | |
| 307 | + | .await | |
| 308 | + | { | |
| 309 | + | tracing::warn!(error = %e, db = %name, "stale test-db cleanup: drop failed"); | |
| 310 | + | } | |
| 311 | + | } | |
| 312 | + | if count > 0 { | |
| 313 | + | tracing::info!(count, "stale test-db cleanup: dropped leftover mnw_test_* databases"); | |
| 314 | + | } | |
| 315 | + | pool.close().await; | |
| 316 | + | } | |
| 317 | + | ||
| 269 | 318 | async fn restore_dump(db_url: &str, dump: &str, log_buf: &mut Vec<u8>) -> Result<()> { | |
| 270 | 319 | // Two pipelines we accept: | |
| 271 | 320 | // *.sql -> psql $url < dump |
| @@ -56,6 +56,42 @@ pub async fn resolve_ref(bare: &Path, refname: &str) -> Result<String> { | |||
| 56 | 56 | Ok(String::from_utf8(out.stdout)?.trim().to_string()) | |
| 57 | 57 | } | |
| 58 | 58 | ||
| 59 | + | /// Fetch the deploy branch from `upstream` into the bare repo so a | |
| 60 | + | /// freshly-pushed sha becomes locally resolvable before `checkout_worktree`. | |
| 61 | + | /// Force-updates `refs/heads/<branch>` (worktrees are `--detach`, so the | |
| 62 | + | /// branch is never checked out). Pull-based deploys: the operator pushes to | |
| 63 | + | /// the canonical remote, Sando fetches it here. | |
| 64 | + | pub async fn fetch_upstream(bare: &Path, upstream: &str, branch: &str) -> Result<()> { | |
| 65 | + | let out = Command::new("git") | |
| 66 | + | .arg("--git-dir") | |
| 67 | + | .arg(bare) | |
| 68 | + | .args(["fetch", "--quiet", upstream]) | |
| 69 | + | .arg(format!("+refs/heads/{branch}:refs/heads/{branch}")) | |
| 70 | + | .output() | |
| 71 | + | .await | |
| 72 | + | .context("spawning git fetch")?; | |
| 73 | + | anyhow::ensure!( | |
| 74 | + | out.status.success(), | |
| 75 | + | "git fetch {upstream} {branch} failed: {}", | |
| 76 | + | String::from_utf8_lossy(&out.stderr), | |
| 77 | + | ); | |
| 78 | + | Ok(()) | |
| 79 | + | } | |
| 80 | + | ||
| 81 | + | /// True if `sha` resolves to a commit object in the bare repo. Used to give a | |
| 82 | + | /// clear "push first" error instead of a cryptic `git worktree add` failure. | |
| 83 | + | pub async fn sha_present(bare: &Path, sha: &str) -> Result<bool> { | |
| 84 | + | let out = Command::new("git") | |
| 85 | + | .arg("--git-dir") | |
| 86 | + | .arg(bare) | |
| 87 | + | .args(["cat-file", "-e"]) | |
| 88 | + | .arg(format!("{sha}^{{commit}}")) | |
| 89 | + | .output() | |
| 90 | + | .await | |
| 91 | + | .context("spawning git cat-file")?; | |
| 92 | + | Ok(out.status.success()) | |
| 93 | + | } | |
| 94 | + | ||
| 59 | 95 | pub async fn checkout_worktree(bare: &Path, sha: &str, dest: &Path) -> Result<()> { | |
| 60 | 96 | if dest.exists() { | |
| 61 | 97 | // Pre-existing worktree (re-trigger of same sha). Idempotent — nothing to do. |
| @@ -611,7 +611,7 @@ mod tests { | |||
| 611 | 611 | /// without involving real ssh / postgres. | |
| 612 | 612 | fn test_topo() -> Topology { | |
| 613 | 613 | Topology { | |
| 614 | - | repo: RepoConfig { bare_path: "/tmp/test.git".into(), branch: "main".into() }, | |
| 614 | + | repo: RepoConfig { bare_path: "/tmp/test.git".into(), branch: "main".into(), upstream: None }, | |
| 615 | 615 | backup: BackupConfig { | |
| 616 | 616 | source: "file:///tmp/test-backup.sql".into(), | |
| 617 | 617 | local_path: "/tmp/local-backup.sql".into(), |
| @@ -127,7 +127,7 @@ mod tests { | |||
| 127 | 127 | ||
| 128 | 128 | fn topo(tiers: Vec<Tier>) -> Topology { | |
| 129 | 129 | Topology { | |
| 130 | - | repo: RepoConfig { bare_path: "/tmp/x".into(), branch: "main".into() }, | |
| 130 | + | repo: RepoConfig { bare_path: "/tmp/x".into(), branch: "main".into(), upstream: None }, | |
| 131 | 131 | backup: BackupConfig { | |
| 132 | 132 | source: "file:///tmp/b".into(), | |
| 133 | 133 | local_path: "/tmp/b".into(), |
| @@ -15,6 +15,13 @@ pub struct Topology { | |||
| 15 | 15 | pub struct RepoConfig { | |
| 16 | 16 | pub bare_path: String, | |
| 17 | 17 | pub branch: String, | |
| 18 | + | /// Optional canonical git remote. When set, `/rebuild` fetches the deploy | |
| 19 | + | /// branch from here into the bare repo before worktree-ing the target sha, | |
| 20 | + | /// so a freshly-pushed commit is resolvable without anyone pushing to the | |
| 21 | + | /// bare repo directly (pull-based deploys). When unset, Sando relies on the | |
| 22 | + | /// bare repo already containing the sha (push-based / hook-driven). | |
| 23 | + | #[serde(default)] | |
| 24 | + | pub upstream: Option<String>, | |
| 18 | 25 | } | |
| 19 | 26 | ||
| 20 | 27 | #[derive(Debug, Clone, Serialize, Deserialize)] |
| @@ -30,12 +30,14 @@ | |||
| 30 | 30 | # (bootstrap-node.sh on the target consumes $SANDO_PUBKEY) | |
| 31 | 31 | # - Populate /etc/sando/sando.env with anything beyond SANDO_DAEMON if | |
| 32 | 32 | # additional secrets are needed | |
| 33 | - | # - Push the MNW working tree to /srv/sando/mnw.git | |
| 34 | - | # - Fix `mnw_test_template` ownership — that template gets re-created by | |
| 35 | - | # each `cargo test` run; ownership resets when a *different* user runs | |
| 36 | - | # tests on the host. Out of scope for one-shot bootstrap; manage by | |
| 37 | - | # keeping the host single-user or by re-`ALTER TABLE ... OWNER TO sando` | |
| 38 | - | # before sandod's cargo_test gate. | |
| 33 | + | # - Push the MNW working tree to /srv/sando/mnw.git (only needed for | |
| 34 | + | # push-based hosts; pull-based hosts set [repo].upstream in sando.toml and | |
| 35 | + | # sandod fetches on /rebuild) | |
| 36 | + | # | |
| 37 | + | # Note: the old `mnw_test_template` ownership footgun (a template left owned by | |
| 38 | + | # a different login wedged the cargo_test gate) is handled by step 6b — the | |
| 39 | + | # harness `SET ROLE mnw_test` so all test DBs share one owner that every member | |
| 40 | + | # can drop. No more per-user ownership resets. | |
| 39 | 41 | ||
| 40 | 42 | set -euo pipefail | |
| 41 | 43 | ||
| @@ -56,6 +58,11 @@ BUILD_SANDOD="${BUILD_SANDOD:-1}" | |||
| 56 | 58 | # add/remove. Each entry is "name[:port]"; port defaults to 22. | |
| 57 | 59 | SEED_HOSTS="${SEED_HOSTS:-testnot alpha-west-1:2200}" | |
| 58 | 60 | ||
| 61 | + | # Roles to add to the shared `mnw_test` role (owns all test databases). A pure | |
| 62 | + | # sando host needs only `sando`; a shared dev box should also list the human | |
| 63 | + | # role, e.g. TEST_DB_MEMBERS="sando max". | |
| 64 | + | TEST_DB_MEMBERS="${TEST_DB_MEMBERS:-$SANDO_USER}" | |
| 65 | + | ||
| 59 | 66 | # Resolve the script's directory so it can copy sibling unit/config files | |
| 60 | 67 | # without depending on cwd. Layout: `<SANDO_REPO>/deploy/this-script.sh`, | |
| 61 | 68 | # so SANDO_REPO is one level up. | |
| @@ -120,6 +127,33 @@ log "6/13 sando_scratch public schema owner = $SANDO_USER" | |||
| 120 | 127 | sudo -u postgres psql -v ON_ERROR_STOP=1 sando_scratch -c \ | |
| 121 | 128 | "ALTER SCHEMA public OWNER TO $SANDO_USER" >/dev/null | |
| 122 | 129 | ||
| 130 | + | log "6b/13 shared mnw_test role (owns all test DBs; members can drop them)" | |
| 131 | + | # The server integration harness (server/tests/harness/db.rs) does | |
| 132 | + | # `SET ROLE mnw_test` before creating its template + per-test clones, so they | |
| 133 | + | # are owned by this shared role no matter which login ran the suite. Every | |
| 134 | + | # member can then drop any `mnw_test_*` DB — no superuser, no cross-user | |
| 135 | + | # ownership wedge (the failure mode that stalled the 0.10.1 deploy). | |
| 136 | + | sudo -u postgres psql -v ON_ERROR_STOP=1 <<SQL | |
| 137 | + | DO \$\$ | |
| 138 | + | BEGIN | |
| 139 | + | IF NOT EXISTS (SELECT 1 FROM pg_roles WHERE rolname = 'mnw_test') THEN | |
| 140 | + | CREATE ROLE mnw_test NOLOGIN CREATEDB; | |
| 141 | + | ELSE | |
| 142 | + | ALTER ROLE mnw_test NOLOGIN CREATEDB; | |
| 143 | + | END IF; | |
| 144 | + | END | |
| 145 | + | \$\$; | |
| 146 | + | SQL | |
| 147 | + | for member in $TEST_DB_MEMBERS; do | |
| 148 | + | if sudo -u postgres psql -tAc \ | |
| 149 | + | "SELECT 1 FROM pg_roles WHERE rolname = '$member'" | grep -q '^1$'; then | |
| 150 | + | sudo -u postgres psql -v ON_ERROR_STOP=1 \ | |
| 151 | + | -c "GRANT mnw_test TO \"$member\"" >/dev/null | |
| 152 | + | else | |
| 153 | + | log " warn: role $member missing; skipping its mnw_test grant" | |
| 154 | + | fi | |
| 155 | + | done | |
| 156 | + | ||
| 123 | 157 | log "7/13 sando ssh key (ed25519)" | |
| 124 | 158 | install -d -o "$SANDO_USER" -g "$SANDO_USER" -m 0700 "$SANDO_HOME/.ssh" | |
| 125 | 159 | if [[ ! -f "$SANDO_HOME/.ssh/id_ed25519" ]]; then |
| @@ -6,12 +6,48 @@ | |||
| 6 | 6 | ||
| 7 | 7 | use sqlx::postgres::PgPoolOptions; | |
| 8 | 8 | use sqlx::{Connection, Executor, PgConnection, PgPool}; | |
| 9 | - | use std::sync::Once; | |
| 9 | + | use std::sync::{Once, OnceLock}; | |
| 10 | 10 | use std::time::Duration; | |
| 11 | 11 | use uuid::Uuid; | |
| 12 | 12 | ||
| 13 | - | /// Name of the shared template database, created once per test suite run. | |
| 14 | - | const TEMPLATE_DB_NAME: &str = "mnw_test_template"; | |
| 13 | + | /// Base name of the shared template database. The live name is suffixed with | |
| 14 | + | /// the connecting role (see `template_name`). | |
| 15 | + | const TEMPLATE_DB_BASE: &str = "mnw_test_template"; | |
| 16 | + | ||
| 17 | + | /// The live template name for this run, namespaced by the connecting postgres | |
| 18 | + | /// role. Two suites run by different roles on a shared cluster (e.g. a | |
| 19 | + | /// developer's `max` and the Sando gate's `sando`) get distinct templates, so | |
| 20 | + | /// they never collide AND each role owns — and can therefore drop — its own. | |
| 21 | + | /// This is what prevents a stale, foreign-owned `mnw_test_template` from | |
| 22 | + | /// wedging a deploy's `cargo_test` gate. Set once by `ensure_template`. | |
| 23 | + | static TEMPLATE_NAME: OnceLock<String> = OnceLock::new(); | |
| 24 | + | ||
| 25 | + | fn template_name() -> &'static str { | |
| 26 | + | TEMPLATE_NAME.get().map(String::as_str).unwrap_or(TEMPLATE_DB_BASE) | |
| 27 | + | } | |
| 28 | + | ||
| 29 | + | /// Reduce a postgres role to a safe identifier suffix (`[a-z0-9_]`). | |
| 30 | + | fn sanitize_role(role: &str) -> String { | |
| 31 | + | role.chars() | |
| 32 | + | .map(|c| if c.is_ascii_alphanumeric() || c == '_' { c.to_ascii_lowercase() } else { '_' }) | |
| 33 | + | .collect() | |
| 34 | + | } | |
| 35 | + | ||
| 36 | + | /// Shared, deploy-stable role that owns all test databases when present. Both | |
| 37 | + | /// the interactive (`max`) and Sando-gate (`sando`) logins are members, so | |
| 38 | + | /// whoever runs the suite, every `mnw_test_*` DB ends up owned by this role and | |
| 39 | + | /// any member can drop it — no superuser, no cross-user ownership clashes. | |
| 40 | + | const SHARED_ROLE: &str = "mnw_test"; | |
| 41 | + | ||
| 42 | + | /// Best-effort `SET ROLE mnw_test` so subsequent `CREATE DATABASE`s are owned | |
| 43 | + | /// by the shared role. Silently no-ops when the role is absent or the login | |
| 44 | + | /// isn't a member (e.g. a fresh dev box without bootstrap) — the per-role | |
| 45 | + | /// template namespacing is the fallback in that case. | |
| 46 | + | async fn assume_shared_role(conn: &mut PgConnection) { | |
| 47 | + | let _ = conn | |
| 48 | + | .execute(format!("SET ROLE \"{SHARED_ROLE}\"").as_str()) | |
| 49 | + | .await; | |
| 50 | + | } | |
| 15 | 51 | ||
| 16 | 52 | /// Ensures template creation runs exactly once, across all threads and runtimes. | |
| 17 | 53 | static TEMPLATE_INIT: Once = Once::new(); | |
| @@ -38,19 +74,44 @@ fn ensure_template() { | |||
| 38 | 74 | .await | |
| 39 | 75 | .expect("connect to admin DB for template setup"); | |
| 40 | 76 | ||
| 41 | - | // Drop stale template if it exists (migrations may have changed) | |
| 42 | - | let _ = conn.execute(format!( | |
| 43 | - | "DROP DATABASE IF EXISTS \"{TEMPLATE_DB_NAME}\" WITH (FORCE)" | |
| 44 | - | ).as_str()).await; | |
| 77 | + | // Adopt the shared role so the template (and every clone) is owned | |
| 78 | + | // by `mnw_test`, droppable by any member. No-ops on hosts without | |
| 79 | + | // the role, falling back to per-role namespacing below. | |
| 80 | + | assume_shared_role(&mut conn).await; | |
| 81 | + | ||
| 82 | + | // Namespace the template by the *effective* role: with the shared | |
| 83 | + | // role assumed this collapses to one template for everyone; without | |
| 84 | + | // it, each login role gets its own so a developer's `max` run and | |
| 85 | + | // the Sando gate's `sando` run never collide on a shared cluster. | |
| 86 | + | let (role,): (String,) = sqlx::query_as("SELECT current_user") | |
| 87 | + | .fetch_one(&mut conn) | |
| 88 | + | .await | |
| 89 | + | .expect("query current_user for template namespacing"); | |
| 90 | + | let template = format!("{TEMPLATE_DB_BASE}_{}", sanitize_role(&role)); | |
| 91 | + | let _ = TEMPLATE_NAME.set(template.clone()); | |
| 92 | + | ||
| 93 | + | // Drop stale template (migrations may have changed) before | |
| 94 | + | // recreating. We own it, so this succeeds — and we PROPAGATE the | |
| 95 | + | // error rather than swallowing it, so an ownership/lock failure | |
| 96 | + | // surfaces here instead of cascading into a CREATE "already exists" | |
| 97 | + | // that poisons this `Once` and fails the entire suite opaquely. | |
| 98 | + | conn.execute(format!( | |
| 99 | + | "DROP DATABASE IF EXISTS \"{template}\" WITH (FORCE)" | |
| 100 | + | ).as_str()) | |
| 101 | + | .await | |
| 102 | + | .unwrap_or_else(|e| panic!( | |
| 103 | + | "drop stale template {template}: {e} \ | |
| 104 | + | (if owned by a different role, drop it as the postgres superuser)" | |
| 105 | + | )); | |
| 45 | 106 | ||
| 46 | 107 | conn.execute(format!( | |
| 47 | - | "CREATE DATABASE \"{TEMPLATE_DB_NAME}\"" | |
| 108 | + | "CREATE DATABASE \"{template}\"" | |
| 48 | 109 | ).as_str()) | |
| 49 | 110 | .await | |
| 50 | 111 | .expect("create template database"); | |
| 51 | 112 | ||
| 52 | 113 | // Connect to the template and run all migrations | |
| 53 | - | let tpl_url = replace_db_name(&admin, TEMPLATE_DB_NAME); | |
| 114 | + | let tpl_url = replace_db_name(&admin, &template); | |
| 54 | 115 | let tpl_pool = PgPoolOptions::new() | |
| 55 | 116 | .max_connections(2) | |
| 56 | 117 | .acquire_timeout(Duration::from_secs(10)) | |
| @@ -109,10 +170,15 @@ impl TestDb { | |||
| 109 | 170 | .await | |
| 110 | 171 | .expect("Failed to connect to admin database"); | |
| 111 | 172 | ||
| 173 | + | // Own the clone via the shared role too, so cleanup (and any member) | |
| 174 | + | // can drop it. Matches the template's ownership. | |
| 175 | + | assume_shared_role(&mut admin_conn).await; | |
| 176 | + | ||
| 112 | 177 | admin_conn | |
| 113 | 178 | .execute( | |
| 114 | 179 | format!( | |
| 115 | - | "CREATE DATABASE \"{db_name}\" TEMPLATE \"{TEMPLATE_DB_NAME}\"" | |
| 180 | + | "CREATE DATABASE \"{db_name}\" TEMPLATE \"{}\"", | |
| 181 | + | template_name() | |
| 116 | 182 | ) | |
| 117 | 183 | .as_str(), | |
| 118 | 184 | ) | |
| @@ -167,6 +233,10 @@ impl Drop for TestDb { | |||
| 167 | 233 | ||
| 168 | 234 | rt.block_on(async { | |
| 169 | 235 | if let Ok(mut conn) = PgConnection::connect(&admin_url).await { | |
| 236 | + | // Adopt the shared role so we can drop a clone it owns even | |
| 237 | + | // when our login role differs from the creator's. | |
| 238 | + | assume_shared_role(&mut conn).await; | |
| 239 | + | ||
| 170 | 240 | let _ = conn | |
| 171 | 241 | .execute( | |
| 172 | 242 | format!( |