max / makenotwork
16 files changed,
+1078 insertions,
-80 deletions
| @@ -54,3 +54,6 @@ audit_review.md | |||
| 54 | 54 | # sandod local state (regenerable) | |
| 55 | 55 | sando/daemon/sando.db | |
| 56 | 56 | sando/daemon/sando.db-* | |
| 57 | + | sando/daemon/work/ | |
| 58 | + | sando/daemon/releases/ | |
| 59 | + | sando/daemon/cargo-target/ |
| @@ -102,8 +102,10 @@ curl -X POST http://127.0.0.1:7766/promote/a \ | |||
| 102 | 102 | ||
| 103 | 103 | | Method | Path | Body | Purpose | | |
| 104 | 104 | |--------|------|------|---------| | |
| 105 | - | | GET | `/state` | — | Tier list + current/previous version + last gate outcomes | | |
| 106 | - | | POST | `/rebuild` | `{sha?: string}` | Force a build; if `sha` is absent, resolves the configured deploy branch. Aborts any in-flight build (latest wins). | | |
| 105 | + | | GET | `/state` | — | Tier list + current/previous version + last gate outcomes, plus `build` (latest build run: phase/result/failure_summary/elapsed_s, `null` until first `/rebuild`) so a poller sees in-flight/failed builds, not a frozen version | | |
| 106 | + | | POST | `/rebuild` | `{sha?: string}` | Force a build; if `sha` is absent, resolves the configured deploy branch. Aborts any in-flight build (latest wins). Returns `{accepted, sha, run_id}`. | | |
| 107 | + | | GET | `/runs/{id}` | — | Build status of the run that a `/rebuild` returned: `{run_id, sha, version, phase, result, failure_summary, gates[], started_at, finished_at}`. The pollable resource for a non-TUI driver — `/state`'s version fields only reflect the last *successful* version. | |
| 108 | + | | GET | `/runs/{id}/wait` | `?timeout_ms=` | Long-poll: blocks until the run settles or `timeout_ms` (default 30s, cap 120s) elapses, then returns the same `RunView`. Fire `/rebuild` → block on `/wait`. | | |
| 107 | 109 | | POST | `/promote/{tier}` | `{version?, hotfix?, reset_burn_in?}` | Verify predecessor gates, deploy to tier nodes, advance state. `version` defaults to the predecessor tier's `current_version`. | | |
| 108 | 110 | | POST | `/rollback/{tier}` | — | Swap `current` symlink to `previous_version` on every node in the tier | | |
| 109 | 111 | | POST | `/confirm/{tier}` | — | Insert a passing `manual_confirm` gate row for the tier's `current_version`. Replaces hand-SQL. | |
| @@ -0,0 +1,26 @@ | |||
| 1 | + | -- One row per `/rebuild` invocation: tracks a build attempt through its | |
| 2 | + | -- lifecycle so a non-TUI caller can poll `GET /runs/{id}` for build status | |
| 3 | + | -- instead of inferring it from `/state` — which only ever reflects the last | |
| 4 | + | -- *successful* version and so reports stale-green for the whole duration of a | |
| 5 | + | -- failing build (the 0.10.2 headless-deploy incident). | |
| 6 | + | -- | |
| 7 | + | -- `version` is denormalized (no FK): a run may fail before a `versions` row | |
| 8 | + | -- exists (fetch/checkout/compile error) and we still want its failure | |
| 9 | + | -- recorded. `result` is the terminal verdict — 'building' until the pipeline | |
| 10 | + | -- settles, then 'passed' | 'failed' | 'aborted'. `phase` is the in-flight | |
| 11 | + | -- sub-state ('queued' | 'fetching' | 'compiling' | 'staging' | 'gating' | | |
| 12 | + | -- 'done'). Terminal transitions are guarded on `result = 'building'` so the | |
| 13 | + | -- first writer (a build-step error, a gate failure, or the task-level catch) | |
| 14 | + | -- wins and later writes are no-ops. | |
| 15 | + | CREATE TABLE build_runs ( | |
| 16 | + | id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| 17 | + | sha TEXT NOT NULL, | |
| 18 | + | version TEXT, | |
| 19 | + | phase TEXT NOT NULL DEFAULT 'queued', | |
| 20 | + | result TEXT NOT NULL DEFAULT 'building', | |
| 21 | + | failure_summary TEXT, | |
| 22 | + | started_at TEXT NOT NULL, | |
| 23 | + | finished_at TEXT | |
| 24 | + | ); | |
| 25 | + | ||
| 26 | + | CREATE INDEX build_runs_by_sha ON build_runs(sha); |
| @@ -4,5 +4,9 @@ db_path = "./sando.db" | |||
| 4 | 4 | topology_path = "../sando.toml" | |
| 5 | 5 | workdir = "./work" | |
| 6 | 6 | release_root = "./releases" | |
| 7 | + | # Shared cargo target dir across per-sha worktrees — incremental rebuilds reuse | |
| 8 | + | # the previous sha's compiled deps instead of clean-compiling each fresh | |
| 9 | + | # worktree. Safe because builds are serialized. Omit for per-worktree target/. | |
| 10 | + | cargo_target_dir = "./cargo-target" | |
| 7 | 11 | # Dropped and recreated on every migration_dry_run. Leave unset to skip. | |
| 8 | 12 | scratch_db_url = "postgres://sando@127.0.0.1/sando_scratch" |
| @@ -6,7 +6,7 @@ | |||
| 6 | 6 | ||
| 7 | 7 | use crate::config::Config; | |
| 8 | 8 | use crate::deploy; | |
| 9 | - | use crate::domain::{GitSha, TierId, Version}; | |
| 9 | + | use crate::domain::{GitSha, RunId, TierId, Version}; | |
| 10 | 10 | use crate::gates::{self, GateCtx}; | |
| 11 | 11 | use crate::git; | |
| 12 | 12 | use crate::topology::Topology; | |
| @@ -34,10 +34,13 @@ pub async fn run( | |||
| 34 | 34 | topo: Arc<Topology>, | |
| 35 | 35 | sha: GitSha, | |
| 36 | 36 | events: crate::events::EventTx, | |
| 37 | + | run_id: RunId, | |
| 37 | 38 | ) -> Result<BuildArtifact> { | |
| 38 | 39 | let worktree = cfg.workdir.join(sha.as_str()); | |
| 39 | 40 | let bare = PathBuf::from(&topo.repo.bare_path); | |
| 40 | 41 | ||
| 42 | + | crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Fetching).await.ok(); | |
| 43 | + | ||
| 41 | 44 | // Pull-based ingestion: if an upstream remote is configured, fetch the | |
| 42 | 45 | // deploy branch so a just-pushed sha is locally resolvable. A fetch | |
| 43 | 46 | // failure is non-fatal — the sha may already be present from a prior | |
| @@ -59,6 +62,7 @@ pub async fn run( | |||
| 59 | 62 | let server_dir = worktree.join("server"); | |
| 60 | 63 | let version = read_pkg_version(&server_dir.join("Cargo.toml")).await | |
| 61 | 64 | .with_context(|| format!("reading version from {}/Cargo.toml", server_dir.display()))?; | |
| 65 | + | crate::runs::set_version(&pool, run_id, &version).await.ok(); | |
| 62 | 66 | ||
| 63 | 67 | // sqlx compile-time query checking needs a live DB with the current schema. | |
| 64 | 68 | // We point cargo at the scratch DB and prep it (drop public, re-migrate) | |
| @@ -70,6 +74,12 @@ pub async fn run( | |||
| 70 | 74 | .arg("--release") | |
| 71 | 75 | .current_dir(&server_dir) | |
| 72 | 76 | .kill_on_drop(true); | |
| 77 | + | // Shared build cache across per-sha worktrees: reuse one target dir so an | |
| 78 | + | // incremental diff doesn't clean-compile from scratch. Serialized builds | |
| 79 | + | // make this contention-free. Unset → cargo's default per-worktree target/. | |
| 80 | + | if let Some(target) = cfg.cargo_target_dir.as_deref() { | |
| 81 | + | cargo_cmd.env("CARGO_TARGET_DIR", target); | |
| 82 | + | } | |
| 73 | 83 | if let Some(scratch_url) = cfg.scratch_db_url.as_deref() { | |
| 74 | 84 | tracing::info!(sha = %sha.as_str(), "preparing scratch DB schema for sqlx compile-time checks"); | |
| 75 | 85 | crate::gates::reset_scratch(scratch_url).await | |
| @@ -81,6 +91,7 @@ pub async fn run( | |||
| 81 | 91 | tracing::warn!("scratch_db_url unset; sqlx will fall back to offline mode and may fail"); | |
| 82 | 92 | } | |
| 83 | 93 | ||
| 94 | + | crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Compiling).await.ok(); | |
| 84 | 95 | tracing::info!(sha = %sha, version = %version, dir = %server_dir.display(), "cargo build --release start"); | |
| 85 | 96 | crate::events::emit(&events, crate::events::Event::BuildStart { | |
| 86 | 97 | sha: sha.clone(), version: version.clone(), | |
| @@ -96,19 +107,24 @@ pub async fn run( | |||
| 96 | 107 | crate::events::emit(&events, crate::events::Event::BuildFailed { | |
| 97 | 108 | sha: sha.clone(), version: version.clone(), elapsed_s, | |
| 98 | 109 | }); | |
| 99 | - | } else { | |
| 100 | - | tracing::info!(sha = %sha, version = %version, elapsed_s, "cargo build --release ok"); | |
| 101 | - | crate::events::emit(&events, crate::events::Event::BuildOk { | |
| 102 | - | sha: sha.clone(), version: version.clone(), elapsed_s, | |
| 103 | - | }); | |
| 110 | + | // Settle the run with the headline compiler diagnostic (not the raw | |
| 111 | + | // 4 KB tail) so `GET /runs/{id}` answers "why" without a journald dive. | |
| 112 | + | let summary = crate::classify::classify_compile_error(&out.stdout, &out.stderr).summary(); | |
| 113 | + | crate::runs::mark_failed(&pool, run_id, &summary).await.ok(); | |
| 114 | + | anyhow::bail!("cargo build --release failed:\n{}", tail(&out.stderr, 4_000)); | |
| 104 | 115 | } | |
| 105 | - | anyhow::ensure!( | |
| 106 | - | out.status.success(), | |
| 107 | - | "cargo build --release failed:\n{}", | |
| 108 | - | tail(&out.stderr, 4_000), | |
| 109 | - | ); | |
| 116 | + | tracing::info!(sha = %sha, version = %version, elapsed_s, "cargo build --release ok"); | |
| 117 | + | crate::events::emit(&events, crate::events::Event::BuildOk { | |
| 118 | + | sha: sha.clone(), version: version.clone(), elapsed_s, | |
| 119 | + | }); | |
| 110 | 120 | ||
| 111 | - | let release_dir = server_dir.join("target/release"); | |
| 121 | + | // Binaries land under `<target>/release/`; with a shared target dir that's | |
| 122 | + | // not inside the worktree, so resolve it the same way cargo did above. | |
| 123 | + | let release_dir = cfg | |
| 124 | + | .cargo_target_dir | |
| 125 | + | .as_deref() | |
| 126 | + | .map(|t| t.join("release")) | |
| 127 | + | .unwrap_or_else(|| server_dir.join("target/release")); | |
| 112 | 128 | let mut binary_paths = Vec::with_capacity(cfg.bin_names.len()); | |
| 113 | 129 | for name in &cfg.bin_names { | |
| 114 | 130 | let p = release_dir.join(name); | |
| @@ -144,8 +160,11 @@ pub async fn build_and_run_host( | |||
| 144 | 160 | topo: Arc<Topology>, | |
| 145 | 161 | sha: GitSha, | |
| 146 | 162 | events: crate::events::EventTx, | |
| 163 | + | run_id: RunId, | |
| 147 | 164 | ) -> Result<()> { | |
| 148 | - | let art = run(pool.clone(), cfg.clone(), topo.clone(), sha, events.clone()).await?; | |
| 165 | + | let art = run(pool.clone(), cfg.clone(), topo.clone(), sha, events.clone(), run_id).await?; | |
| 166 | + | ||
| 167 | + | crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Staging).await.ok(); | |
| 149 | 168 | ||
| 150 | 169 | // Stage the binary in the host's release_root so future gates and the | |
| 151 | 170 | // host self-deploy point at a stable path, not the worktree's target/. | |
| @@ -170,6 +189,7 @@ pub async fn build_and_run_host( | |||
| 170 | 189 | let host = topo.tiers.iter().find(|t| t.name.as_str() == "host") | |
| 171 | 190 | .context("topology has no `host` tier")?; | |
| 172 | 191 | ||
| 192 | + | crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Gating).await.ok(); | |
| 173 | 193 | let ctx = GateCtx { | |
| 174 | 194 | pool: pool.clone(), | |
| 175 | 195 | cfg: cfg.clone(), | |
| @@ -194,8 +214,15 @@ pub async fn build_and_run_host( | |||
| 194 | 214 | .bind(Utc::now().to_rfc3339()) | |
| 195 | 215 | .execute(&pool) | |
| 196 | 216 | .await?; | |
| 217 | + | crate::runs::mark_passed(&pool, run_id).await.ok(); | |
| 197 | 218 | tracing::info!(version = %art.version, "host pipeline green; ready to promote to next tier"); | |
| 198 | 219 | } else { | |
| 220 | + | // Pull the first red gate's typed summary into the run so the API | |
| 221 | + | // answers "which gate, and why" — not just "failed". | |
| 222 | + | let summary = crate::runs::first_failed_gate_summary(&pool, &art.version) | |
| 223 | + | .await | |
| 224 | + | .unwrap_or_else(|| "host pipeline red".to_string()); | |
| 225 | + | crate::runs::mark_failed(&pool, run_id, &summary).await.ok(); | |
| 199 | 226 | tracing::warn!(version = %art.version, "host pipeline red; not advancing tier_state"); | |
| 200 | 227 | } | |
| 201 | 228 | Ok(()) |
| @@ -63,7 +63,99 @@ pub fn classify_cargo_test(stdout: &[u8], stderr: &[u8]) -> GateFailure { | |||
| 63 | 63 | }; | |
| 64 | 64 | } | |
| 65 | 65 | ||
| 66 | - | GateFailure::CargoTest { failed_count, first_failed } | |
| 66 | + | let first_panic = extract_first_panic(&stdout_s); | |
| 67 | + | GateFailure::CargoTest { failed_count, first_failed, first_panic } | |
| 68 | + | } | |
| 69 | + | ||
| 70 | + | /// Pull the first *root-cause* panic message out of libtest's captured | |
| 71 | + | /// output. libtest (Rust 1.73+) prints each captured panic as: | |
| 72 | + | /// thread '<test>' panicked at <file>:<line>:<col>: | |
| 73 | + | /// <message> | |
| 74 | + | /// We return the first panic's message — but skip "...poisoned" messages in | |
| 75 | + | /// favour of the first non-poison one, because a single real panic in shared | |
| 76 | + | /// setup (a `std::sync::Once`) poisons it and makes every *other* test report | |
| 77 | + | /// "Once instance has previously been poisoned". The root cause is the one | |
| 78 | + | /// panic that isn't a poison report. Falls back to the first panic of any | |
| 79 | + | /// kind if every message looks like poison. | |
| 80 | + | fn extract_first_panic(stdout: &str) -> Option<String> { | |
| 81 | + | let mut first: Option<String> = None; | |
| 82 | + | let mut lines = stdout.lines(); | |
| 83 | + | while let Some(line) = lines.next() { | |
| 84 | + | if !line.contains("panicked at ") { | |
| 85 | + | continue; | |
| 86 | + | } | |
| 87 | + | // The message is the first non-empty line after the `panicked at` location line. | |
| 88 | + | let msg = lines.by_ref().map(str::trim).find(|l| !l.is_empty()); | |
| 89 | + | let Some(msg) = msg else { continue }; | |
| 90 | + | if first.is_none() { | |
| 91 | + | first = Some(msg.to_string()); | |
| 92 | + | } | |
| 93 | + | let is_poison = msg.contains("poisoned") || msg.contains("PoisonError"); | |
| 94 | + | if !is_poison { | |
| 95 | + | return Some(msg.to_string()); | |
| 96 | + | } | |
| 97 | + | } | |
| 98 | + | first | |
| 99 | + | } | |
| 100 | + | ||
| 101 | + | /// Classify a failed cargo compile — `cargo test --no-run` (the fast | |
| 102 | + | /// pre-gate compile) or `cargo build --release`: pull the first | |
| 103 | + | /// compiler diagnostic out of cargo's stderr so a compile break (e.g. a | |
| 104 | + | /// missing struct field in a `#[cfg(test)]`-only target) surfaces as the | |
| 105 | + | /// actual `error[E0063]: missing field ...` line, rather than as an | |
| 106 | + | /// opaque "N tests failed" after a full build and a partial test run. | |
| 107 | + | /// | |
| 108 | + | /// Cargo writes diagnostics to stderr. We prefer the first coded | |
| 109 | + | /// `error[Ennnn]: ...` headline over the trailing `error: could not | |
| 110 | + | /// compile <crate> ... due to N previous errors` summary, which names | |
| 111 | + | /// the crate but not the cause; the summary still gives us the count. | |
| 112 | + | pub fn classify_compile_error(stdout: &[u8], stderr: &[u8]) -> GateFailure { | |
| 113 | + | let stderr_s = String::from_utf8_lossy(stderr); | |
| 114 | + | let mut first_error: Option<String> = None; | |
| 115 | + | let mut error_count: u32 = 0; | |
| 116 | + | ||
| 117 | + | for line in stderr_s.lines() { | |
| 118 | + | let t = line.trim_start(); | |
| 119 | + | if first_error.is_none() && t.starts_with("error[") { | |
| 120 | + | first_error = Some(t.to_string()); | |
| 121 | + | } | |
| 122 | + | if let Some(rest) = t.strip_prefix("error: could not compile") | |
| 123 | + | && let Some(n) = parse_due_to_count(rest) | |
| 124 | + | { | |
| 125 | + | error_count = n; | |
| 126 | + | } | |
| 127 | + | } | |
| 128 | + | ||
| 129 | + | // No coded diagnostic (e.g. a macro or resolver error prints a bare | |
| 130 | + | // `error: ...`). Take the first such line that isn't cargo's own | |
| 131 | + | // summary/abort noise. | |
| 132 | + | if first_error.is_none() { | |
| 133 | + | for line in stderr_s.lines() { | |
| 134 | + | let t = line.trim_start(); | |
| 135 | + | if t.starts_with("error:") | |
| 136 | + | && !t.starts_with("error: could not compile") | |
| 137 | + | && !t.starts_with("error: aborting") | |
| 138 | + | { | |
| 139 | + | first_error = Some(t.to_string()); | |
| 140 | + | break; | |
| 141 | + | } | |
| 142 | + | } | |
| 143 | + | } | |
| 144 | + | ||
| 145 | + | if first_error.is_none() && error_count == 0 { | |
| 146 | + | // Didn't look like a compile failure — don't masquerade as one. | |
| 147 | + | return GateFailure::Unclassified { | |
| 148 | + | legacy_detail: Some(combined_tail_for_classifier(stdout, stderr)), | |
| 149 | + | }; | |
| 150 | + | } | |
| 151 | + | GateFailure::CompileError { error_count, first_error } | |
| 152 | + | } | |
| 153 | + | ||
| 154 | + | /// Parse the count out of `... due to N previous error(s)`. | |
| 155 | + | fn parse_due_to_count(s: &str) -> Option<u32> { | |
| 156 | + | let idx = s.find("due to ")?; | |
| 157 | + | let digits: String = s[idx + 7..].chars().take_while(|c| c.is_ascii_digit()).collect(); | |
| 158 | + | digits.parse().ok() | |
| 67 | 159 | } | |
| 68 | 160 | ||
| 69 | 161 | /// `migration_dry_run` is staged: scratch reset → restore dump → run | |
| @@ -217,11 +309,61 @@ failures:\n\ | |||
| 217 | 309 | foo::baz\n\ | |
| 218 | 310 | \n\ | |
| 219 | 311 | test result: FAILED. 10 passed; 2 failed; 0 ignored\n"; | |
| 220 | - | let GateFailure::CargoTest { failed_count, first_failed } = | |
| 312 | + | let GateFailure::CargoTest { failed_count, first_failed, first_panic } = | |
| 221 | 313 | classify_cargo_test(stdout, b"") | |
| 222 | 314 | else { panic!("expected CargoTest variant"); }; | |
| 223 | 315 | assert_eq!(failed_count, 2); | |
| 224 | 316 | assert_eq!(first_failed.as_deref(), Some("foo::bar")); | |
| 317 | + | // No `panicked at` lines in this fixture. | |
| 318 | + | assert_eq!(first_panic, None); | |
| 319 | + | } | |
| 320 | + | ||
| 321 | + | #[test] | |
| 322 | + | fn cargo_test_sees_through_poison_cascade_to_root_panic() { | |
| 323 | + | // The shape that produced the opaque "856 failed": one real panic in | |
| 324 | + | // shared setup poisons a `Once`, and every other test then reports the | |
| 325 | + | // poison. The classifier must surface the real cause, not the poison. | |
| 326 | + | let stdout = b"failures:\n\n\ | |
| 327 | + | ---- harness::a stdout ----\n\ | |
| 328 | + | thread 'harness::a' panicked at tests/harness/db.rs:42:9:\n\ | |
| 329 | + | Once instance has previously been poisoned\n\ | |
| 330 | + | \n\ | |
| 331 | + | ---- harness::root stdout ----\n\ | |
| 332 | + | thread 'harness::root' panicked at tests/harness/db.rs:30:5:\n\ | |
| 333 | + | template database \"mnw_test_template\" does not exist\n\ | |
| 334 | + | \n\ | |
| 335 | + | failures:\n\ | |
| 336 | + | harness::a\n\ | |
| 337 | + | harness::root\n\ | |
| 338 | + | \n\ | |
| 339 | + | test result: FAILED. 0 passed; 856 failed; 0 ignored\n"; | |
| 340 | + | let GateFailure::CargoTest { failed_count, first_panic, .. } = | |
| 341 | + | classify_cargo_test(stdout, b"") | |
| 342 | + | else { panic!("expected CargoTest variant"); }; | |
| 343 | + | assert_eq!(failed_count, 856); | |
| 344 | + | assert_eq!( | |
| 345 | + | first_panic.as_deref(), | |
| 346 | + | Some("template database \"mnw_test_template\" does not exist"), | |
| 347 | + | "must skip the poison message for the root cause", | |
| 348 | + | ); | |
| 349 | + | } | |
| 350 | + | ||
| 351 | + | #[test] | |
| 352 | + | fn cargo_test_panic_falls_back_when_all_poison() { | |
| 353 | + | // If every panic is a poison report, return the first one rather than | |
| 354 | + | // nothing — better than an opaque count. | |
| 355 | + | let stdout = b"failures:\n\n\ | |
| 356 | + | ---- harness::a stdout ----\n\ | |
| 357 | + | thread 'harness::a' panicked at x.rs:1:1:\n\ | |
| 358 | + | Once instance has previously been poisoned\n\ | |
| 359 | + | \n\ | |
| 360 | + | failures:\n harness::a\n\ | |
| 361 | + | \n\ | |
| 362 | + | test result: FAILED. 0 passed; 3 failed; 0 ignored\n"; | |
| 363 | + | let GateFailure::CargoTest { first_panic, .. } = | |
| 364 | + | classify_cargo_test(stdout, b"") | |
| 365 | + | else { panic!("expected CargoTest variant"); }; | |
| 366 | + | assert_eq!(first_panic.as_deref(), Some("Once instance has previously been poisoned")); | |
| 225 | 367 | } | |
| 226 | 368 | ||
| 227 | 369 | #[test] | |
| @@ -238,6 +380,49 @@ test result: FAILED. 10 passed; 2 failed; 0 ignored\n"; | |||
| 238 | 380 | } | |
| 239 | 381 | ||
| 240 | 382 | #[test] | |
| 383 | + | fn compile_error_extracts_first_coded_diagnostic_and_count() { | |
| 384 | + | // Real `cargo test --no-run` shape: the headline diagnostic, then | |
| 385 | + | // the trailing summary that carries the count. | |
| 386 | + | let stderr = b" Compiling makenotwork v0.10.2\n\ | |
| 387 | + | error[E0063]: missing field `user_pages_host` in initializer of `Config`\n \ | |
| 388 | + | --> src/config.rs:412:21\n\ | |
| 389 | + | error: could not compile `makenotwork` (lib test) due to 1 previous error\n"; | |
| 390 | + | let GateFailure::CompileError { error_count, first_error } = | |
| 391 | + | classify_compile_error(b"", stderr) | |
| 392 | + | else { panic!("expected CompileError variant"); }; | |
| 393 | + | assert_eq!(error_count, 1); | |
| 394 | + | assert_eq!( | |
| 395 | + | first_error.as_deref(), | |
| 396 | + | Some("error[E0063]: missing field `user_pages_host` in initializer of `Config`"), | |
| 397 | + | ); | |
| 398 | + | } | |
| 399 | + | ||
| 400 | + | #[test] | |
| 401 | + | fn compile_error_falls_back_to_bare_error_line() { | |
| 402 | + | // A macro/resolver error has no `error[Ennnn]` code; we still want | |
| 403 | + | // the first real `error:` line, not the cargo summary. | |
| 404 | + | let stderr = b"error: cannot find macro `foo` in this scope\n\ | |
| 405 | + | error: could not compile `makenotwork` (lib test) due to 2 previous errors\n"; | |
| 406 | + | let GateFailure::CompileError { error_count, first_error } = | |
| 407 | + | classify_compile_error(b"", stderr) | |
| 408 | + | else { panic!("expected CompileError variant"); }; | |
| 409 | + | assert_eq!(error_count, 2); | |
| 410 | + | assert_eq!(first_error.as_deref(), Some("error: cannot find macro `foo` in this scope")); | |
| 411 | + | } | |
| 412 | + | ||
| 413 | + | #[test] | |
| 414 | + | fn compile_error_unclassified_when_not_a_compile_failure() { | |
| 415 | + | // No `error[...]`, no `could not compile` — hand back the tail. | |
| 416 | + | let f = classify_compile_error(b"", b"warning: unused import\n"); | |
| 417 | + | match f { | |
| 418 | + | GateFailure::Unclassified { legacy_detail: Some(d) } => { | |
| 419 | + | assert!(d.contains("unused import")); | |
| 420 | + | } | |
| 421 | + | other => panic!("expected Unclassified, got {other:?}"), | |
| 422 | + | } | |
| 423 | + | } | |
| 424 | + | ||
| 425 | + | #[test] | |
| 241 | 426 | fn migration_drift_extracts_name() { | |
| 242 | 427 | let err = "migration 0047_widgets was previously applied but is missing in the resolved migrations"; | |
| 243 | 428 | let f = classify_migration_error(err, None); |
| @@ -26,6 +26,16 @@ pub struct Config { | |||
| 26 | 26 | /// Served via `GET /logs/{version}/{gate}`. Defaults to `/srv/sando/logs`. | |
| 27 | 27 | #[serde(default = "default_logs_root")] | |
| 28 | 28 | pub logs_root: PathBuf, | |
| 29 | + | /// Shared cargo target dir. When set, every `cargo build`/`cargo test` the | |
| 30 | + | /// pipeline runs uses this one `CARGO_TARGET_DIR` instead of each per-sha | |
| 31 | + | /// worktree's own `target/`, so a 1-line diff reuses the previous sha's | |
| 32 | + | /// compiled dependencies (a ~10-min clean build becomes a 1–2-min | |
| 33 | + | /// incremental one). Safe because builds are serialized — a new `/rebuild` | |
| 34 | + | /// aborts the in-flight one — so no two cargo invocations ever share the | |
| 35 | + | /// dir concurrently. Unset = per-worktree `target/` (the historical | |
| 36 | + | /// behavior). Cargo creates the dir if absent. | |
| 37 | + | #[serde(default)] | |
| 38 | + | pub cargo_target_dir: Option<PathBuf>, | |
| 29 | 39 | /// Non-binary contents to stage into each release dir alongside | |
| 30 | 40 | /// `bin_names`. Each entry copies `worktree/<src>` into | |
| 31 | 41 | /// `<release>/<dst>`. `required=false` makes a missing source a warn | |
| @@ -54,6 +64,32 @@ pub struct ReleaseEntry { | |||
| 54 | 64 | fn default_bin_names() -> Vec<String> { vec!["server".into()] } | |
| 55 | 65 | fn default_logs_root() -> PathBuf { PathBuf::from("/srv/sando/logs") } | |
| 56 | 66 | ||
| 67 | + | #[cfg(test)] | |
| 68 | + | mod tests { | |
| 69 | + | use super::*; | |
| 70 | + | ||
| 71 | + | const MINIMAL: &str = r#" | |
| 72 | + | listen = "127.0.0.1:7766" | |
| 73 | + | db_path = "./sando.db" | |
| 74 | + | topology_path = "../sando.toml" | |
| 75 | + | workdir = "./work" | |
| 76 | + | release_root = "./releases" | |
| 77 | + | "#; | |
| 78 | + | ||
| 79 | + | #[test] | |
| 80 | + | fn cargo_target_dir_parses_when_present() { | |
| 81 | + | let raw = format!("{MINIMAL}\ncargo_target_dir = \"/srv/sando/cargo-target\"\n"); | |
| 82 | + | let cfg: Config = toml::from_str(&raw).unwrap(); | |
| 83 | + | assert_eq!(cfg.cargo_target_dir.as_deref(), Some(std::path::Path::new("/srv/sando/cargo-target"))); | |
| 84 | + | } | |
| 85 | + | ||
| 86 | + | #[test] | |
| 87 | + | fn cargo_target_dir_defaults_to_none() { | |
| 88 | + | let cfg: Config = toml::from_str(MINIMAL).unwrap(); | |
| 89 | + | assert!(cfg.cargo_target_dir.is_none(), "omitting it keeps the per-worktree target/"); | |
| 90 | + | } | |
| 91 | + | } | |
| 92 | + | ||
| 57 | 93 | impl Config { | |
| 58 | 94 | /// Primary binary — the one the systemd unit's ExecStart points at. | |
| 59 | 95 | pub fn primary_bin(&self) -> &str { | |
| @@ -79,6 +115,7 @@ impl Config { | |||
| 79 | 115 | bin_names: vec!["server".into()], | |
| 80 | 116 | logs_root: PathBuf::from("/tmp/sando-test-logs"), | |
| 81 | 117 | release_contents: Vec::new(), | |
| 118 | + | cargo_target_dir: None, | |
| 82 | 119 | } | |
| 83 | 120 | } | |
| 84 | 121 | } |
| @@ -327,6 +327,18 @@ impl fmt::Display for DeployId { | |||
| 327 | 327 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } | |
| 328 | 328 | } | |
| 329 | 329 | ||
| 330 | + | /// Primary key of `build_runs` — the resource a `/rebuild` returns and a | |
| 331 | + | /// non-TUI driver polls via `GET /runs/{id}`. Distinct from `GateRunId` | |
| 332 | + | /// (one build run drives many gate runs). | |
| 333 | + | #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, sqlx::Type)] | |
| 334 | + | #[sqlx(transparent)] | |
| 335 | + | #[serde(transparent)] | |
| 336 | + | pub struct RunId(pub i64); | |
| 337 | + | ||
| 338 | + | impl fmt::Display for RunId { | |
| 339 | + | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } | |
| 340 | + | } | |
| 341 | + | ||
| 330 | 342 | #[cfg(test)] | |
| 331 | 343 | mod tests { | |
| 332 | 344 | use super::*; |
| @@ -118,42 +118,46 @@ pub async fn run_all(ctx: &GateCtx, gates: &[Gate]) -> Result<bool> { | |||
| 118 | 118 | ||
| 119 | 119 | async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> { | |
| 120 | 120 | let server_dir = ctx.worktree.join("server"); | |
| 121 | - | let mut cmd = Command::new("cargo"); | |
| 122 | - | // Match CI (`server/deploy/run-ci.sh`): `--features fast-tests` relaxes | |
| 123 | - | // auth rate-limit burst (5 → 20) and argon2 cost so signup-heavy + lockout | |
| 124 | - | // workflow tests can complete without hitting Governor before the | |
| 125 | - | // hand-rolled lockout check. The feature is specifically documented for | |
| 126 | - | // this in `server/src/constants.rs:87`. | |
| 127 | - | cmd.args(["test", "--release", "--features", "fast-tests"]) | |
| 128 | - | .current_dir(&server_dir) | |
| 129 | - | .stdout(std::process::Stdio::piped()) | |
| 130 | - | .stderr(std::process::Stdio::piped()) | |
| 131 | - | .kill_on_drop(true); | |
| 132 | - | // Same online-mode rationale as the build step: sqlx query macros need a | |
| 133 | - | // live DB to type-check against. The scratch DB is left in migrated state | |
| 134 | - | // by the preceding build, so we can reuse it here. | |
| 121 | + | let log_path = gate_log_path(ctx, GateKind::CargoTest); | |
| 122 | + | let log_ref = LogRef::new(&ctx.version, GateKind::CargoTest); | |
| 123 | + | ||
| 124 | + | // Best-effort: drop our own role's stale `mnw_test_*` databases (the | |
| 125 | + | // template + any per-test clones orphaned by a previously-killed run) | |
| 126 | + | // before the suite, so they can't accumulate or collide. Foreign-owned | |
| 127 | + | // leftovers are left alone — the harness now namespaces its template per | |
| 128 | + | // role, so they no longer wedge the gate. | |
| 135 | 129 | if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() { | |
| 136 | - | cmd.env("DATABASE_URL", scratch_url); | |
| 137 | - | // The server test harness (tests/harness/db.rs) parses TEST_DATABASE_URL | |
| 138 | - | // with rfind('/'), which mangles URLs whose query string contains '/' | |
| 139 | - | // (e.g. `?host=/var/run/postgresql`). Strip the query — libpq defaults | |
| 140 | - | // to /var/run/postgresql on Debian/Ubuntu when host is unspecified. | |
| 141 | - | let test_url = scratch_url | |
| 142 | - | .split_once('?') | |
| 143 | - | .map(|(base, _)| base) | |
| 144 | - | .unwrap_or(scratch_url); | |
| 145 | - | cmd.env("TEST_DATABASE_URL", test_url); | |
| 146 | - | // Best-effort: drop our own role's stale `mnw_test_*` databases (the | |
| 147 | - | // template + any per-test clones orphaned by a previously-killed run) | |
| 148 | - | // before the suite, so they can't accumulate or collide. Foreign-owned | |
| 149 | - | // leftovers are left alone — the harness now namespaces its template | |
| 150 | - | // per role, so they no longer wedge the gate. | |
| 151 | 130 | clean_stale_test_dbs(scratch_url).await; | |
| 152 | 131 | } | |
| 132 | + | ||
| 153 | 133 | let started = std::time::Instant::now(); | |
| 154 | - | let log_path = gate_log_path(ctx, GateKind::CargoTest); | |
| 155 | - | let log_ref = LogRef::new(&ctx.version, GateKind::CargoTest); | |
| 156 | - | let mut child = match cmd.spawn() { | |
| 134 | + | ||
| 135 | + | // Fast pre-gate: compile the test targets WITHOUT running them. This | |
| 136 | + | // builds the exact `--release --features fast-tests` artifacts the full | |
| 137 | + | // run needs (so the subsequent run reuses the cache — no wasted work), | |
| 138 | + | // but fails in ~minutes with the real `error[Ennnn]: ...` on a | |
| 139 | + | // test-only-target compile break. That class (a field missing in a | |
| 140 | + | // `#[cfg(test)]`-only binary like `load`) otherwise compiles fine under | |
| 141 | + | // the build step + `--test integration` and only blows up here, after a | |
| 142 | + | // full build, as an opaque mass test failure. | |
| 143 | + | let mut pre = match cargo_test_command(ctx, &server_dir, &["--no-run"]).spawn() { | |
| 144 | + | Ok(c) => c, | |
| 145 | + | Err(e) => { | |
| 146 | + | return Ok(GateOutcome::failed(GateFailure::SpawnFailed { | |
| 147 | + | message: e.to_string(), | |
| 148 | + | }).with_log_ref(log_ref)); | |
| 149 | + | } | |
| 150 | + | }; | |
| 151 | + | let (pre_out, pre_err, pre_status) = | |
| 152 | + | stream_child_to_live_log(&mut pre, ctx.events.clone(), run_id, log_path.clone()).await?; | |
| 153 | + | if !pre_status.success() { | |
| 154 | + | let failure = classify::classify_compile_error(&pre_out, &pre_err); | |
| 155 | + | return Ok(GateOutcome::failed(failure).with_log_ref(log_ref)); | |
| 156 | + | } | |
| 157 | + | ||
| 158 | + | // Full run: the test binaries are already built above, so cargo's | |
| 159 | + | // up-to-date check skips compilation and this just runs the tests. | |
| 160 | + | let mut child = match cargo_test_command(ctx, &server_dir, &[]).spawn() { | |
| 157 | 161 | Ok(c) => c, | |
| 158 | 162 | Err(e) => { | |
| 159 | 163 | return Ok(GateOutcome::failed(GateFailure::SpawnFailed { | |
| @@ -172,6 +176,46 @@ async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> { | |||
| 172 | 176 | } | |
| 173 | 177 | } | |
| 174 | 178 | ||
| 179 | + | /// Configure (but don't spawn) `cargo test --release --features fast-tests | |
| 180 | + | /// <extra>` in `dir`, wired to the scratch DB. Shared by the `--no-run` | |
| 181 | + | /// pre-gate compile and the full test run so both go through one env setup. | |
| 182 | + | /// | |
| 183 | + | /// `--features fast-tests` matches CI (`server/deploy/run-ci.sh`): it relaxes | |
| 184 | + | /// the auth rate-limit burst (5 → 20) and argon2 cost so signup-heavy + | |
| 185 | + | /// lockout workflow tests complete without hitting Governor before the | |
| 186 | + | /// hand-rolled lockout check (documented at `server/src/constants.rs:87`). | |
| 187 | + | fn cargo_test_command(ctx: &GateCtx, dir: &std::path::Path, extra: &[&str]) -> Command { | |
| 188 | + | let mut cmd = Command::new("cargo"); | |
| 189 | + | cmd.args(["test", "--release", "--features", "fast-tests"]) | |
| 190 | + | .args(extra) | |
| 191 | + | .current_dir(dir) | |
| 192 | + | .stdout(std::process::Stdio::piped()) | |
| 193 | + | .stderr(std::process::Stdio::piped()) | |
| 194 | + | .kill_on_drop(true); | |
| 195 | + | // Share the build step's target dir so the test compile reuses its | |
| 196 | + | // artifacts (and the `--no-run` precompile reuses them again). Must match | |
| 197 | + | // `build.rs` or the gate would clean-compile the whole tree a second time. | |
| 198 | + | if let Some(target) = ctx.cfg.cargo_target_dir.as_deref() { | |
| 199 | + | cmd.env("CARGO_TARGET_DIR", target); | |
| 200 | + | } | |
| 201 | + | // Same online-mode rationale as the build step: sqlx query macros need a | |
| 202 | + | // live DB to type-check against. The scratch DB is left in migrated state | |
| 203 | + | // by the preceding build, so we can reuse it here. | |
| 204 | + | if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() { | |
| 205 | + | cmd.env("DATABASE_URL", scratch_url); | |
| 206 | + | // The server test harness (tests/harness/db.rs) parses TEST_DATABASE_URL | |
| 207 | + | // with rfind('/'), which mangles URLs whose query string contains '/' | |
| 208 | + | // (e.g. `?host=/var/run/postgresql`). Strip the query — libpq defaults | |
| 209 | + | // to /var/run/postgresql on Debian/Ubuntu when host is unspecified. | |
| 210 | + | let test_url = scratch_url | |
| 211 | + | .split_once('?') | |
| 212 | + | .map(|(base, _)| base) | |
| 213 | + | .unwrap_or(scratch_url); | |
| 214 | + | cmd.env("TEST_DATABASE_URL", test_url); | |
| 215 | + | } | |
| 216 | + | cmd | |
| 217 | + | } | |
| 218 | + | ||
| 175 | 219 | async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> { | |
| 176 | 220 | let mut log_buf: Vec<u8> = Vec::new(); | |
| 177 | 221 | let log_ref = LogRef::new(&ctx.version, GateKind::MigrationDryRun); | |
| @@ -272,12 +316,18 @@ pub(crate) async fn reset_scratch(db_url: &str) -> Result<()> { | |||
| 272 | 316 | Ok(()) | |
| 273 | 317 | } | |
| 274 | 318 | ||
| 275 | - | /// Best-effort cleanup of stale test databases left behind by a | |
| 276 | - | /// previously-killed `cargo_test` run (the per-test `mnw_test_<uuid>` clones | |
| 277 | - | /// and the role's template). Only drops databases owned by the connecting | |
| 278 | - | /// role — a foreign-owned leftover can't be dropped without superuser anyway, | |
| 279 | - | /// and the harness now namespaces its template per role so one can't wedge us. | |
| 280 | - | /// Never returns an error: a cleanup miss must not turn a deploy red. | |
| 319 | + | /// Best-effort cleanup of stale per-test database clones (`mnw_test_<uuid>`) | |
| 320 | + | /// left behind by a previously-killed `cargo_test` run. Only drops databases | |
| 321 | + | /// owned by the connecting role — a foreign-owned leftover can't be dropped | |
| 322 | + | /// without superuser anyway, and the harness namespaces its template per role | |
| 323 | + | /// so one can't wedge us. | |
| 324 | + | /// | |
| 325 | + | /// Deliberately **excludes the template** (`mnw_test_template_*`): the harness | |
| 326 | + | /// reuses it across runs when it's migration-current (skipping a full | |
| 327 | + | /// drop+migrate), so dropping it here would force a needless rebuild every | |
| 328 | + | /// gate run. Templates are bounded (one per role) and never accumulate, so | |
| 329 | + | /// leaving them is free. Never returns an error: a cleanup miss must not turn a | |
| 330 | + | /// deploy red. | |
| 281 | 331 | async fn clean_stale_test_dbs(db_url: &str) { | |
| 282 | 332 | use sqlx::postgres::PgPoolOptions; | |
| 283 | 333 | use sqlx::Executor; | |
| @@ -294,6 +344,7 @@ async fn clean_stale_test_dbs(db_url: &str) { | |||
| 294 | 344 | let names: Vec<(String,)> = sqlx::query_as( | |
| 295 | 345 | "SELECT datname FROM pg_database | |
| 296 | 346 | WHERE datname LIKE 'mnw_test_%' | |
| 347 | + | AND datname NOT LIKE '%template%' | |
| 297 | 348 | AND pg_catalog.pg_has_role(current_user, datdba, 'USAGE')", | |
| 298 | 349 | ) | |
| 299 | 350 | .fetch_all(&pool) |
| @@ -22,6 +22,7 @@ pub mod live_log; | |||
| 22 | 22 | pub mod metrics; | |
| 23 | 23 | pub mod outcome; | |
| 24 | 24 | pub mod routes; | |
| 25 | + | pub mod runs; | |
| 25 | 26 | pub mod state; | |
| 26 | 27 | pub mod sync; | |
| 27 | 28 | pub mod topology; |
| @@ -146,7 +146,23 @@ impl GateBlocker { | |||
| 146 | 146 | pub enum GateFailure { | |
| 147 | 147 | /// `cargo_test` exited non-zero. `failed_count` may be 0 if the | |
| 148 | 148 | /// classifier couldn't parse the count (e.g. compile error). | |
| 149 | - | CargoTest { failed_count: u32, first_failed: Option<String> }, | |
| 149 | + | /// `first_failed` is the first failing test's name; `first_panic` is the | |
| 150 | + | /// first panic *message* (root cause), chosen to skip the "Once instance | |
| 151 | + | /// has previously been poisoned" cascade so 800 poisoned tests don't bury | |
| 152 | + | /// the one real panic that poisoned them. | |
| 153 | + | CargoTest { | |
| 154 | + | failed_count: u32, | |
| 155 | + | first_failed: Option<String>, | |
| 156 | + | #[serde(default, skip_serializing_if = "Option::is_none")] | |
| 157 | + | first_panic: Option<String>, | |
| 158 | + | }, | |
| 159 | + | /// `cargo_test` fast pre-gate (`cargo test --no-run`): the test | |
| 160 | + | /// targets failed to compile, so no tests ran. `first_error` is the | |
| 161 | + | /// headline diagnostic (e.g. `error[E0063]: missing field | |
| 162 | + | /// user_pages_host`) and `error_count` is cargo's "N previous errors". | |
| 163 | + | /// Distinct from `CargoTest` so a test-only-target compile break reads | |
| 164 | + | /// as a build error, not "0 tests failed". | |
| 165 | + | CompileError { error_count: u32, first_error: Option<String> }, | |
| 150 | 166 | /// `migration_dry_run`: a migration that was previously applied is | |
| 151 | 167 | /// no longer present in the resolved migrations directory. | |
| 152 | 168 | MigrationDrift { migration: String }, | |
| @@ -175,10 +191,17 @@ pub enum GateFailure { | |||
| 175 | 191 | impl GateFailure { | |
| 176 | 192 | pub fn summary(&self) -> String { | |
| 177 | 193 | match self { | |
| 178 | - | GateFailure::CargoTest { failed_count, first_failed: Some(name) } => | |
| 194 | + | // The panic message is the diagnostic; prefer it over the test name. | |
| 195 | + | GateFailure::CargoTest { failed_count, first_panic: Some(p), .. } => | |
| 196 | + | format!("{failed_count} test(s) failed; first panic: {p}"), | |
| 197 | + | GateFailure::CargoTest { failed_count, first_failed: Some(name), first_panic: None } => | |
| 179 | 198 | format!("{failed_count} test(s) failed; first: {name}"), | |
| 180 | - | GateFailure::CargoTest { failed_count, first_failed: None } => | |
| 199 | + | GateFailure::CargoTest { failed_count, first_failed: None, first_panic: None } => | |
| 181 | 200 | format!("{failed_count} test(s) failed"), | |
| 201 | + | GateFailure::CompileError { error_count, first_error: Some(e) } => | |
| 202 | + | format!("compile failed ({error_count} error(s)); first: {e}"), | |
| 203 | + | GateFailure::CompileError { error_count, first_error: None } => | |
| 204 | + | format!("compile failed ({error_count} error(s))"), | |
| 182 | 205 | GateFailure::MigrationDrift { migration } => | |
| 183 | 206 | format!("migration {migration} previously applied but missing"), | |
| 184 | 207 | GateFailure::MigrationModified { migration } => |
| @@ -1,6 +1,6 @@ | |||
| 1 | 1 | use crate::error::Result; | |
| 2 | 2 | use crate::state::AppState; | |
| 3 | - | use axum::extract::{Path, State, WebSocketUpgrade}; | |
| 3 | + | use axum::extract::{Path, Query, State, WebSocketUpgrade}; | |
| 4 | 4 | use axum::response::IntoResponse; | |
| 5 | 5 | use axum::routing::{get, post}; | |
| 6 | 6 | use axum::{Json, Router}; | |
| @@ -27,6 +27,8 @@ pub fn router(state: AppState) -> Router { | |||
| 27 | 27 | ||
| 28 | 28 | let open = Router::new() | |
| 29 | 29 | .route("/state", get(get_state)) | |
| 30 | + | .route("/runs/{id}", get(get_run)) | |
| 31 | + | .route("/runs/{id}/wait", get(get_run_wait)) | |
| 30 | 32 | .route("/logs/{version}/{gate}", get(get_gate_log)) | |
| 31 | 33 | .route("/events", get(events_ws)); | |
| 32 | 34 | ||
| @@ -84,6 +86,12 @@ struct StateView { | |||
| 84 | 86 | /// the *deployed product*, not the controller). | |
| 85 | 87 | sandod_version: &'static str, | |
| 86 | 88 | tiers: Vec<TierView>, | |
| 89 | + | /// The most recent build run (the resource `GET /runs/{id}` exposes in | |
| 90 | + | /// full). Surfaced here so a `/state` poller sees an in-flight or failed | |
| 91 | + | /// build — the tier versions only ever reflect the last *success*, so | |
| 92 | + | /// without this `/state` looks frozen for the whole build. `null` until | |
| 93 | + | /// the first `/rebuild`. | |
| 94 | + | build: Option<crate::runs::BuildSummary>, | |
| 87 | 95 | } | |
| 88 | 96 | ||
| 89 | 97 | #[derive(Serialize)] | |
| @@ -198,7 +206,8 @@ async fn get_state(State(s): State<AppState>) -> Result<Json<StateView>> { | |||
| 198 | 206 | }); | |
| 199 | 207 | } | |
| 200 | 208 | ||
| 201 | - | Ok(Json(StateView { sandod_version: env!("CARGO_PKG_VERSION"), tiers })) | |
| 209 | + | let build = crate::runs::latest_summary(&s.pool).await?; | |
| 210 | + | Ok(Json(StateView { sandod_version: env!("CARGO_PKG_VERSION"), tiers, build })) | |
| 202 | 211 | } | |
| 203 | 212 | ||
| 204 | 213 | #[derive(Deserialize, Default)] | |
| @@ -757,13 +766,24 @@ async fn rebuild( | |||
| 757 | 766 | ) -> Result<Json<serde_json::Value>> { | |
| 758 | 767 | let body = body.map(|Json(b)| b).unwrap_or_default(); | |
| 759 | 768 | let sha = match body.sha { | |
| 760 | - | Some(s) => s, | |
| 761 | - | None => crate::git::resolve_ref( | |
| 762 | - | std::path::Path::new(&s.topo.repo.bare_path), | |
| 763 | - | &s.topo.repo.branch, | |
| 764 | - | ) | |
| 765 | - | .await | |
| 766 | - | .map_err(crate::error::Error::Other)?, | |
| 769 | + | Some(sha) => sha, | |
| 770 | + | None => { | |
| 771 | + | // Omitted sha = "build the deploy branch's tip". Fetch upstream | |
| 772 | + | // first so we resolve the *upstream* HEAD, not a possibly-stale | |
| 773 | + | // local branch ref — the build task fetches too, but only after the | |
| 774 | + | // sha is already chosen, so without this `/rebuild {}` could build | |
| 775 | + | // an old commit. A fetch failure is non-fatal: fall back to the | |
| 776 | + | // current bare-repo tip (same policy as the build task). | |
| 777 | + | let bare = std::path::Path::new(&s.topo.repo.bare_path); | |
| 778 | + | if let Some(upstream) = s.topo.repo.upstream.as_deref() | |
| 779 | + | && let Err(e) = crate::git::fetch_upstream(bare, upstream, &s.topo.repo.branch).await | |
| 780 | + | { | |
| 781 | + | tracing::warn!(error = %e, "pre-resolve upstream fetch failed; resolving current bare-repo branch tip"); | |
| 782 | + | } | |
| 783 | + | crate::git::resolve_ref(bare, &s.topo.repo.branch) | |
| 784 | + | .await | |
| 785 | + | .map_err(crate::error::Error::Other)? | |
| 786 | + | } | |
| 767 | 787 | }; | |
| 768 | 788 | ||
| 769 | 789 | // Boundary parse: a sha entering Sando must be hex of plausible length. | |
| @@ -774,17 +794,27 @@ async fn rebuild( | |||
| 774 | 794 | tracing::info!(sha = %sha, "rebuild requested"); | |
| 775 | 795 | crate::events::emit(&s.events, crate::events::Event::RebuildRequested { sha: sha.clone() }); | |
| 776 | 796 | ||
| 797 | + | // One pollable resource per triggered build. Created before the spawn so | |
| 798 | + | // the run id is in the response even if the task is aborted milliseconds | |
| 799 | + | // later by a still-newer /rebuild. | |
| 800 | + | let run_id = crate::runs::create(&s.pool, sha.as_str()) | |
| 801 | + | .await | |
| 802 | + | .map_err(crate::error::Error::Other)?; | |
| 803 | + | ||
| 777 | 804 | // Latest /rebuild wins: abort any in-flight build before spawning a new | |
| 778 | 805 | // one. Aborting drops the spawned task's future, which drops any | |
| 779 | 806 | // tokio::process::Child it owns; with `kill_on_drop(true)` set on the | |
| 780 | 807 | // cargo Command, SIGKILL propagates to cargo + its rustc children. | |
| 781 | 808 | let mut slot = s.active_build.lock().await; | |
| 782 | - | if let Some(prev) = slot.take() { | |
| 783 | - | if !prev.is_finished() { | |
| 784 | - | tracing::warn!("aborting in-flight build for newer /rebuild request"); | |
| 785 | - | crate::events::emit(&s.events, crate::events::Event::BuildAborted { sha_aborted: sha.clone() }); | |
| 786 | - | prev.abort(); | |
| 787 | - | } | |
| 809 | + | if let Some(prev) = slot.take() | |
| 810 | + | && !prev.handle.is_finished() | |
| 811 | + | { | |
| 812 | + | tracing::warn!("aborting in-flight build for newer /rebuild request"); | |
| 813 | + | crate::events::emit(&s.events, crate::events::Event::BuildAborted { sha_aborted: sha.clone() }); | |
| 814 | + | prev.handle.abort(); | |
| 815 | + | // Aborting drops the task before it can settle its own row, so | |
| 816 | + | // record the supersession here. | |
| 817 | + | crate::runs::mark_aborted(&s.pool, prev.run_id).await.ok(); | |
| 788 | 818 | } | |
| 789 | 819 | ||
| 790 | 820 | let pool = s.pool.clone(); | |
| @@ -793,14 +823,71 @@ async fn rebuild( | |||
| 793 | 823 | let events_for_task = s.events.clone(); | |
| 794 | 824 | let sha_for_task = sha.clone(); | |
| 795 | 825 | let sha_response = sha.to_string(); | |
| 826 | + | let pool_for_task = s.pool.clone(); | |
| 796 | 827 | let handle = tokio::spawn(async move { | |
| 797 | - | if let Err(e) = crate::build::build_and_run_host(pool, cfg, topo, sha_for_task.clone(), events_for_task).await { | |
| 828 | + | if let Err(e) = crate::build::build_and_run_host(pool, cfg, topo, sha_for_task.clone(), events_for_task, run_id).await { | |
| 798 | 829 | tracing::error!(sha = %sha_for_task, error = %e, "rebuild pipeline failed"); | |
| 830 | + | // Pre-gate bails (fetch/checkout/version/scratch) don't settle the | |
| 831 | + | // run themselves; the build-step compile error already did. First | |
| 832 | + | // terminal write wins, so this is a safety net for the rest. | |
| 833 | + | crate::runs::mark_failed(&pool_for_task, run_id, &format!("{e:#}")).await.ok(); | |
| 799 | 834 | } | |
| 800 | 835 | }); | |
| 801 | - | *slot = Some(handle.abort_handle()); | |
| 836 | + | *slot = Some(crate::state::ActiveBuild { handle: handle.abort_handle(), run_id }); | |
| 802 | 837 | ||
| 803 | - | Ok(Json(serde_json::json!({ "accepted": true, "sha": sha_response }))) | |
| 838 | + | Ok(Json(serde_json::json!({ "accepted": true, "sha": sha_response, "run_id": run_id.0 }))) | |
| 839 | + | } | |
| 840 | + | ||
| 841 | + | /// `GET /runs/{id}` — the build-status resource a non-TUI driver polls after | |
| 842 | + | /// `/rebuild`. Open (read-only) like `/state` and `/logs`. | |
| 843 | + | async fn get_run( | |
| 844 | + | State(s): State<AppState>, | |
| 845 | + | Path(id): Path<i64>, | |
| 846 | + | ) -> Result<Json<crate::runs::RunView>> { | |
| 847 | + | crate::runs::get(&s.pool, crate::domain::RunId(id)) | |
| 848 | + | .await | |
| 849 | + | .map_err(crate::error::Error::Other)? | |
| 850 | + | .map(Json) | |
| 851 | + | .ok_or(crate::error::Error::NotFound) | |
| 852 | + | } | |
| 853 | + | ||
| 854 | + | #[derive(Deserialize)] | |
| 855 | + | struct WaitParams { | |
| 856 | + | /// How long to hold the request open before returning a still-building | |
| 857 | + | /// run. Default 30s, capped at 120s. | |
| 858 | + | #[serde(default)] | |
| 859 | + | timeout_ms: Option<u64>, | |
| 860 | + | } | |
| 861 | + | ||
| 862 | + | /// `GET /runs/{id}/wait` — long-poll: hold the request open until the run | |
| 863 | + | /// settles (`result != building`) or the timeout elapses, then return the | |
| 864 | + | /// current `RunView`. Removes polling-cadence guessing for a headless driver | |
| 865 | + | /// (fire `/rebuild`, block on `/wait`). On timeout the run is returned | |
| 866 | + | /// still-building (200) and the caller re-issues `/wait`. 404 if unknown. | |
| 867 | + | async fn get_run_wait( | |
| 868 | + | State(s): State<AppState>, | |
| 869 | + | Path(id): Path<i64>, | |
| 870 | + | Query(p): Query<WaitParams>, | |
| 871 | + | ) -> Result<Json<crate::runs::RunView>> { | |
| 872 | + | let run_id = crate::domain::RunId(id); | |
| 873 | + | let timeout = std::time::Duration::from_millis(p.timeout_ms.unwrap_or(30_000).min(120_000)); | |
| 874 | + | let deadline = tokio::time::Instant::now() + timeout; | |
| 875 | + | // Poll the row rather than wiring a per-run notifier: a build settles on | |
| 876 | + | // the minute scale, so a sub-second tick is plenty responsive and the | |
| 877 | + | // query is a single indexed read. The request releases its pool handle | |
| 878 | + | // between ticks. | |
| 879 | + | let tick = std::time::Duration::from_millis(750); | |
| 880 | + | loop { | |
| 881 | + | let view = crate::runs::get(&s.pool, run_id) | |
| 882 | + | .await | |
| 883 | + | .map_err(crate::error::Error::Other)? | |
| 884 | + | .ok_or(crate::error::Error::NotFound)?; | |
| 885 | + | let now = tokio::time::Instant::now(); | |
| 886 | + | if view.result != "building" || now >= deadline { | |
| 887 | + | return Ok(Json(view)); | |
| 888 | + | } | |
| 889 | + | tokio::time::sleep((deadline - now).min(tick)).await; | |
| 890 | + | } | |
| 804 | 891 | } | |
| 805 | 892 | ||
| 806 | 893 | #[derive(Deserialize)] | |
| @@ -835,7 +922,7 @@ async fn self_update( | |||
| 835 | 922 | // the restart would SIGKILL it mid-deploy. Make the operator retry once idle. | |
| 836 | 923 | { | |
| 837 | 924 | let slot = s.active_build.lock().await; | |
| 838 | - | if slot.as_ref().is_some_and(|h| !h.is_finished()) { | |
| 925 | + | if slot.as_ref().is_some_and(|b| !b.handle.is_finished()) { | |
| 839 | 926 | return Err(crate::error::Error::GateBlocked( | |
| 840 | 927 | "a server build is in flight; retry /self-update once it settles".into(), | |
| 841 | 928 | )); | |
| @@ -1074,6 +1161,7 @@ mod tests { | |||
| 1074 | 1161 | bin_names: vec!["makenotwork".into()], | |
| 1075 | 1162 | logs_root: PathBuf::from("/tmp/sando-logs"), | |
| 1076 | 1163 | release_contents: vec![], | |
| 1164 | + | cargo_target_dir: None, | |
| 1077 | 1165 | } | |
| 1078 | 1166 | } | |
| 1079 | 1167 | ||
| @@ -1385,6 +1473,112 @@ mod tests { | |||
| 1385 | 1473 | assert_eq!(resp.status(), StatusCode::NOT_FOUND); | |
| 1386 | 1474 | } | |
| 1387 | 1475 | ||
| 1476 | + | #[tokio::test] | |
| 1477 | + | async fn get_run_404s_for_unknown_id() { | |
| 1478 | + | let state = test_state().await; | |
| 1479 | + | let app = router(state); | |
| 1480 | + | let resp = app | |
| 1481 | + | .oneshot(Request::builder().uri("/runs/999").body(Body::empty()).unwrap()) | |
| 1482 | + | .await | |
| 1483 | + | .unwrap(); | |
| 1484 | + | assert_eq!(resp.status(), StatusCode::NOT_FOUND); | |
| 1485 | + | } | |
| 1486 | + | ||
| 1487 | + | #[tokio::test] | |
| 1488 | + | async fn get_run_returns_view_with_gates() { | |
| 1489 | + | let state = test_state().await; | |
| 1490 | + | // A run that reached version 0.10.2 and ran two host gates (one red). | |
| 1491 | + | let run_id = crate::runs::create(&state.pool, "abc1234def").await.unwrap(); | |
| 1492 | + | let ver: crate::domain::Version = "0.10.2".parse().unwrap(); | |
| 1493 | + | seed(&state.pool, "host", "0.10.2").await; | |
| 1494 | + | crate::runs::set_version(&state.pool, run_id, &ver).await.unwrap(); | |
| 1495 | + | insert_gate(&state.pool, "host", "0.10.2", "cargo_test", 0).await; | |
| 1496 | + | insert_gate(&state.pool, "host", "0.10.2", "boot_smoke", 1).await; | |
| 1497 | + | ||
| 1498 | + | let app = router(state); | |
| 1499 | + | let resp = app | |
| 1500 | + | .oneshot( | |
| 1501 | + | Request::builder() | |
| 1502 | + | .uri(format!("/runs/{}", run_id.0)) | |
| 1503 | + | .body(Body::empty()) | |
| 1504 | + | .unwrap(), | |
| 1505 | + | ) | |
| 1506 | + | .await | |
| 1507 | + | .unwrap(); | |
| 1508 | + | assert_eq!(resp.status(), StatusCode::OK); | |
| 1509 | + | let v: serde_json::Value = serde_json::from_str(&body_string(resp).await).unwrap(); | |
| 1510 | + | assert_eq!(v["run_id"], run_id.0); | |
| 1511 | + | assert_eq!(v["sha"], "abc1234def"); | |
| 1512 | + | assert_eq!(v["version"], "0.10.2"); | |
| 1513 | + | assert_eq!(v["result"], "building"); | |
| 1514 | + | // Both host gates surface, latest-per-kind, alphabetized by kind. | |
| 1515 | + | assert_eq!(v["gates"].as_array().unwrap().len(), 2); | |
| 1516 | + | assert_eq!(v["gates"][0]["kind"], "boot_smoke"); | |
| 1517 | + | assert_eq!(v["gates"][0]["status"], "passed"); | |
| 1518 | + | assert_eq!(v["gates"][1]["kind"], "cargo_test"); | |
| 1519 | + | assert_eq!(v["gates"][1]["status"], "failed"); | |
| 1520 | + | } | |
| 1521 | + | ||
| 1522 | + | #[tokio::test] | |
| 1523 | + | async fn get_run_wait_returns_immediately_when_settled() { | |
| 1524 | + | let state = test_state().await; | |
| 1525 | + | let run_id = crate::runs::create(&state.pool, "abc1234def").await.unwrap(); | |
| 1526 | + | crate::runs::mark_passed(&state.pool, run_id).await.unwrap(); | |
| 1527 | + | ||
| 1528 | + | let app = router(state); | |
| 1529 | + | // Generous timeout, but an already-settled run must not wait for it. | |
| 1530 | + | let resp = app | |
| 1531 | + | .oneshot( | |
| 1532 | + | Request::builder() | |
| 1533 | + | .uri(format!("/runs/{}/wait?timeout_ms=60000", run_id.0)) | |
| 1534 | + | .body(Body::empty()) | |
| 1535 | + | .unwrap(), | |
| 1536 | + | ) | |
| 1537 | + | .await | |
| 1538 | + | .unwrap(); | |
| 1539 | + | assert_eq!(resp.status(), StatusCode::OK); | |
| 1540 | + | let v: serde_json::Value = serde_json::from_str(&body_string(resp).await).unwrap(); | |
| 1541 | + | assert_eq!(v["result"], "passed"); | |
| 1542 | + | } | |
| 1543 | + | ||
| 1544 | + | #[tokio::test] | |
| 1545 | + | async fn get_run_wait_returns_building_at_timeout() { | |
| 1546 | + | let state = test_state().await; | |
| 1547 | + | let run_id = crate::runs::create(&state.pool, "abc1234def").await.unwrap(); | |
| 1548 | + | ||
| 1549 | + | let app = router(state); | |
| 1550 | + | // timeout_ms=0 → deadline is now → the first poll returns the | |
| 1551 | + | // still-building run rather than blocking. | |
| 1552 | + | let resp = app | |
| 1553 | + | .oneshot( | |
| 1554 | + | Request::builder() | |
| 1555 | + | .uri(format!("/runs/{}/wait?timeout_ms=0", run_id.0)) | |
| 1556 | + | .body(Body::empty()) | |
| 1557 | + | .unwrap(), | |
| 1558 | + | ) | |
| 1559 | + | .await | |
| 1560 | + | .unwrap(); | |
| 1561 | + | assert_eq!(resp.status(), StatusCode::OK); | |
| 1562 | + | let v: serde_json::Value = serde_json::from_str(&body_string(resp).await).unwrap(); | |
| 1563 | + | assert_eq!(v["result"], "building"); | |
| 1564 | + | } | |
| 1565 | + | ||
| 1566 | + | #[tokio::test] | |
| 1567 | + | async fn get_run_wait_404s_for_unknown_id() { | |
| 1568 | + | let state = test_state().await; | |
| 1569 | + | let app = router(state); | |
| 1570 | + | let resp = app | |
| 1571 | + | .oneshot( | |
| 1572 | + | Request::builder() | |
| 1573 | + | .uri("/runs/999/wait?timeout_ms=0") | |
| 1574 | + | .body(Body::empty()) | |
| 1575 | + | .unwrap(), | |
| 1576 | + | ) | |
| 1577 | + | .await | |
| 1578 | + | .unwrap(); | |
| 1579 | + | assert_eq!(resp.status(), StatusCode::NOT_FOUND); | |
| 1580 | + | } | |
| 1581 | + | ||
| 1388 | 1582 | #[test] | |
| 1389 | 1583 | fn self_update_unit_maps_sha_to_instance() { | |
| 1390 | 1584 | let sha = crate::domain::GitSha::parse("abc1234def5678").unwrap(); | |
| @@ -1630,6 +1824,25 @@ mod tests { | |||
| 1630 | 1824 | } | |
| 1631 | 1825 | ||
| 1632 | 1826 | #[tokio::test] | |
| 1827 | + | async fn state_build_is_null_until_first_rebuild_then_surfaces_latest() { | |
| 1828 | + | use axum::extract::State; | |
| 1829 | + | let state = test_state().await; | |
| 1830 | + | // No build runs yet → build is null, so /state doesn't pretend a build | |
| 1831 | + | // is happening. | |
| 1832 | + | let Json(view) = get_state(State(state.clone())).await.unwrap(); | |
| 1833 | + | assert!(view.build.is_none()); | |
| 1834 | + | ||
| 1835 | + | // A failed run must surface its cause in /state, not just in /runs. | |
| 1836 | + | let run_id = crate::runs::create(&state.pool, "deadbeef").await.unwrap(); | |
| 1837 | + | crate::runs::mark_failed(&state.pool, run_id, "cargo_test: 3 test(s) failed").await.unwrap(); | |
| 1838 | + | let Json(view) = get_state(State(state)).await.unwrap(); | |
| 1839 | + | let b = view.build.expect("build surfaced"); | |
| 1840 | + | assert_eq!(b.run_id, run_id.0); | |
| 1841 | + | assert_eq!(b.result, "failed"); | |
| 1842 | + | assert_eq!(b.failure_summary.as_deref(), Some("cargo_test: 3 test(s) failed")); | |
| 1843 | + | } | |
| 1844 | + | ||
| 1845 | + | #[tokio::test] | |
| 1633 | 1846 | async fn promote_with_explicit_version_but_missing_artifact_404s() { | |
| 1634 | 1847 | // Explicit version supplied, gates trivially pass (mm has none in | |
| 1635 | 1848 | // test_topo), but `versions` table has no row → 404. |
| @@ -0,0 +1,394 @@ | |||
| 1 | + | //! Build-run tracking: one `build_runs` row per `/rebuild`, updated as the | |
| 2 | + | //! pipeline moves through its phases, terminating in passed/failed/aborted. | |
| 3 | + | //! | |
| 4 | + | //! This is the resource that makes Sando driveable headlessly. The tier | |
| 5 | + | //! versions in `/state` only ever reflect the last *successful* deploy, so on a | |
| 6 | + | //! red pipeline a poller of them sees stale-green for the whole build (the 0.10.2 incident). A | |
| 7 | + | //! `RunId` returned by `/rebuild` + `GET /runs/{id}` gives a non-TUI caller | |
| 8 | + | //! one pollable resource tied to the build it triggered, carrying the phase, | |
| 9 | + | //! the per-gate status, and — the highest-value bit — a `failure_summary` | |
| 10 | + | //! (first compile error / first failed gate) so the cause is in the API, not | |
| 11 | + | //! behind `sudo journalctl`. | |
| 12 | + | //! | |
| 13 | + | //! Terminal writes (`mark_passed`/`mark_failed`/`mark_aborted`) are guarded on | |
| 14 | + | //! `result = 'building'`, so whichever site settles the run first wins: a | |
| 15 | + | //! build-step compile error, the first red gate, or the task-level catch for | |
| 16 | + | //! pre-build bails. Later writes are silent no-ops. | |
| 17 | + | ||
| 18 | + | use crate::domain::{RunId, Version}; | |
| 19 | + | use anyhow::Result; | |
| 20 | + | use chrono::Utc; | |
| 21 | + | use serde::Serialize; | |
| 22 | + | use sqlx::{Row, SqlitePool}; | |
| 23 | + | ||
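| 24 | + | /// In-flight sub-state. Plain strings in the DB; this enum names the values | |
| 25 | + | /// `set_phase` writes so call sites can't typo them. The bookend values `queued` (written by `create`) and `done` (written by the `mark_*` functions) are SQL literals, not variants here. | |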
| 26 | + | #[derive(Debug, Clone, Copy)] | |
| 27 | + | pub enum Phase { | |
| 28 | + | Fetching, | |
| 29 | + | Compiling, | |
| 30 | + | Staging, | |
| 31 | + | Gating, | |
| 32 | + | } | |
| 33 | + | ||
| 34 | + | impl Phase { | |
| 35 | + | pub fn as_str(self) -> &'static str { | |
| 36 | + | match self { | |
| 37 | + | Phase::Fetching => "fetching", | |
| 38 | + | Phase::Compiling => "compiling", | |
| 39 | + | Phase::Staging => "staging", | |
| 40 | + | Phase::Gating => "gating", | |
| 41 | + | } | |
| 42 | + | } | |
| 43 | + | } | |
| 44 | + | ||
| 45 | + | /// Insert a fresh `building` run for `sha` and return its id. | |
| 46 | + | pub async fn create(pool: &SqlitePool, sha: &str) -> Result<RunId> { | |
| 47 | + | let id: i64 = sqlx::query_scalar( | |
| 48 | + | "INSERT INTO build_runs (sha, phase, result, started_at) | |
| 49 | + | VALUES (?, 'queued', 'building', ?) RETURNING id", | |
| 50 | + | ) | |
| 51 | + | .bind(sha) | |
| 52 | + | .bind(Utc::now().to_rfc3339()) | |
| 53 | + | .fetch_one(pool) | |
| 54 | + | .await?; | |
| 55 | + | Ok(RunId(id)) | |
| 56 | + | } | |
| 57 | + | ||
| 58 | + | /// Advance the in-flight phase. No-op once the run is terminal so a late | |
| 59 | + | /// phase write can't resurrect a finished row. | |
| 60 | + | pub async fn set_phase(pool: &SqlitePool, run_id: RunId, phase: Phase) -> Result<()> { | |
| 61 | + | sqlx::query("UPDATE build_runs SET phase = ? WHERE id = ? AND result = 'building'") | |
| 62 | + | .bind(phase.as_str()) | |
| 63 | + | .bind(run_id.0) | |
| 64 | + | .execute(pool) | |
| 65 | + | .await?; | |
| 66 | + | Ok(()) | |
| 67 | + | } | |
| 68 | + | ||
| 69 | + | /// Record the version once it's been read from the worktree's Cargo.toml. | |
| 70 | + | pub async fn set_version(pool: &SqlitePool, run_id: RunId, version: &Version) -> Result<()> { | |
| 71 | + | sqlx::query("UPDATE build_runs SET version = ? WHERE id = ? AND result = 'building'") | |
| 72 | + | .bind(version.to_string()) | |
| 73 | + | .bind(run_id.0) | |
| 74 | + | .execute(pool) | |
| 75 | + | .await?; | |
| 76 | + | Ok(()) | |
| 77 | + | } | |
| 78 | + | ||
| 79 | + | /// Settle the run green. First terminal write wins (guarded on `building`). | |
| 80 | + | pub async fn mark_passed(pool: &SqlitePool, run_id: RunId) -> Result<()> { | |
| 81 | + | sqlx::query( | |
| 82 | + | "UPDATE build_runs SET result = 'passed', phase = 'done', finished_at = ? | |
| 83 | + | WHERE id = ? AND result = 'building'", | |
| 84 | + | ) | |
| 85 | + | .bind(Utc::now().to_rfc3339()) | |
| 86 | + | .bind(run_id.0) | |
| 87 | + | .execute(pool) | |
| 88 | + | .await?; | |
| 89 | + | Ok(()) | |
| 90 | + | } | |
| 91 | + | ||
| 92 | + | /// Settle the run red with a human-readable cause. First terminal write wins, | |
| 93 | + | /// so the most specific failure (build compile error, first red gate) recorded | |
| 94 | + | /// before the task-level catch is the one that sticks. | |
| 95 | + | pub async fn mark_failed(pool: &SqlitePool, run_id: RunId, summary: &str) -> Result<()> { | |
| 96 | + | // Bound the stored summary — it's a headline, not the log. The full output | |
| 97 | + | // is at the gate's log_ref / journald. | |
| 98 | + | let summary: String = summary.chars().take(600).collect(); | |
| 99 | + | sqlx::query( | |
| 100 | + | "UPDATE build_runs SET result = 'failed', phase = 'done', failure_summary = ?, finished_at = ? | |
| 101 | + | WHERE id = ? AND result = 'building'", | |
| 102 | + | ) | |
| 103 | + | .bind(&summary) | |
| 104 | + | .bind(Utc::now().to_rfc3339()) | |
| 105 | + | .bind(run_id.0) | |
| 106 | + | .execute(pool) | |
| 107 | + | .await?; | |
| 108 | + | Ok(()) | |
| 109 | + | } | |
| 110 | + | ||
| 111 | + | /// Settle the run as superseded by a newer `/rebuild`. | |
| 112 | + | pub async fn mark_aborted(pool: &SqlitePool, run_id: RunId) -> Result<()> { | |
| 113 | + | sqlx::query( | |
| 114 | + | "UPDATE build_runs SET result = 'aborted', phase = 'done', | |
| 115 | + | failure_summary = 'superseded by a newer /rebuild', finished_at = ? | |
| 116 | + | WHERE id = ? AND result = 'building'", | |
| 117 | + | ) | |
| 118 | + | .bind(Utc::now().to_rfc3339()) | |
| 119 | + | .bind(run_id.0) | |
| 120 | + | .execute(pool) | |
| 121 | + | .await?; | |
| 122 | + | Ok(()) | |
| 123 | + | } | |
| 124 | + | ||
| 125 | + | /// One gate's status within a run view. | |
| 126 | + | #[derive(Debug, Serialize)] | |
| 127 | + | pub struct RunGateView { | |
| 128 | + | pub kind: String, | |
| 129 | + | /// `'passed' | 'failed' | 'blocked'` or NULL while in-flight. | |
| 130 | + | pub status: Option<String>, | |
| 131 | + | /// Relative path under `cfg.logs_root` for the full byte stream. | |
| 132 | + | pub log_ref: Option<String>, | |
| 133 | + | } | |
| 134 | + | ||
| 135 | + | /// The `GET /runs/{id}` payload. | |
| 136 | + | #[derive(Debug, Serialize)] | |
| 137 | + | pub struct RunView { | |
| 138 | + | pub run_id: i64, | |
| 139 | + | pub sha: String, | |
| 140 | + | pub version: Option<String>, | |
| 141 | + | pub phase: String, | |
| 142 | + | /// `'building' | 'passed' | 'failed' | 'aborted'`. | |
| 143 | + | pub result: String, | |
| 144 | + | pub started_at: String, | |
| 145 | + | pub finished_at: Option<String>, | |
| 146 | + | /// Headline cause when `result = 'failed'`: first compile error or first | |
| 147 | + | /// red gate. NULL otherwise. | |
| 148 | + | pub failure_summary: Option<String>, | |
| 149 | + | /// Gates run on the host tier for this run's version, latest row per kind. | |
| 150 | + | /// Empty until the run reaches a version + the gating phase. | |
| 151 | + | pub gates: Vec<RunGateView>, | |
| 152 | + | } | |
| 153 | + | ||
| 154 | + | /// Load a run plus its host-tier gate statuses. `None` if the id is unknown. | |
| 155 | + | pub async fn get(pool: &SqlitePool, run_id: RunId) -> Result<Option<RunView>> { | |
| 156 | + | let Some(row) = sqlx::query( | |
| 157 | + | "SELECT id, sha, version, phase, result, started_at, finished_at, failure_summary | |
| 158 | + | FROM build_runs WHERE id = ?", | |
| 159 | + | ) | |
| 160 | + | .bind(run_id.0) | |
| 161 | + | .fetch_optional(pool) | |
| 162 | + | .await? | |
| 163 | + | else { | |
| 164 | + | return Ok(None); | |
| 165 | + | }; | |
| 166 | + | ||
| 167 | + | let version: Option<String> = row.get("version"); | |
| 168 | + | // Gates are keyed by (tier, version); a build run drives the `host` tier. | |
| 169 | + | // Latest row per gate_kind, matching `/state`'s per-tier query shape. | |
| 170 | + | let gates: Vec<RunGateView> = if let Some(ver) = version.as_deref() { | |
| 171 | + | sqlx::query( | |
| 172 | + | "SELECT gate_kind, status, log_ref | |
| 173 | + | FROM gate_runs g | |
| 174 | + | WHERE tier = 'host' AND version = ?1 | |
| 175 | + | AND id = (SELECT MAX(id) FROM gate_runs | |
| 176 | + | WHERE tier = 'host' AND version = ?1 AND gate_kind = g.gate_kind) | |
| 177 | + | ORDER BY gate_kind", | |
| 178 | + | ) | |
| 179 | + | .bind(ver) | |
| 180 | + | .fetch_all(pool) | |
| 181 | + | .await? | |
| 182 | + | .into_iter() | |
| 183 | + | .map(|gr| RunGateView { | |
| 184 | + | kind: gr.get("gate_kind"), | |
| 185 | + | status: gr.get("status"), | |
| 186 | + | log_ref: gr.get("log_ref"), | |
| 187 | + | }) | |
| 188 | + | .collect() | |
| 189 | + | } else { | |
| 190 | + | Vec::new() | |
| 191 | + | }; | |
| 192 | + | ||
| 193 | + | Ok(Some(RunView { | |
| 194 | + | run_id: row.get("id"), | |
| 195 | + | sha: row.get("sha"), | |
| 196 | + | version, | |
| 197 | + | phase: row.get("phase"), | |
| 198 | + | result: row.get("result"), | |
| 199 | + | started_at: row.get("started_at"), | |
| 200 | + | finished_at: row.get("finished_at"), | |
| 201 | + | failure_summary: row.get("failure_summary"), | |
| 202 | + | gates, | |
| 203 | + | })) | |
| 204 | + | } | |
| 205 | + | ||
| 206 | + | /// Compact view of the latest build run for `/state`'s liveness line. | |
| 207 | + | #[derive(Debug, Serialize)] | |
| 208 | + | pub struct BuildSummary { | |
| 209 | + | pub run_id: i64, | |
| 210 | + | pub sha: String, | |
| 211 | + | pub version: Option<String>, | |
| 212 | + | pub phase: String, | |
| 213 | + | pub result: String, | |
| 214 | + | pub failure_summary: Option<String>, | |
| 215 | + | /// Seconds from start to finish (or to now while building). Lets a | |
| 216 | + | /// `/state` poller show "building <ver>, phase=<x>, elapsed Ns" instead of | |
| 217 | + | /// a version frozen at the last success for the whole ~10-min build. | |
| 218 | + | pub elapsed_s: i64, | |
| 219 | + | } | |
| 220 | + | ||
| 221 | + | /// The most recent build run, for `/state`. `None` until the first `/rebuild`. | |
| 222 | + | pub async fn latest_summary(pool: &SqlitePool) -> Result<Option<BuildSummary>> { | |
| 223 | + | let Some(row) = sqlx::query( | |
| 224 | + | "SELECT id, sha, version, phase, result, failure_summary, started_at, finished_at | |
| 225 | + | FROM build_runs ORDER BY id DESC LIMIT 1", | |
| 226 | + | ) | |
| 227 | + | .fetch_optional(pool) | |
| 228 | + | .await? | |
| 229 | + | else { | |
| 230 | + | return Ok(None); | |
| 231 | + | }; | |
| 232 | + | let started_at: String = row.get("started_at"); | |
| 233 | + | let finished_at: Option<String> = row.get("finished_at"); | |
| 234 | + | Ok(Some(BuildSummary { | |
| 235 | + | run_id: row.get("id"), | |
| 236 | + | sha: row.get("sha"), | |
| 237 | + | version: row.get("version"), | |
| 238 | + | phase: row.get("phase"), | |
| 239 | + | result: row.get("result"), | |
| 240 | + | failure_summary: row.get("failure_summary"), | |
| 241 | + | elapsed_s: elapsed_seconds(&started_at, finished_at.as_deref()), | |
| 242 | + | })) | |
| 243 | + | } | |
| 244 | + | ||
| 245 | + | /// Seconds between an rfc3339 `started_at` and (`finished_at` or now), clamped | |
| 246 | + | /// at 0. A parse failure yields 0 rather than erroring the whole `/state` call. | |
| 247 | + | fn elapsed_seconds(started_at: &str, finished_at: Option<&str>) -> i64 { | |
| 248 | + | let Ok(start) = chrono::DateTime::parse_from_rfc3339(started_at) else { | |
| 249 | + | return 0; | |
| 250 | + | }; | |
| 251 | + | let end = match finished_at { | |
| 252 | + | Some(f) => chrono::DateTime::parse_from_rfc3339(f) | |
| 253 | + | .map(|d| d.with_timezone(&Utc)) | |
| 254 | + | .unwrap_or_else(|_| Utc::now()), | |
| 255 | + | None => Utc::now(), | |
| 256 | + | }; | |
| 257 | + | (end - start.with_timezone(&Utc)).num_seconds().max(0) | |
| 258 | + | } | |
| 259 | + | ||
| 260 | + | /// The summary of the first failed gate for `version` on the host tier, if | |
| 261 | + | /// any — used by the build pipeline to populate `failure_summary` when | |
| 262 | + | /// `run_all` reports a red pipeline. Reads the typed `outcome_json` so the | |
| 263 | + | /// stored headline matches what the TUI renders. | |
| 264 | + | pub async fn first_failed_gate_summary(pool: &SqlitePool, version: &Version) -> Option<String> { | |
| 265 | + | let row = sqlx::query( | |
| 266 | + | "SELECT gate_kind, outcome_json FROM gate_runs | |
| 267 | + | WHERE tier = 'host' AND version = ? AND status = 'failed' | |
| 268 | + | ORDER BY id ASC LIMIT 1", | |
| 269 | + | ) | |
| 270 | + | .bind(version.to_string()) | |
| 271 | + | .fetch_optional(pool) | |
| 272 | + | .await | |
| 273 | + | .ok() | |
| 274 | + | .flatten()?; | |
| 275 | + | let kind: String = row.get("gate_kind"); | |
| 276 | + | let outcome_json: Option<String> = row.get("outcome_json"); | |
| 277 | + | let summary = outcome_json | |
| 278 | + | .and_then(|s| serde_json::from_str::<crate::outcome::GateOutcome>(&s).ok()) | |
| 279 | + | .map(|o| match o.status { | |
| 280 | + | crate::outcome::GateStatus::Failed { failure } => failure.summary(), | |
| 281 | + | other => format!("{:?}", other), | |
| 282 | + | }) | |
| 283 | + | .unwrap_or_else(|| "gate failed".to_string()); | |
| 284 | + | Some(format!("{kind}: {summary}")) | |
| 285 | + | } | |
| 286 | + | ||
#[cfg(test)]
mod tests {
    use super::*;
    use sqlx::sqlite::SqlitePoolOptions;

    /// Fresh in-memory SQLite pool (capped at one connection) with the schema
    /// migrated.
    async fn pool() -> SqlitePool {
        let pool = SqlitePoolOptions::new()
            .max_connections(1)
            .connect("sqlite::memory:")
            .await
            .unwrap();
        crate::db::migrate(&pool).await.unwrap();
        pool
    }

    #[tokio::test]
    async fn create_then_get_roundtrips_building() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        let v = get(&pool, id).await.unwrap().expect("run exists");
        assert_eq!(v.sha, "abc1234");
        assert_eq!(v.result, "building");
        assert_eq!(v.phase, "queued");
        assert!(v.version.is_none());
        assert!(v.gates.is_empty());
        assert!(v.failure_summary.is_none());
    }

    #[tokio::test]
    async fn phase_and_version_advance_then_pass() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        set_phase(&pool, id, Phase::Compiling).await.unwrap();
        let ver: Version = "0.10.2".parse().unwrap();
        set_version(&pool, id, &ver).await.unwrap();
        mark_passed(&pool, id).await.unwrap();

        let v = get(&pool, id).await.unwrap().unwrap();
        assert_eq!(v.result, "passed");
        assert_eq!(v.phase, "done");
        assert_eq!(v.version.as_deref(), Some("0.10.2"));
        assert!(v.finished_at.is_some());
    }

    #[tokio::test]
    async fn first_terminal_write_wins() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        mark_failed(&pool, id, "error[E0063]: missing field user_pages_host").await.unwrap();
        // A later pass attempt (e.g. the task catch racing a build-step error)
        // must not overwrite the recorded failure.
        mark_passed(&pool, id).await.unwrap();
        // And a second failure summary doesn't clobber the first.
        mark_failed(&pool, id, "something else").await.unwrap();

        let v = get(&pool, id).await.unwrap().unwrap();
        assert_eq!(v.result, "failed");
        assert_eq!(v.failure_summary.as_deref(), Some("error[E0063]: missing field user_pages_host"));
    }

    #[tokio::test]
    async fn aborted_run_records_supersede_reason() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        mark_aborted(&pool, id).await.unwrap();
        // `aborted` is terminal: a pass or failure arriving late from the
        // superseded task must not resurrect or relabel the run.
        mark_passed(&pool, id).await.unwrap();
        mark_failed(&pool, id, "late failure").await.unwrap();

        let v = get(&pool, id).await.unwrap().unwrap();
        assert_eq!(v.result, "aborted");
        assert_eq!(v.phase, "done");
        assert_eq!(v.failure_summary.as_deref(), Some("superseded by a newer /rebuild"));
        assert!(v.finished_at.is_some());
    }

    #[tokio::test]
    async fn phase_write_after_terminal_is_noop() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        mark_passed(&pool, id).await.unwrap();
        set_phase(&pool, id, Phase::Gating).await.unwrap();
        let v = get(&pool, id).await.unwrap().unwrap();
        assert_eq!(v.phase, "done", "a late phase write must not move a finished run");
    }

    #[test]
    fn elapsed_seconds_uses_finished_when_present() {
        // Both timestamps present → exact span, no wall-clock dependency.
        let s = elapsed_seconds("2026-06-13T00:00:00Z", Some("2026-06-13T00:02:05Z"));
        assert_eq!(s, 125);
        // Unparseable start → 0, never a panic / negative.
        assert_eq!(elapsed_seconds("not-a-date", None), 0);
    }

    #[test]
    fn elapsed_seconds_clamps_reversed_span_to_zero() {
        // finished_at earlier than started_at (clock skew, hand-edited row)
        // must report 0 rather than a negative duration.
        let s = elapsed_seconds("2026-06-13T00:02:05Z", Some("2026-06-13T00:00:00Z"));
        assert_eq!(s, 0);
    }

    #[tokio::test]
    async fn latest_summary_reports_most_recent_run() {
        let pool = pool().await;
        assert!(latest_summary(&pool).await.unwrap().is_none());
        let _old = create(&pool, "old1234").await.unwrap();
        let new = create(&pool, "new5678").await.unwrap();
        set_phase(&pool, new, Phase::Compiling).await.unwrap();
        let sum = latest_summary(&pool).await.unwrap().expect("a run exists");
        assert_eq!(sum.run_id, new.0);
        assert_eq!(sum.sha, "new5678");
        assert_eq!(sum.phase, "compiling");
        assert_eq!(sum.result, "building");
    }

    #[tokio::test]
    async fn get_unknown_id_is_none() {
        let pool = pool().await;
        assert!(get(&pool, RunId(999)).await.unwrap().is_none());
    }

    #[tokio::test]
    async fn failure_summary_is_bounded() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        mark_failed(&pool, id, &"x".repeat(5_000)).await.unwrap();
        let v = get(&pool, id).await.unwrap().unwrap();
        // Exactly the cap, not merely "at most": an empty or missing summary
        // must not satisfy this test.
        assert_eq!(v.failure_summary.unwrap().chars().count(), 600);
    }

    #[tokio::test]
    async fn failure_summary_bound_counts_chars_not_bytes() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        // Two-byte characters: truncation must land on a character boundary
        // and keep 600 characters (1200 bytes), not 600 bytes.
        mark_failed(&pool, id, &"é".repeat(5_000)).await.unwrap();
        let v = get(&pool, id).await.unwrap().unwrap();
        let stored = v.failure_summary.unwrap();
        assert_eq!(stored.chars().count(), 600);
        assert!(stored.chars().all(|c| c == 'é'));
    }
}
| @@ -15,6 +15,13 @@ use tokio::task::AbortHandle; | |||
| 15 | 15 | /// constructing ssh/rsync invocations inline. | |
| 16 | 16 | pub type ExecutorMap = HashMap<NodeId, Arc<dyn Executor>>; | |
| 17 | 17 | ||
| 18 | + | /// The in-flight build pipeline: its abort handle plus the `build_runs` row it | |
| 19 | + | /// drives. A newer `/rebuild` aborts the handle and settles the row `aborted`. | |
| 20 | + | pub struct ActiveBuild { | |
| 21 | + | pub handle: AbortHandle, | |
| 22 | + | pub run_id: crate::domain::RunId, | |
| 23 | + | } | |
| 24 | + | ||
| 18 | 25 | #[derive(Clone)] | |
| 19 | 26 | pub struct AppState { | |
| 20 | 27 | pub pool: SqlitePool, | |
| @@ -22,8 +29,10 @@ pub struct AppState { | |||
| 22 | 29 | pub cfg: Arc<Config>, | |
| 23 | 30 | pub prom: PrometheusHandle, | |
| 24 | 31 | /// Single-slot guard for the build pipeline. A new /rebuild aborts any | |
| 25 | - | /// in-flight build (cargo + gates) so the latest push always wins. | |
| 26 | - | pub active_build: Arc<Mutex<Option<AbortHandle>>>, | |
| 32 | + | /// in-flight build (cargo + gates) so the latest push always wins. Carries | |
| 33 | + | /// the run id alongside the handle so the aborting `/rebuild` can settle | |
| 34 | + | /// the superseded `build_runs` row as `aborted`. | |
| 35 | + | pub active_build: Arc<Mutex<Option<ActiveBuild>>>, | |
| 27 | 36 | /// Serializes the deploy mutators (`/promote`, `/rollback`) so their | |
| 28 | 37 | /// check -> deploy -> advance sequences never interleave. Without it two | |
| 29 | 38 | /// concurrent promotes (or a promote racing a rollback) could deploy mixed |
| @@ -16,6 +16,12 @@ release_root = "/srv/sando" | |||
| 16 | 16 | scratch_db_url = "postgres:///sando_scratch?host=/var/run/postgresql" | |
| 17 | 17 | bin_names = ["makenotwork", "mnw-admin"] | |
| 18 | 18 | logs_root = "/srv/sando/logs" | |
| 19 | + | # Shared cargo target dir across per-sha worktrees. Without it every /rebuild | |
| 20 | + | # clean-compiles a fresh worktree (~10 min) even for a 1-line diff; with it the | |
| 21 | + | # incremental rebuild reuses the previous sha's compiled deps (1–2 min). Safe | |
| 22 | + | # because builds are serialized (a new /rebuild aborts the in-flight one). The | |
| 23 | + | # sando user must be able to write it; cargo creates it if absent. | |
| 24 | + | cargo_target_dir = "/srv/sando/cargo-target" | |
| 19 | 25 | ||
| 20 | 26 | # Non-binary content shipped as part of each release. Multiple entries can | |
| 21 | 27 | # target the same `dst` (additive merge — used to build `docs/` from three |
| @@ -489,9 +489,14 @@ fn pass_note_short(n: &PassNote) -> String { | |||
| 489 | 489 | ||
| 490 | 490 | fn failure_short(f: &GateFailure) -> String { | |
| 491 | 491 | match f { | |
| 492 | - | GateFailure::CargoTest { failed_count, first_failed: Some(name) } => | |
| 492 | + | GateFailure::CargoTest { failed_count, first_panic: Some(p), .. } => | |
| 493 | + | format!("{failed_count} test(s); panic: {p}"), | |
| 494 | + | GateFailure::CargoTest { failed_count, first_failed: Some(name), .. } => | |
| 493 | 495 | format!("{failed_count} test(s); first {name}"), | |
| 494 | 496 | GateFailure::CargoTest { failed_count, .. } => format!("{failed_count} test(s) failed"), | |
| 497 | + | GateFailure::CompileError { error_count, first_error: Some(e) } => | |
| 498 | + | format!("compile failed ({error_count}); {e}"), | |
| 499 | + | GateFailure::CompileError { error_count, .. } => format!("compile failed ({error_count})"), | |
| 495 | 500 | GateFailure::MigrationDrift { migration } => format!("drift {migration}"), | |
| 496 | 501 | GateFailure::MigrationModified { migration } => format!("modified {migration}"), | |
| 497 | 502 | GateFailure::MigrationSqlError { migration, sqlstate: Some(s) } => |