Skip to main content

max / makenotwork

sando: build-run observability, fast pre-gate, shared build cache Close the D2/D2.1 deploy-reliability findings from the 2026-06-12/13 carousel + custom-pages deploys. Build-run tracking (the headless-driver gap): - build_runs table (migration 007) + RunId; /rebuild returns run_id - GET /runs/{id} -> phase/result/gates/failure_summary; GET /runs/{id}/wait long-polls to completion - /state carries a `build` summary so a poller sees in-flight/failed builds instead of a version frozen at the last success - failure_summary carries the cause: classified compile error, first red gate's typed summary, or the anyhow chain Gate speed + diagnosis: - cargo_test runs `cargo test --no-run` as a fast pre-gate; typed CompileError surfaces error[Ennnn] immediately, cache-shared with the run - classify_cargo_test extracts the first root-cause panic, skipping the "Once poisoned" cascade - cargo_target_dir config: one shared CARGO_TARGET_DIR across worktrees so an incremental diff reuses the prior sha's deps (~10 min -> 1-2 min) - /rebuild {} fetches upstream then resolves the branch HEAD - stale-test-db cleanup excludes %template% so the harness reuses it Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Author: Max Johnson <me@maxj.phd> · 2026-06-13 22:41 UTC
Commit: 614c3e7c8a37d5880a734de1b64fd97bf632efbe
Parent: af2c94f
16 files changed, +1078 insertions, -80 deletions
M .gitignore +3
@@ -54,3 +54,6 @@ audit_review.md
54 54 # sandod local state (regenerable)
55 55 sando/daemon/sando.db
56 56 sando/daemon/sando.db-*
57 + sando/daemon/work/
58 + sando/daemon/releases/
59 + sando/daemon/cargo-target/
M sando/README.md +4 -2
@@ -102,8 +102,10 @@ curl -X POST http://127.0.0.1:7766/promote/a \
102 102
103 103 | Method | Path | Body | Purpose |
104 104 |--------|------|------|---------|
105 - | GET | `/state` | — | Tier list + current/previous version + last gate outcomes |
106 - | POST | `/rebuild` | `{sha?: string}` | Force a build; if `sha` is absent, resolves the configured deploy branch. Aborts any in-flight build (latest wins). |
105 + | GET | `/state` | — | Tier list + current/previous version + last gate outcomes, plus `build` (latest build run: phase/result/failure_summary/elapsed_s, `null` until first `/rebuild`) so a poller sees in-flight/failed builds, not a frozen version |
106 + | POST | `/rebuild` | `{sha?: string}` | Force a build; if `sha` is absent, resolves the configured deploy branch. Aborts any in-flight build (latest wins). Returns `{accepted, sha, run_id}`. |
107 + | GET | `/runs/{id}` | — | Build-status of the run a `/rebuild` returned: `{run_id, sha, version, phase, result, failure_summary, gates[], started_at, finished_at}`. The pollable resource for a non-TUI driver — `/state` only reflects the last *successful* version. |
108 + | GET | `/runs/{id}/wait` | `?timeout_ms=` | Long-poll: blocks until the run settles or `timeout_ms` (default 30s, cap 120s) elapses, then returns the same `RunView`. Fire `/rebuild` → block on `/wait`. |
107 109 | POST | `/promote/{tier}` | `{version?, hotfix?, reset_burn_in?}` | Verify predecessor gates, deploy to tier nodes, advance state. `version` defaults to the predecessor tier's `current_version`. |
108 110 | POST | `/rollback/{tier}` | — | Swap `current` symlink to `previous_version` on every node in the tier |
109 111 | POST | `/confirm/{tier}` | — | Insert a passing `manual_confirm` gate row for the tier's `current_version`. Replaces hand-SQL. |
@@ -0,0 +1,26 @@
-- One row per `/rebuild` invocation: tracks a build attempt through its
-- lifecycle so a non-TUI caller can poll `GET /runs/{id}` for build status
-- instead of inferring it from `/state` — which only ever reflects the last
-- *successful* version and so reports stale-green for the whole duration of a
-- failing build (the 0.10.2 headless-deploy incident).
--
-- `version` is denormalized (no FK): a run may fail before a `versions` row
-- exists (fetch/checkout/compile error) and we still want its failure
-- recorded. `result` is the terminal verdict — 'building' until the pipeline
-- settles, then 'passed' | 'failed' | 'aborted'. `phase` is the in-flight
-- sub-state ('queued' | 'fetching' | 'compiling' | 'staging' | 'gating' |
-- 'done'). Terminal transitions are guarded on `result = 'building'` so the
-- first writer (a build-step error, a gate failure, or the task-level catch)
-- wins and later writes are no-ops.
--
-- NOTE(review): the allowed `phase` / `result` values are enforced only by the
-- writing code, not by a CHECK constraint — confirm that is intentional.
CREATE TABLE build_runs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,    -- the `run_id` that `/rebuild` returns
    sha TEXT NOT NULL,                       -- commit the run was asked to build
    version TEXT,                            -- NULL until the package version has been read
    phase TEXT NOT NULL DEFAULT 'queued',    -- in-flight sub-state (see header)
    result TEXT NOT NULL DEFAULT 'building', -- terminal verdict (see header)
    failure_summary TEXT,                    -- cause of a failed run; NULL otherwise
    started_at TEXT NOT NULL,                -- presumably an RFC 3339 timestamp — TODO confirm against the writer
    finished_at TEXT                         -- presumably NULL until the run settles — TODO confirm
);

-- Index for finding the run(s) of a given commit sha.
CREATE INDEX build_runs_by_sha ON build_runs(sha);
@@ -4,5 +4,9 @@ db_path = "./sando.db"
4 4 topology_path = "../sando.toml"
5 5 workdir = "./work"
6 6 release_root = "./releases"
7 + # Shared cargo target dir across per-sha worktrees — incremental rebuilds reuse
8 + # the previous sha's compiled deps instead of clean-compiling each fresh
9 + # worktree. Safe because builds are serialized. Omit for per-worktree target/.
10 + cargo_target_dir = "./cargo-target"
7 11 # Dropped and recreated on every migration_dry_run. Leave unset to skip.
8 12 scratch_db_url = "postgres://sando@127.0.0.1/sando_scratch"
@@ -6,7 +6,7 @@
6 6
7 7 use crate::config::Config;
8 8 use crate::deploy;
9 - use crate::domain::{GitSha, TierId, Version};
9 + use crate::domain::{GitSha, RunId, TierId, Version};
10 10 use crate::gates::{self, GateCtx};
11 11 use crate::git;
12 12 use crate::topology::Topology;
@@ -34,10 +34,13 @@ pub async fn run(
34 34 topo: Arc<Topology>,
35 35 sha: GitSha,
36 36 events: crate::events::EventTx,
37 + run_id: RunId,
37 38 ) -> Result<BuildArtifact> {
38 39 let worktree = cfg.workdir.join(sha.as_str());
39 40 let bare = PathBuf::from(&topo.repo.bare_path);
40 41
42 + crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Fetching).await.ok();
43 +
41 44 // Pull-based ingestion: if an upstream remote is configured, fetch the
42 45 // deploy branch so a just-pushed sha is locally resolvable. A fetch
43 46 // failure is non-fatal — the sha may already be present from a prior
@@ -59,6 +62,7 @@ pub async fn run(
59 62 let server_dir = worktree.join("server");
60 63 let version = read_pkg_version(&server_dir.join("Cargo.toml")).await
61 64 .with_context(|| format!("reading version from {}/Cargo.toml", server_dir.display()))?;
65 + crate::runs::set_version(&pool, run_id, &version).await.ok();
62 66
63 67 // sqlx compile-time query checking needs a live DB with the current schema.
64 68 // We point cargo at the scratch DB and prep it (drop public, re-migrate)
@@ -70,6 +74,12 @@ pub async fn run(
70 74 .arg("--release")
71 75 .current_dir(&server_dir)
72 76 .kill_on_drop(true);
77 + // Shared build cache across per-sha worktrees: reuse one target dir so an
78 + // incremental diff doesn't clean-compile from scratch. Serialized builds
79 + // make this contention-free. Unset → cargo's default per-worktree target/.
80 + if let Some(target) = cfg.cargo_target_dir.as_deref() {
81 + cargo_cmd.env("CARGO_TARGET_DIR", target);
82 + }
73 83 if let Some(scratch_url) = cfg.scratch_db_url.as_deref() {
74 84 tracing::info!(sha = %sha.as_str(), "preparing scratch DB schema for sqlx compile-time checks");
75 85 crate::gates::reset_scratch(scratch_url).await
@@ -81,6 +91,7 @@ pub async fn run(
81 91 tracing::warn!("scratch_db_url unset; sqlx will fall back to offline mode and may fail");
82 92 }
83 93
94 + crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Compiling).await.ok();
84 95 tracing::info!(sha = %sha, version = %version, dir = %server_dir.display(), "cargo build --release start");
85 96 crate::events::emit(&events, crate::events::Event::BuildStart {
86 97 sha: sha.clone(), version: version.clone(),
@@ -96,19 +107,24 @@ pub async fn run(
96 107 crate::events::emit(&events, crate::events::Event::BuildFailed {
97 108 sha: sha.clone(), version: version.clone(), elapsed_s,
98 109 });
99 - } else {
100 - tracing::info!(sha = %sha, version = %version, elapsed_s, "cargo build --release ok");
101 - crate::events::emit(&events, crate::events::Event::BuildOk {
102 - sha: sha.clone(), version: version.clone(), elapsed_s,
103 - });
110 + // Settle the run with the headline compiler diagnostic (not the raw
111 + // 4 KB tail) so `GET /runs/{id}` answers "why" without a journald dive.
112 + let summary = crate::classify::classify_compile_error(&out.stdout, &out.stderr).summary();
113 + crate::runs::mark_failed(&pool, run_id, &summary).await.ok();
114 + anyhow::bail!("cargo build --release failed:\n{}", tail(&out.stderr, 4_000));
104 115 }
105 - anyhow::ensure!(
106 - out.status.success(),
107 - "cargo build --release failed:\n{}",
108 - tail(&out.stderr, 4_000),
109 - );
116 + tracing::info!(sha = %sha, version = %version, elapsed_s, "cargo build --release ok");
117 + crate::events::emit(&events, crate::events::Event::BuildOk {
118 + sha: sha.clone(), version: version.clone(), elapsed_s,
119 + });
110 120
111 - let release_dir = server_dir.join("target/release");
121 + // Binaries land under `<target>/release/`; with a shared target dir that's
122 + // not inside the worktree, so resolve it the same way cargo did above.
123 + let release_dir = cfg
124 + .cargo_target_dir
125 + .as_deref()
126 + .map(|t| t.join("release"))
127 + .unwrap_or_else(|| server_dir.join("target/release"));
112 128 let mut binary_paths = Vec::with_capacity(cfg.bin_names.len());
113 129 for name in &cfg.bin_names {
114 130 let p = release_dir.join(name);
@@ -144,8 +160,11 @@ pub async fn build_and_run_host(
144 160 topo: Arc<Topology>,
145 161 sha: GitSha,
146 162 events: crate::events::EventTx,
163 + run_id: RunId,
147 164 ) -> Result<()> {
148 - let art = run(pool.clone(), cfg.clone(), topo.clone(), sha, events.clone()).await?;
165 + let art = run(pool.clone(), cfg.clone(), topo.clone(), sha, events.clone(), run_id).await?;
166 +
167 + crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Staging).await.ok();
149 168
150 169 // Stage the binary in the host's release_root so future gates and the
151 170 // host self-deploy point at a stable path, not the worktree's target/.
@@ -170,6 +189,7 @@ pub async fn build_and_run_host(
170 189 let host = topo.tiers.iter().find(|t| t.name.as_str() == "host")
171 190 .context("topology has no `host` tier")?;
172 191
192 + crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Gating).await.ok();
173 193 let ctx = GateCtx {
174 194 pool: pool.clone(),
175 195 cfg: cfg.clone(),
@@ -194,8 +214,15 @@ pub async fn build_and_run_host(
194 214 .bind(Utc::now().to_rfc3339())
195 215 .execute(&pool)
196 216 .await?;
217 + crate::runs::mark_passed(&pool, run_id).await.ok();
197 218 tracing::info!(version = %art.version, "host pipeline green; ready to promote to next tier");
198 219 } else {
220 + // Pull the first red gate's typed summary into the run so the API
221 + // answers "which gate, and why" — not just "failed".
222 + let summary = crate::runs::first_failed_gate_summary(&pool, &art.version)
223 + .await
224 + .unwrap_or_else(|| "host pipeline red".to_string());
225 + crate::runs::mark_failed(&pool, run_id, &summary).await.ok();
199 226 tracing::warn!(version = %art.version, "host pipeline red; not advancing tier_state");
200 227 }
201 228 Ok(())
@@ -63,7 +63,99 @@ pub fn classify_cargo_test(stdout: &[u8], stderr: &[u8]) -> GateFailure {
63 63 };
64 64 }
65 65
66 - GateFailure::CargoTest { failed_count, first_failed }
66 + let first_panic = extract_first_panic(&stdout_s);
67 + GateFailure::CargoTest { failed_count, first_failed, first_panic }
68 + }
69 +
70 + /// Pull the first *root-cause* panic message out of libtest's captured
71 + /// output. libtest (Rust 2021+) prints each captured panic as:
72 + /// thread '<test>' panicked at <file>:<line>:<col>:
73 + /// <message>
74 + /// We return the first panic's message — but skip "...poisoned" messages in
75 + /// favour of the first non-poison one, because a single real panic in shared
76 + /// setup (a `std::sync::Once`) poisons it and makes every *other* test report
77 + /// "Once instance has previously been poisoned". The root cause is the one
78 + /// panic that isn't a poison report. Falls back to the first panic of any
79 + /// kind if every message looks like poison.
80 + fn extract_first_panic(stdout: &str) -> Option<String> {
81 + let mut first: Option<String> = None;
82 + let mut lines = stdout.lines();
83 + while let Some(line) = lines.next() {
84 + if !line.contains("panicked at ") {
85 + continue;
86 + }
87 + // The message is the first non-empty line after the `panicked at` loc.
88 + let msg = lines.by_ref().map(str::trim).find(|l| !l.is_empty());
89 + let Some(msg) = msg else { continue };
90 + if first.is_none() {
91 + first = Some(msg.to_string());
92 + }
93 + let is_poison = msg.contains("poisoned") || msg.contains("PoisonError");
94 + if !is_poison {
95 + return Some(msg.to_string());
96 + }
97 + }
98 + first
99 + }
100 +
101 + /// `cargo test --no-run` (the fast pre-gate compile): pull the first
102 + /// compiler diagnostic out of cargo's stderr so a test-only-target
103 + /// compile break (e.g. a missing struct field in a `#[cfg(test)]`-only
104 + /// target) surfaces as the actual `error[E0063]: missing field ...`
105 + /// line, instead of after a full build + a partial run reported as an
106 + /// opaque "N tests failed".
107 + ///
108 + /// Cargo writes diagnostics to stderr. We prefer the first coded
109 + /// `error[Ennnn]: ...` headline over the trailing `error: could not
110 + /// compile <crate> ... due to N previous errors` summary, which names
111 + /// the crate but not the cause; the summary still gives us the count.
112 + pub fn classify_compile_error(stdout: &[u8], stderr: &[u8]) -> GateFailure {
113 + let stderr_s = String::from_utf8_lossy(stderr);
114 + let mut first_error: Option<String> = None;
115 + let mut error_count: u32 = 0;
116 +
117 + for line in stderr_s.lines() {
118 + let t = line.trim_start();
119 + if first_error.is_none() && t.starts_with("error[") {
120 + first_error = Some(t.to_string());
121 + }
122 + if let Some(rest) = t.strip_prefix("error: could not compile")
123 + && let Some(n) = parse_due_to_count(rest)
124 + {
125 + error_count = n;
126 + }
127 + }
128 +
129 + // No coded diagnostic (e.g. a macro or resolver error prints a bare
130 + // `error: ...`). Take the first such line that isn't cargo's own
131 + // summary/abort noise.
132 + if first_error.is_none() {
133 + for line in stderr_s.lines() {
134 + let t = line.trim_start();
135 + if t.starts_with("error:")
136 + && !t.starts_with("error: could not compile")
137 + && !t.starts_with("error: aborting")
138 + {
139 + first_error = Some(t.to_string());
140 + break;
141 + }
142 + }
143 + }
144 +
145 + if first_error.is_none() && error_count == 0 {
146 + // Didn't look like a compile failure — don't masquerade as one.
147 + return GateFailure::Unclassified {
148 + legacy_detail: Some(combined_tail_for_classifier(stdout, stderr)),
149 + };
150 + }
151 + GateFailure::CompileError { error_count, first_error }
152 + }
153 +
/// Parse the count out of `... due to N previous error(s)`.
///
/// Returns the run of ASCII digits that immediately follows the first
/// `"due to "` in `s`. Returns `None` when the phrase is absent, when no
/// digit follows it, or when the number does not fit in a `u32`.
fn parse_due_to_count(s: &str) -> Option<u32> {
    let (_, tail) = s.split_once("due to ")?;
    // End of the leading digit run; ASCII digits are one byte each, so this
    // is always a char boundary.
    let end = tail
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(tail.len());
    tail[..end].parse().ok()
}
68 160
69 161 /// `migration_dry_run` is staged: scratch reset → restore dump → run
@@ -217,11 +309,61 @@ failures:\n\
217 309 foo::baz\n\
218 310 \n\
219 311 test result: FAILED. 10 passed; 2 failed; 0 ignored\n";
220 - let GateFailure::CargoTest { failed_count, first_failed } =
312 + let GateFailure::CargoTest { failed_count, first_failed, first_panic } =
221 313 classify_cargo_test(stdout, b"")
222 314 else { panic!("expected CargoTest variant"); };
223 315 assert_eq!(failed_count, 2);
224 316 assert_eq!(first_failed.as_deref(), Some("foo::bar"));
317 + // No `panicked at` lines in this fixture.
318 + assert_eq!(first_panic, None);
319 + }
320 +
/// `classify_cargo_test` must report the non-poison panic message as
/// `first_panic` even though a poison report is printed before it, and must
/// still parse the failure count from the `test result:` line.
#[test]
fn cargo_test_sees_through_poison_cascade_to_root_panic() {
    // The shape that produced the opaque "856 failed": one real panic in
    // shared setup poisons a `Once`, and every other test then reports the
    // poison. The classifier must surface the real cause, not the poison.
    let stdout = b"failures:\n\n\
        ---- harness::a stdout ----\n\
        thread 'harness::a' panicked at tests/harness/db.rs:42:9:\n\
        Once instance has previously been poisoned\n\
        \n\
        ---- harness::root stdout ----\n\
        thread 'harness::root' panicked at tests/harness/db.rs:30:5:\n\
        template database \"mnw_test_template\" does not exist\n\
        \n\
        failures:\n\
        harness::a\n\
        harness::root\n\
        \n\
        test result: FAILED. 0 passed; 856 failed; 0 ignored\n";
    let GateFailure::CargoTest { failed_count, first_panic, .. } =
        classify_cargo_test(stdout, b"")
    else { panic!("expected CargoTest variant"); };
    assert_eq!(failed_count, 856);
    assert_eq!(
        first_panic.as_deref(),
        Some("template database \"mnw_test_template\" does not exist"),
        "must skip the poison message for the root cause",
    );
}
350 +
/// When the only panic in the output is a poison report, `first_panic` is
/// that poison message rather than `None`.
#[test]
fn cargo_test_panic_falls_back_when_all_poison() {
    // If every panic is a poison report, return the first one rather than
    // nothing — better than an opaque count.
    let stdout = b"failures:\n\n\
        ---- harness::a stdout ----\n\
        thread 'harness::a' panicked at x.rs:1:1:\n\
        Once instance has previously been poisoned\n\
        \n\
        failures:\n harness::a\n\
        \n\
        test result: FAILED. 0 passed; 3 failed; 0 ignored\n";
    let GateFailure::CargoTest { first_panic, .. } =
        classify_cargo_test(stdout, b"")
    else { panic!("expected CargoTest variant"); };
    assert_eq!(first_panic.as_deref(), Some("Once instance has previously been poisoned"));
}
226 368
227 369 #[test]
@@ -238,6 +380,49 @@ test result: FAILED. 10 passed; 2 failed; 0 ignored\n";
238 380 }
239 381
/// A coded `error[Ennnn]` headline becomes `first_error`, and the count is
/// taken from cargo's trailing `could not compile ... due to N` summary.
#[test]
fn compile_error_extracts_first_coded_diagnostic_and_count() {
    // Real `cargo test --no-run` shape: the headline diagnostic, then
    // the trailing summary that carries the count.
    let stderr = b" Compiling makenotwork v0.10.2\n\
        error[E0063]: missing field `user_pages_host` in initializer of `Config`\n \
        --> src/config.rs:412:21\n\
        error: could not compile `makenotwork` (lib test) due to 1 previous error\n";
    let GateFailure::CompileError { error_count, first_error } =
        classify_compile_error(b"", stderr)
    else { panic!("expected CompileError variant"); };
    assert_eq!(error_count, 1);
    assert_eq!(
        first_error.as_deref(),
        Some("error[E0063]: missing field `user_pages_host` in initializer of `Config`"),
    );
}
399 +
/// With no coded diagnostic present, the first bare `error:` line that is not
/// cargo's `could not compile` summary becomes `first_error`.
#[test]
fn compile_error_falls_back_to_bare_error_line() {
    // A macro/resolver error has no `error[Ennnn]` code; we still want
    // the first real `error:` line, not the cargo summary.
    let stderr = b"error: cannot find macro `foo` in this scope\n\
        error: could not compile `makenotwork` (lib test) due to 2 previous errors\n";
    let GateFailure::CompileError { error_count, first_error } =
        classify_compile_error(b"", stderr)
    else { panic!("expected CompileError variant"); };
    assert_eq!(error_count, 2);
    assert_eq!(first_error.as_deref(), Some("error: cannot find macro `foo` in this scope"));
}
412 +
/// Output with neither an `error` diagnostic nor a `could not compile`
/// summary is reported as `Unclassified`, carrying the raw output tail.
#[test]
fn compile_error_unclassified_when_not_a_compile_failure() {
    // No `error[...]`, no `could not compile` — hand back the tail.
    let f = classify_compile_error(b"", b"warning: unused import\n");
    match f {
        GateFailure::Unclassified { legacy_detail: Some(d) } => {
            assert!(d.contains("unused import"));
        }
        other => panic!("expected Unclassified, got {other:?}"),
    }
}
424 +
425 + #[test]
241 426 fn migration_drift_extracts_name() {
242 427 let err = "migration 0047_widgets was previously applied but is missing in the resolved migrations";
243 428 let f = classify_migration_error(err, None);
@@ -26,6 +26,16 @@ pub struct Config {
26 26 /// Served via `GET /logs/{version}/{gate}`. Defaults to `/srv/sando/logs`.
27 27 #[serde(default = "default_logs_root")]
28 28 pub logs_root: PathBuf,
29 + /// Shared cargo target dir. When set, every `cargo build`/`cargo test` the
30 + /// pipeline runs uses this one `CARGO_TARGET_DIR` instead of each per-sha
31 + /// worktree's own `target/`, so a 1-line diff reuses the previous sha's
32 + /// compiled dependencies (a ~10-min clean build becomes a 1–2-min
33 + /// incremental one). Safe because builds are serialized — a new `/rebuild`
34 + /// aborts the in-flight one — so no two cargo invocations ever share the
35 + /// dir concurrently. Unset = per-worktree `target/` (the historical
36 + /// behavior). Cargo creates the dir if absent.
37 + #[serde(default)]
38 + pub cargo_target_dir: Option<PathBuf>,
29 39 /// Non-binary contents to stage into each release dir alongside
30 40 /// `bin_names`. Each entry copies `worktree/<src>` into
31 41 /// `<release>/<dst>`. `required=false` makes a missing source a warn
@@ -54,6 +64,32 @@ pub struct ReleaseEntry {
54 64 fn default_bin_names() -> Vec<String> { vec!["server".into()] }
55 65 fn default_logs_root() -> PathBuf { PathBuf::from("/srv/sando/logs") }
56 66
// Deserialization tests for the optional `cargo_target_dir` setting.
//
// NOTE(review): this module is declared before `impl Config`; clippy's
// `items_after_test_module` lint may flag that — consider moving the module
// to the end of the file.
#[cfg(test)]
mod tests {
    use super::*;

    /// Config text with only these five keys set; both tests below expect it
    /// to deserialize into `Config`.
    const MINIMAL: &str = r#"
listen = "127.0.0.1:7766"
db_path = "./sando.db"
topology_path = "../sando.toml"
workdir = "./work"
release_root = "./releases"
"#;

    /// An explicit `cargo_target_dir` key is read back as `Some(path)`.
    #[test]
    fn cargo_target_dir_parses_when_present() {
        let raw = format!("{MINIMAL}\ncargo_target_dir = \"/srv/sando/cargo-target\"\n");
        let cfg: Config = toml::from_str(&raw).unwrap();
        assert_eq!(cfg.cargo_target_dir.as_deref(), Some(std::path::Path::new("/srv/sando/cargo-target")));
    }

    /// Leaving the key out yields `None`.
    #[test]
    fn cargo_target_dir_defaults_to_none() {
        let cfg: Config = toml::from_str(MINIMAL).unwrap();
        assert!(cfg.cargo_target_dir.is_none(), "omitting it keeps the per-worktree target/");
    }
}
92 +
57 93 impl Config {
58 94 /// Primary binary — the one the systemd unit's ExecStart points at.
59 95 pub fn primary_bin(&self) -> &str {
@@ -79,6 +115,7 @@ impl Config {
79 115 bin_names: vec!["server".into()],
80 116 logs_root: PathBuf::from("/tmp/sando-test-logs"),
81 117 release_contents: Vec::new(),
118 + cargo_target_dir: None,
82 119 }
83 120 }
84 121 }
@@ -327,6 +327,18 @@ impl fmt::Display for DeployId {
327 327 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
328 328 }
329 329
/// Primary key of `build_runs` — the resource a `/rebuild` returns and a
/// non-TUI driver polls via `GET /runs/{id}`. Distinct from `GateRunId`
/// (one build run drives many gate runs).
///
/// A transparent newtype over `i64` for both serde and sqlx, so it
/// serializes and binds as the bare integer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, sqlx::Type)]
#[sqlx(transparent)]
#[serde(transparent)]
pub struct RunId(pub i64);

/// Formats the id by delegating to the wrapped integer's `fmt`.
impl fmt::Display for RunId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
}
341 +
330 342 #[cfg(test)]
331 343 mod tests {
332 344 use super::*;
@@ -118,42 +118,46 @@ pub async fn run_all(ctx: &GateCtx, gates: &[Gate]) -> Result<bool> {
118 118
119 119 async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> {
120 120 let server_dir = ctx.worktree.join("server");
121 - let mut cmd = Command::new("cargo");
122 - // Match CI (`server/deploy/run-ci.sh`): `--features fast-tests` relaxes
123 - // auth rate-limit burst (5 → 20) and argon2 cost so signup-heavy + lockout
124 - // workflow tests can complete without hitting Governor before the
125 - // hand-rolled lockout check. The feature is specifically documented for
126 - // this in `server/src/constants.rs:87`.
127 - cmd.args(["test", "--release", "--features", "fast-tests"])
128 - .current_dir(&server_dir)
129 - .stdout(std::process::Stdio::piped())
130 - .stderr(std::process::Stdio::piped())
131 - .kill_on_drop(true);
132 - // Same online-mode rationale as the build step: sqlx query macros need a
133 - // live DB to type-check against. The scratch DB is left in migrated state
134 - // by the preceding build, so we can reuse it here.
121 + let log_path = gate_log_path(ctx, GateKind::CargoTest);
122 + let log_ref = LogRef::new(&ctx.version, GateKind::CargoTest);
123 +
124 + // Best-effort: drop our own role's stale `mnw_test_*` databases (the
125 + // template + any per-test clones orphaned by a previously-killed run)
126 + // before the suite, so they can't accumulate or collide. Foreign-owned
127 + // leftovers are left alone — the harness now namespaces its template per
128 + // role, so they no longer wedge the gate.
135 129 if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() {
136 - cmd.env("DATABASE_URL", scratch_url);
137 - // The server test harness (tests/harness/db.rs) parses TEST_DATABASE_URL
138 - // with rfind('/'), which mangles URLs whose query string contains '/'
139 - // (e.g. `?host=/var/run/postgresql`). Strip the query — libpq defaults
140 - // to /var/run/postgresql on Debian/Ubuntu when host is unspecified.
141 - let test_url = scratch_url
142 - .split_once('?')
143 - .map(|(base, _)| base)
144 - .unwrap_or(scratch_url);
145 - cmd.env("TEST_DATABASE_URL", test_url);
146 - // Best-effort: drop our own role's stale `mnw_test_*` databases (the
147 - // template + any per-test clones orphaned by a previously-killed run)
148 - // before the suite, so they can't accumulate or collide. Foreign-owned
149 - // leftovers are left alone — the harness now namespaces its template
150 - // per role, so they no longer wedge the gate.
151 130 clean_stale_test_dbs(scratch_url).await;
152 131 }
132 +
153 133 let started = std::time::Instant::now();
154 - let log_path = gate_log_path(ctx, GateKind::CargoTest);
155 - let log_ref = LogRef::new(&ctx.version, GateKind::CargoTest);
156 - let mut child = match cmd.spawn() {
134 +
135 + // Fast pre-gate: compile the test targets WITHOUT running them. This
136 + // builds the exact `--release --features fast-tests` artifacts the full
137 + // run needs (so the subsequent run reuses the cache — no wasted work),
138 + // but fails in ~minutes with the real `error[Ennnn]: ...` on a
139 + // test-only-target compile break. That class (a field missing in a
140 + // `#[cfg(test)]`-only binary like `load`) otherwise compiles fine under
141 + // the build step + `--test integration` and only blows up here, after a
142 + // full build, as an opaque mass test failure.
143 + let mut pre = match cargo_test_command(ctx, &server_dir, &["--no-run"]).spawn() {
144 + Ok(c) => c,
145 + Err(e) => {
146 + return Ok(GateOutcome::failed(GateFailure::SpawnFailed {
147 + message: e.to_string(),
148 + }).with_log_ref(log_ref));
149 + }
150 + };
151 + let (pre_out, pre_err, pre_status) =
152 + stream_child_to_live_log(&mut pre, ctx.events.clone(), run_id, log_path.clone()).await?;
153 + if !pre_status.success() {
154 + let failure = classify::classify_compile_error(&pre_out, &pre_err);
155 + return Ok(GateOutcome::failed(failure).with_log_ref(log_ref));
156 + }
157 +
158 + // Full run: the test binaries are already built above, so cargo's
159 + // up-to-date check skips compilation and this just runs the tests.
160 + let mut child = match cargo_test_command(ctx, &server_dir, &[]).spawn() {
157 161 Ok(c) => c,
158 162 Err(e) => {
159 163 return Ok(GateOutcome::failed(GateFailure::SpawnFailed {
@@ -172,6 +176,46 @@ async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> {
172 176 }
173 177 }
174 178
179 + /// Configure (but don't spawn) `cargo test --release --features fast-tests
180 + /// <extra>` in `dir`, wired to the scratch DB. Shared by the `--no-run`
181 + /// pre-gate compile and the full test run so both go through one env setup.
182 + ///
183 + /// `--features fast-tests` matches CI (`server/deploy/run-ci.sh`): it relaxes
184 + /// the auth rate-limit burst (5 → 20) and argon2 cost so signup-heavy +
185 + /// lockout workflow tests complete without hitting Governor before the
186 + /// hand-rolled lockout check (documented at `server/src/constants.rs:87`).
187 + fn cargo_test_command(ctx: &GateCtx, dir: &std::path::Path, extra: &[&str]) -> Command {
188 + let mut cmd = Command::new("cargo");
189 + cmd.args(["test", "--release", "--features", "fast-tests"])
190 + .args(extra)
191 + .current_dir(dir)
192 + .stdout(std::process::Stdio::piped())
193 + .stderr(std::process::Stdio::piped())
194 + .kill_on_drop(true);
195 + // Share the build step's target dir so the test compile reuses its
196 + // artifacts (and the `--no-run` precompile reuses them again). Must match
197 + // `build.rs` or the gate would clean-compile the whole tree a second time.
198 + if let Some(target) = ctx.cfg.cargo_target_dir.as_deref() {
199 + cmd.env("CARGO_TARGET_DIR", target);
200 + }
201 + // Same online-mode rationale as the build step: sqlx query macros need a
202 + // live DB to type-check against. The scratch DB is left in migrated state
203 + // by the preceding build, so we can reuse it here.
204 + if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() {
205 + cmd.env("DATABASE_URL", scratch_url);
206 + // The server test harness (tests/harness/db.rs) parses TEST_DATABASE_URL
207 + // with rfind('/'), which mangles URLs whose query string contains '/'
208 + // (e.g. `?host=/var/run/postgresql`). Strip the query — libpq defaults
209 + // to /var/run/postgresql on Debian/Ubuntu when host is unspecified.
210 + let test_url = scratch_url
211 + .split_once('?')
212 + .map(|(base, _)| base)
213 + .unwrap_or(scratch_url);
214 + cmd.env("TEST_DATABASE_URL", test_url);
215 + }
216 + cmd
217 + }
218 +
175 219 async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> {
176 220 let mut log_buf: Vec<u8> = Vec::new();
177 221 let log_ref = LogRef::new(&ctx.version, GateKind::MigrationDryRun);
@@ -272,12 +316,18 @@ pub(crate) async fn reset_scratch(db_url: &str) -> Result<()> {
272 316 Ok(())
273 317 }
274 318
275 - /// Best-effort cleanup of stale test databases left behind by a
276 - /// previously-killed `cargo_test` run (the per-test `mnw_test_<uuid>` clones
277 - /// and the role's template). Only drops databases owned by the connecting
278 - /// role — a foreign-owned leftover can't be dropped without superuser anyway,
279 - /// and the harness now namespaces its template per role so one can't wedge us.
280 - /// Never returns an error: a cleanup miss must not turn a deploy red.
319 + /// Best-effort cleanup of stale per-test database clones (`mnw_test_<uuid>`)
320 + /// left behind by a previously-killed `cargo_test` run. Only drops databases
321 + /// owned by the connecting role — a foreign-owned leftover can't be dropped
322 + /// without superuser anyway, and the harness namespaces its template per role
323 + /// so one can't wedge us.
324 + ///
325 + /// Deliberately **excludes the template** (`mnw_test_template_*`): the harness
326 + /// reuses it across runs when it's migration-current (skipping a full
327 + /// drop+migrate), so dropping it here would force a needless rebuild every
328 + /// gate run. Templates are bounded (one per role) and never accumulate, so
329 + /// leaving them is free. Never returns an error: a cleanup miss must not turn a
330 + /// deploy red.
281 331 async fn clean_stale_test_dbs(db_url: &str) {
282 332 use sqlx::postgres::PgPoolOptions;
283 333 use sqlx::Executor;
@@ -294,6 +344,7 @@ async fn clean_stale_test_dbs(db_url: &str) {
294 344 let names: Vec<(String,)> = sqlx::query_as(
295 345 "SELECT datname FROM pg_database
296 346 WHERE datname LIKE 'mnw_test_%'
347 + AND datname NOT LIKE '%template%'
297 348 AND pg_catalog.pg_has_role(current_user, datdba, 'USAGE')",
298 349 )
299 350 .fetch_all(&pool)
@@ -22,6 +22,7 @@ pub mod live_log;
22 22 pub mod metrics;
23 23 pub mod outcome;
24 24 pub mod routes;
25 + pub mod runs;
25 26 pub mod state;
26 27 pub mod sync;
27 28 pub mod topology;
@@ -146,7 +146,23 @@ impl GateBlocker {
146 146 pub enum GateFailure {
147 147 /// `cargo_test` exited non-zero. `failed_count` may be 0 if the
148 148 /// classifier couldn't parse the count (e.g. compile error).
149 - CargoTest { failed_count: u32, first_failed: Option<String> },
149 + /// `first_failed` is the first failing test's name; `first_panic` is the
150 + /// first panic *message* (root cause), chosen to skip the "Once instance
151 + /// has previously been poisoned" cascade so 800 poisoned tests don't bury
152 + /// the one real panic that poisoned them.
153 + CargoTest {
154 + failed_count: u32,
155 + first_failed: Option<String>,
156 + #[serde(default, skip_serializing_if = "Option::is_none")]
157 + first_panic: Option<String>,
158 + },
159 + /// `cargo_test` fast pre-gate (`cargo test --no-run`): the test
160 + /// targets failed to compile, so no tests ran. `first_error` is the
161 + /// headline diagnostic (e.g. `error[E0063]: missing field
162 + /// user_pages_host`) and `error_count` is cargo's "N previous errors".
163 + /// Distinct from `CargoTest` so a test-only-target compile break reads
164 + /// as a build error, not "0 tests failed".
165 + CompileError { error_count: u32, first_error: Option<String> },
150 166 /// `migration_dry_run`: a migration that was previously applied is
151 167 /// no longer present in the resolved migrations directory.
152 168 MigrationDrift { migration: String },
@@ -175,10 +191,17 @@ pub enum GateFailure {
175 191 impl GateFailure {
176 192 pub fn summary(&self) -> String {
177 193 match self {
178 - GateFailure::CargoTest { failed_count, first_failed: Some(name) } =>
194 + // The panic message is the diagnostic; prefer it over the test name.
195 + GateFailure::CargoTest { failed_count, first_panic: Some(p), .. } =>
196 + format!("{failed_count} test(s) failed; first panic: {p}"),
197 + GateFailure::CargoTest { failed_count, first_failed: Some(name), first_panic: None } =>
179 198 format!("{failed_count} test(s) failed; first: {name}"),
180 - GateFailure::CargoTest { failed_count, first_failed: None } =>
199 + GateFailure::CargoTest { failed_count, first_failed: None, first_panic: None } =>
181 200 format!("{failed_count} test(s) failed"),
201 + GateFailure::CompileError { error_count, first_error: Some(e) } =>
202 + format!("compile failed ({error_count} error(s)); first: {e}"),
203 + GateFailure::CompileError { error_count, first_error: None } =>
204 + format!("compile failed ({error_count} error(s))"),
182 205 GateFailure::MigrationDrift { migration } =>
183 206 format!("migration {migration} previously applied but missing"),
184 207 GateFailure::MigrationModified { migration } =>
@@ -1,6 +1,6 @@
1 1 use crate::error::Result;
2 2 use crate::state::AppState;
3 - use axum::extract::{Path, State, WebSocketUpgrade};
3 + use axum::extract::{Path, Query, State, WebSocketUpgrade};
4 4 use axum::response::IntoResponse;
5 5 use axum::routing::{get, post};
6 6 use axum::{Json, Router};
@@ -27,6 +27,8 @@ pub fn router(state: AppState) -> Router {
27 27
28 28 let open = Router::new()
29 29 .route("/state", get(get_state))
30 + .route("/runs/{id}", get(get_run))
31 + .route("/runs/{id}/wait", get(get_run_wait))
30 32 .route("/logs/{version}/{gate}", get(get_gate_log))
31 33 .route("/events", get(events_ws));
32 34
@@ -84,6 +86,12 @@ struct StateView {
84 86 /// the *deployed product*, not the controller).
85 87 sandod_version: &'static str,
86 88 tiers: Vec<TierView>,
89 + /// The most recent build run (the resource `GET /runs/{id}` exposes in
90 + /// full). Surfaced here so a `/state` poller sees an in-flight or failed
91 + /// build — the tier versions only ever reflect the last *success*, so
92 + /// without this `/state` looks frozen for the whole build. `null` until
93 + /// the first `/rebuild`.
94 + build: Option<crate::runs::BuildSummary>,
87 95 }
88 96
89 97 #[derive(Serialize)]
@@ -198,7 +206,8 @@ async fn get_state(State(s): State<AppState>) -> Result<Json<StateView>> {
198 206 });
199 207 }
200 208
201 - Ok(Json(StateView { sandod_version: env!("CARGO_PKG_VERSION"), tiers }))
209 + let build = crate::runs::latest_summary(&s.pool).await?;
210 + Ok(Json(StateView { sandod_version: env!("CARGO_PKG_VERSION"), tiers, build }))
202 211 }
203 212
204 213 #[derive(Deserialize, Default)]
@@ -757,13 +766,24 @@ async fn rebuild(
757 766 ) -> Result<Json<serde_json::Value>> {
758 767 let body = body.map(|Json(b)| b).unwrap_or_default();
759 768 let sha = match body.sha {
760 - Some(s) => s,
761 - None => crate::git::resolve_ref(
762 - std::path::Path::new(&s.topo.repo.bare_path),
763 - &s.topo.repo.branch,
764 - )
765 - .await
766 - .map_err(crate::error::Error::Other)?,
769 + Some(sha) => sha,
770 + None => {
771 + // Omitted sha = "build the deploy branch's tip". Fetch upstream
772 + // first so we resolve the *upstream* HEAD, not a possibly-stale
773 + // local branch ref — the build task fetches too, but only after the
774 + // sha is already chosen, so without this `/rebuild {}` could build
775 + // an old commit. A fetch failure is non-fatal: fall back to the
776 + // current bare-repo tip (same policy as the build task).
777 + let bare = std::path::Path::new(&s.topo.repo.bare_path);
778 + if let Some(upstream) = s.topo.repo.upstream.as_deref()
779 + && let Err(e) = crate::git::fetch_upstream(bare, upstream, &s.topo.repo.branch).await
780 + {
781 + tracing::warn!(error = %e, "pre-resolve upstream fetch failed; resolving current bare-repo branch tip");
782 + }
783 + crate::git::resolve_ref(bare, &s.topo.repo.branch)
784 + .await
785 + .map_err(crate::error::Error::Other)?
786 + }
767 787 };
768 788
769 789 // Boundary parse: a sha entering Sando must be hex of plausible length.
@@ -774,17 +794,27 @@ async fn rebuild(
774 794 tracing::info!(sha = %sha, "rebuild requested");
775 795 crate::events::emit(&s.events, crate::events::Event::RebuildRequested { sha: sha.clone() });
776 796
797 + // One pollable resource per triggered build. Created before the spawn so
798 + // the run id is in the response even if the task is aborted milliseconds
799 + // later by a still-newer /rebuild.
800 + let run_id = crate::runs::create(&s.pool, sha.as_str())
801 + .await
802 + .map_err(crate::error::Error::Other)?;
803 +
777 804 // Latest /rebuild wins: abort any in-flight build before spawning a new
778 805 // one. Aborting drops the spawned task's future, which drops any
779 806 // tokio::process::Child it owns; with `kill_on_drop(true)` set on the
780 807 // cargo Command, SIGKILL propagates to cargo + its rustc children.
781 808 let mut slot = s.active_build.lock().await;
782 - if let Some(prev) = slot.take() {
783 - if !prev.is_finished() {
784 - tracing::warn!("aborting in-flight build for newer /rebuild request");
785 - crate::events::emit(&s.events, crate::events::Event::BuildAborted { sha_aborted: sha.clone() });
786 - prev.abort();
787 - }
809 + if let Some(prev) = slot.take()
810 + && !prev.handle.is_finished()
811 + {
812 + tracing::warn!("aborting in-flight build for newer /rebuild request");
813 + crate::events::emit(&s.events, crate::events::Event::BuildAborted { sha_aborted: sha.clone() });
814 + prev.handle.abort();
815 + // Aborting drops the task before it can settle its own row, so
816 + // record the supersession here.
817 + crate::runs::mark_aborted(&s.pool, prev.run_id).await.ok();
788 818 }
789 819
790 820 let pool = s.pool.clone();
@@ -793,14 +823,71 @@ async fn rebuild(
793 823 let events_for_task = s.events.clone();
794 824 let sha_for_task = sha.clone();
795 825 let sha_response = sha.to_string();
826 + let pool_for_task = s.pool.clone();
796 827 let handle = tokio::spawn(async move {
797 - if let Err(e) = crate::build::build_and_run_host(pool, cfg, topo, sha_for_task.clone(), events_for_task).await {
828 + if let Err(e) = crate::build::build_and_run_host(pool, cfg, topo, sha_for_task.clone(), events_for_task, run_id).await {
798 829 tracing::error!(sha = %sha_for_task, error = %e, "rebuild pipeline failed");
830 + // Pre-gate bails (fetch/checkout/version/scratch) don't settle the
831 + // run themselves; the build-step compile error already did. First
832 + // terminal write wins, so this is a safety net for the rest.
833 + crate::runs::mark_failed(&pool_for_task, run_id, &format!("{e:#}")).await.ok();
799 834 }
800 835 });
801 - *slot = Some(handle.abort_handle());
836 + *slot = Some(crate::state::ActiveBuild { handle: handle.abort_handle(), run_id });
802 837
803 - Ok(Json(serde_json::json!({ "accepted": true, "sha": sha_response })))
838 + Ok(Json(serde_json::json!({ "accepted": true, "sha": sha_response, "run_id": run_id.0 })))
839 + }
840 +
841 + /// `GET /runs/{id}` — the build-status resource a non-TUI driver polls after
842 + /// `/rebuild`. Open (read-only) like `/state` and `/logs`.
843 + async fn get_run(
844 + State(s): State<AppState>,
845 + Path(id): Path<i64>,
846 + ) -> Result<Json<crate::runs::RunView>> {
847 + crate::runs::get(&s.pool, crate::domain::RunId(id))
848 + .await
849 + .map_err(crate::error::Error::Other)?
850 + .map(Json)
851 + .ok_or(crate::error::Error::NotFound)
852 + }
853 +
854 + #[derive(Deserialize)]
855 + struct WaitParams {
856 + /// Milliseconds to hold the request open before returning a still-building
857 + /// run. `get_run_wait` defaults it to 30_000 (30s) and caps it at 120_000 (120s).
858 + #[serde(default)]
859 + timeout_ms: Option<u64>,
860 + }
861 +
862 + /// `GET /runs/{id}/wait` — long-poll: hold the request open until the run
863 + /// settles (`result != building`) or the timeout elapses, then return the
864 + /// current `RunView`. Removes polling-cadence guessing for a headless driver
865 + /// (fire `/rebuild`, block on `/wait`). On timeout the run is returned
866 + /// still-building (200) and the caller re-issues `/wait`. 404 if unknown.
867 + async fn get_run_wait(
868 + State(s): State<AppState>,
869 + Path(id): Path<i64>,
870 + Query(p): Query<WaitParams>,
871 + ) -> Result<Json<crate::runs::RunView>> {
872 + let run_id = crate::domain::RunId(id);
873 + let timeout = std::time::Duration::from_millis(p.timeout_ms.unwrap_or(30_000).min(120_000));
874 + let deadline = tokio::time::Instant::now() + timeout;
875 + // Poll the row rather than wiring a per-run notifier: a build settles on
876 + // the minute scale, so a sub-second tick is plenty responsive. Each tick is
877 + // one `runs::get` (the run row, plus the gate-status query once a version
878 + // is set). The request releases its pool handle between ticks.
879 + let tick = std::time::Duration::from_millis(750);
880 + loop {
881 + let view = crate::runs::get(&s.pool, run_id)
882 + .await
883 + .map_err(crate::error::Error::Other)?
884 + .ok_or(crate::error::Error::NotFound)?;
885 + let now = tokio::time::Instant::now();
886 + if view.result != "building" || now >= deadline {
887 + return Ok(Json(view));
888 + }
889 + tokio::time::sleep((deadline - now).min(tick)).await;
890 + }
804 891 }
805 892
806 893 #[derive(Deserialize)]
@@ -835,7 +922,7 @@ async fn self_update(
835 922 // the restart would SIGKILL it mid-deploy. Make the operator retry once idle.
836 923 {
837 924 let slot = s.active_build.lock().await;
838 - if slot.as_ref().is_some_and(|h| !h.is_finished()) {
925 + if slot.as_ref().is_some_and(|b| !b.handle.is_finished()) {
839 926 return Err(crate::error::Error::GateBlocked(
840 927 "a server build is in flight; retry /self-update once it settles".into(),
841 928 ));
@@ -1074,6 +1161,7 @@ mod tests {
1074 1161 bin_names: vec!["makenotwork".into()],
1075 1162 logs_root: PathBuf::from("/tmp/sando-logs"),
1076 1163 release_contents: vec![],
1164 + cargo_target_dir: None,
1077 1165 }
1078 1166 }
1079 1167
@@ -1385,6 +1473,112 @@ mod tests {
1385 1473 assert_eq!(resp.status(), StatusCode::NOT_FOUND);
1386 1474 }
1387 1475
1476 + #[tokio::test]
1477 + async fn get_run_404s_for_unknown_id() {
1478 + let state = test_state().await;
1479 + let app = router(state);
1480 + let resp = app
1481 + .oneshot(Request::builder().uri("/runs/999").body(Body::empty()).unwrap())
1482 + .await
1483 + .unwrap();
1484 + assert_eq!(resp.status(), StatusCode::NOT_FOUND);
1485 + }
1486 +
1487 + #[tokio::test]
1488 + async fn get_run_returns_view_with_gates() {
1489 + let state = test_state().await;
1490 + // A run that reached version 0.10.2 and ran two host gates (one red).
1491 + let run_id = crate::runs::create(&state.pool, "abc1234def").await.unwrap();
1492 + let ver: crate::domain::Version = "0.10.2".parse().unwrap();
1493 + seed(&state.pool, "host", "0.10.2").await;
1494 + crate::runs::set_version(&state.pool, run_id, &ver).await.unwrap();
1495 + insert_gate(&state.pool, "host", "0.10.2", "cargo_test", 0).await;
1496 + insert_gate(&state.pool, "host", "0.10.2", "boot_smoke", 1).await;
1497 +
1498 + let app = router(state);
1499 + let resp = app
1500 + .oneshot(
1501 + Request::builder()
1502 + .uri(format!("/runs/{}", run_id.0))
1503 + .body(Body::empty())
1504 + .unwrap(),
1505 + )
1506 + .await
1507 + .unwrap();
1508 + assert_eq!(resp.status(), StatusCode::OK);
1509 + let v: serde_json::Value = serde_json::from_str(&body_string(resp).await).unwrap();
1510 + assert_eq!(v["run_id"], run_id.0);
1511 + assert_eq!(v["sha"], "abc1234def");
1512 + assert_eq!(v["version"], "0.10.2");
1513 + assert_eq!(v["result"], "building");
1514 + // Both host gates surface, latest-per-kind, alphabetized by kind.
1515 + assert_eq!(v["gates"].as_array().unwrap().len(), 2);
1516 + assert_eq!(v["gates"][0]["kind"], "boot_smoke");
1517 + assert_eq!(v["gates"][0]["status"], "passed");
1518 + assert_eq!(v["gates"][1]["kind"], "cargo_test");
1519 + assert_eq!(v["gates"][1]["status"], "failed");
1520 + }
1521 +
1522 + #[tokio::test]
1523 + async fn get_run_wait_returns_immediately_when_settled() {
1524 + let state = test_state().await;
1525 + let run_id = crate::runs::create(&state.pool, "abc1234def").await.unwrap();
1526 + crate::runs::mark_passed(&state.pool, run_id).await.unwrap();
1527 +
1528 + let app = router(state);
1529 + // Generous timeout, but an already-settled run must not wait for it.
1530 + let resp = app
1531 + .oneshot(
1532 + Request::builder()
1533 + .uri(format!("/runs/{}/wait?timeout_ms=60000", run_id.0))
1534 + .body(Body::empty())
1535 + .unwrap(),
1536 + )
1537 + .await
1538 + .unwrap();
1539 + assert_eq!(resp.status(), StatusCode::OK);
1540 + let v: serde_json::Value = serde_json::from_str(&body_string(resp).await).unwrap();
1541 + assert_eq!(v["result"], "passed");
1542 + }
1543 +
1544 + #[tokio::test]
1545 + async fn get_run_wait_returns_building_at_timeout() {
1546 + let state = test_state().await;
1547 + let run_id = crate::runs::create(&state.pool, "abc1234def").await.unwrap();
1548 +
1549 + let app = router(state);
1550 + // timeout_ms=0 → deadline is now → the first poll returns the
1551 + // still-building run rather than blocking.
1552 + let resp = app
1553 + .oneshot(
1554 + Request::builder()
1555 + .uri(format!("/runs/{}/wait?timeout_ms=0", run_id.0))
1556 + .body(Body::empty())
1557 + .unwrap(),
1558 + )
1559 + .await
1560 + .unwrap();
1561 + assert_eq!(resp.status(), StatusCode::OK);
1562 + let v: serde_json::Value = serde_json::from_str(&body_string(resp).await).unwrap();
1563 + assert_eq!(v["result"], "building");
1564 + }
1565 +
1566 + #[tokio::test]
1567 + async fn get_run_wait_404s_for_unknown_id() {
1568 + let state = test_state().await;
1569 + let app = router(state);
1570 + let resp = app
1571 + .oneshot(
1572 + Request::builder()
1573 + .uri("/runs/999/wait?timeout_ms=0")
1574 + .body(Body::empty())
1575 + .unwrap(),
1576 + )
1577 + .await
1578 + .unwrap();
1579 + assert_eq!(resp.status(), StatusCode::NOT_FOUND);
1580 + }
1581 +
1388 1582 #[test]
1389 1583 fn self_update_unit_maps_sha_to_instance() {
1390 1584 let sha = crate::domain::GitSha::parse("abc1234def5678").unwrap();
@@ -1630,6 +1824,25 @@ mod tests {
1630 1824 }
1631 1825
1632 1826 #[tokio::test]
1827 + async fn state_build_is_null_until_first_rebuild_then_surfaces_latest() {
1828 + use axum::extract::State;
1829 + let state = test_state().await;
1830 + // No build runs yet → build is null, so /state doesn't pretend a build
1831 + // is happening.
1832 + let Json(view) = get_state(State(state.clone())).await.unwrap();
1833 + assert!(view.build.is_none());
1834 +
1835 + // A failed run must surface its cause in /state, not just in /runs.
1836 + let run_id = crate::runs::create(&state.pool, "deadbeef").await.unwrap();
1837 + crate::runs::mark_failed(&state.pool, run_id, "cargo_test: 3 test(s) failed").await.unwrap();
1838 + let Json(view) = get_state(State(state)).await.unwrap();
1839 + let b = view.build.expect("build surfaced");
1840 + assert_eq!(b.run_id, run_id.0);
1841 + assert_eq!(b.result, "failed");
1842 + assert_eq!(b.failure_summary.as_deref(), Some("cargo_test: 3 test(s) failed"));
1843 + }
1844 +
1845 + #[tokio::test]
1633 1846 async fn promote_with_explicit_version_but_missing_artifact_404s() {
1634 1847 // Explicit version supplied, gates trivially pass (mm has none in
1635 1848 // test_topo), but `versions` table has no row → 404.
@@ -0,0 +1,394 @@
1 + //! Build-run tracking: one `build_runs` row per `/rebuild`, updated as the
2 + //! pipeline moves through its phases, terminating in passed/failed/aborted.
3 + //!
4 + //! This is the resource that makes Sando driveable headlessly. `/state`'s tier
5 + //! versions only ever reflect the last *successful* deploy, so on a red pipeline a
6 + //! poller watching them sees stale-green for the whole build (the 0.10.2 incident). A
7 + //! `RunId` returned by `/rebuild` + `GET /runs/{id}` gives a non-TUI caller
8 + //! one pollable resource tied to the build it triggered, carrying the phase,
9 + //! the per-gate status, and — the highest-value bit — a `failure_summary`
10 + //! (first compile error / first failed gate) so the cause is in the API, not
11 + //! behind `sudo journalctl`.
12 + //!
13 + //! Terminal writes (`mark_passed`/`mark_failed`/`mark_aborted`) are guarded on
14 + //! `result = 'building'`, so whichever site settles the run first wins: a
15 + //! build-step compile error, the first red gate, or the task-level catch for
16 + //! pre-build bails. Later writes are silent no-ops.
17 +
18 + use crate::domain::{RunId, Version};
19 + use anyhow::Result;
20 + use chrono::Utc;
21 + use serde::Serialize;
22 + use sqlx::{Row, SqlitePool};
23 +
24 + /// In-flight sub-state, stored as a plain string; the enum names the values `set_phase`
25 + /// callers may write. `'queued'` (`create`) and `'done'` (terminal writes) are SQL-only.
26 + #[derive(Debug, Clone, Copy)]
27 + pub enum Phase {
28 + Fetching,
29 + Compiling,
30 + Staging,
31 + Gating,
32 + }
33 +
34 + impl Phase {
35 + pub fn as_str(self) -> &'static str {
36 + match self {
37 + Phase::Fetching => "fetching",
38 + Phase::Compiling => "compiling",
39 + Phase::Staging => "staging",
40 + Phase::Gating => "gating",
41 + }
42 + }
43 + }
44 +
45 + /// Insert a fresh `building` run for `sha` and return its id.
46 + pub async fn create(pool: &SqlitePool, sha: &str) -> Result<RunId> {
47 + let id: i64 = sqlx::query_scalar(
48 + "INSERT INTO build_runs (sha, phase, result, started_at)
49 + VALUES (?, 'queued', 'building', ?) RETURNING id",
50 + )
51 + .bind(sha)
52 + .bind(Utc::now().to_rfc3339())
53 + .fetch_one(pool)
54 + .await?;
55 + Ok(RunId(id))
56 + }
57 +
58 + /// Advance the in-flight phase. No-op once the run is terminal so a late
59 + /// phase write can't resurrect a finished row.
60 + pub async fn set_phase(pool: &SqlitePool, run_id: RunId, phase: Phase) -> Result<()> {
61 + sqlx::query("UPDATE build_runs SET phase = ? WHERE id = ? AND result = 'building'")
62 + .bind(phase.as_str())
63 + .bind(run_id.0)
64 + .execute(pool)
65 + .await?;
66 + Ok(())
67 + }
68 +
69 + /// Record the version once it's been read from the worktree's Cargo.toml.
70 + pub async fn set_version(pool: &SqlitePool, run_id: RunId, version: &Version) -> Result<()> {
71 + sqlx::query("UPDATE build_runs SET version = ? WHERE id = ? AND result = 'building'")
72 + .bind(version.to_string())
73 + .bind(run_id.0)
74 + .execute(pool)
75 + .await?;
76 + Ok(())
77 + }
78 +
79 + /// Settle the run green. First terminal write wins (guarded on `building`).
80 + pub async fn mark_passed(pool: &SqlitePool, run_id: RunId) -> Result<()> {
81 + sqlx::query(
82 + "UPDATE build_runs SET result = 'passed', phase = 'done', finished_at = ?
83 + WHERE id = ? AND result = 'building'",
84 + )
85 + .bind(Utc::now().to_rfc3339())
86 + .bind(run_id.0)
87 + .execute(pool)
88 + .await?;
89 + Ok(())
90 + }
91 +
92 + /// Settle the run red with a human-readable cause. First terminal write wins,
93 + /// so the most specific failure (build compile error, first red gate) recorded
94 + /// before the task-level catch is the one that sticks.
95 + pub async fn mark_failed(pool: &SqlitePool, run_id: RunId, summary: &str) -> Result<()> {
96 + // Bound the stored summary to 600 chars (not bytes) — it's a headline, not
97 + // the log. The full output is at the gate's log_ref / journald.
98 + let summary: String = summary.chars().take(600).collect();
99 + sqlx::query(
100 + "UPDATE build_runs SET result = 'failed', phase = 'done', failure_summary = ?, finished_at = ?
101 + WHERE id = ? AND result = 'building'",
102 + )
103 + .bind(&summary)
104 + .bind(Utc::now().to_rfc3339())
105 + .bind(run_id.0)
106 + .execute(pool)
107 + .await?;
108 + Ok(())
109 + }
110 +
111 + /// Settle the run as superseded by a newer `/rebuild`; a no-op if it has already settled.
112 + pub async fn mark_aborted(pool: &SqlitePool, run_id: RunId) -> Result<()> {
113 + sqlx::query(
114 + "UPDATE build_runs SET result = 'aborted', phase = 'done',
115 + failure_summary = 'superseded by a newer /rebuild', finished_at = ?
116 + WHERE id = ? AND result = 'building'",
117 + )
118 + .bind(Utc::now().to_rfc3339())
119 + .bind(run_id.0)
120 + .execute(pool)
121 + .await?;
122 + Ok(())
123 + }
124 +
125 + /// One gate's status within a run view.
126 + #[derive(Debug, Serialize)]
127 + pub struct RunGateView {
128 + pub kind: String,
129 + /// `'passed' | 'failed' | 'blocked'` or NULL while in-flight.
130 + pub status: Option<String>,
131 + /// Relative path under `cfg.logs_root` for the full byte stream.
132 + pub log_ref: Option<String>,
133 + }
134 +
135 + /// The `GET /runs/{id}` payload.
136 + #[derive(Debug, Serialize)]
137 + pub struct RunView {
138 + pub run_id: i64,
139 + pub sha: String,
140 + pub version: Option<String>,
141 + pub phase: String,
142 + /// `'building' | 'passed' | 'failed' | 'aborted'`.
143 + pub result: String,
144 + pub started_at: String,
145 + pub finished_at: Option<String>,
146 + /// Headline cause when `result = 'failed'` (whatever summary `mark_failed` stored),
147 + /// or the fixed supersession note when `'aborted'`. NULL otherwise.
148 + pub failure_summary: Option<String>,
149 + /// Latest host-tier gate row per kind for this run's *version* (matched by version,
150 + /// not run id). Empty until the run has a version and gate rows exist for it.
151 + pub gates: Vec<RunGateView>,
152 + }
153 +
154 + /// Load a run plus its host-tier gate statuses. `None` if the id is unknown.
155 + pub async fn get(pool: &SqlitePool, run_id: RunId) -> Result<Option<RunView>> {
156 + let Some(row) = sqlx::query(
157 + "SELECT id, sha, version, phase, result, started_at, finished_at, failure_summary
158 + FROM build_runs WHERE id = ?",
159 + )
160 + .bind(run_id.0)
161 + .fetch_optional(pool)
162 + .await?
163 + else {
164 + return Ok(None);
165 + };
166 +
167 + let version: Option<String> = row.get("version");
168 + // Gates are keyed by (tier, version); a build run drives the `host` tier.
169 + // Latest row per gate_kind, matching `/state`'s per-tier query shape.
170 + let gates: Vec<RunGateView> = if let Some(ver) = version.as_deref() {
171 + sqlx::query(
172 + "SELECT gate_kind, status, log_ref
173 + FROM gate_runs g
174 + WHERE tier = 'host' AND version = ?1
175 + AND id = (SELECT MAX(id) FROM gate_runs
176 + WHERE tier = 'host' AND version = ?1 AND gate_kind = g.gate_kind)
177 + ORDER BY gate_kind",
178 + )
179 + .bind(ver)
180 + .fetch_all(pool)
181 + .await?
182 + .into_iter()
183 + .map(|gr| RunGateView {
184 + kind: gr.get("gate_kind"),
185 + status: gr.get("status"),
186 + log_ref: gr.get("log_ref"),
187 + })
188 + .collect()
189 + } else {
190 + Vec::new()
191 + };
192 +
193 + Ok(Some(RunView {
194 + run_id: row.get("id"),
195 + sha: row.get("sha"),
196 + version,
197 + phase: row.get("phase"),
198 + result: row.get("result"),
199 + started_at: row.get("started_at"),
200 + finished_at: row.get("finished_at"),
201 + failure_summary: row.get("failure_summary"),
202 + gates,
203 + }))
204 + }
205 +
206 + /// Compact view of the latest build run for `/state`'s liveness line.
207 + #[derive(Debug, Serialize)]
208 + pub struct BuildSummary {
209 + pub run_id: i64,
210 + pub sha: String,
211 + pub version: Option<String>,
212 + pub phase: String,
213 + pub result: String,
214 + pub failure_summary: Option<String>,
215 + /// Seconds from start to finish (or to now while building). Lets a
216 + /// `/state` poller show "building <ver>, phase=<x>, elapsed Ns" instead of
217 + /// a version frozen at the last success for the whole ~10-min build.
218 + pub elapsed_s: i64,
219 + }
220 +
221 + /// The most recent build run, for `/state`. `None` until the first `/rebuild`.
222 + pub async fn latest_summary(pool: &SqlitePool) -> Result<Option<BuildSummary>> {
223 + let Some(row) = sqlx::query(
224 + "SELECT id, sha, version, phase, result, failure_summary, started_at, finished_at
225 + FROM build_runs ORDER BY id DESC LIMIT 1",
226 + )
227 + .fetch_optional(pool)
228 + .await?
229 + else {
230 + return Ok(None);
231 + };
232 + let started_at: String = row.get("started_at");
233 + let finished_at: Option<String> = row.get("finished_at");
234 + Ok(Some(BuildSummary {
235 + run_id: row.get("id"),
236 + sha: row.get("sha"),
237 + version: row.get("version"),
238 + phase: row.get("phase"),
239 + result: row.get("result"),
240 + failure_summary: row.get("failure_summary"),
241 + elapsed_s: elapsed_seconds(&started_at, finished_at.as_deref()),
242 + }))
243 + }
244 +
245 + /// Seconds between an rfc3339 `started_at` and (`finished_at` or now), clamped at 0.
246 + /// A bad start yields 0 (never erroring `/state`); a bad finish falls back to now.
247 + fn elapsed_seconds(started_at: &str, finished_at: Option<&str>) -> i64 {
248 + let Ok(start) = chrono::DateTime::parse_from_rfc3339(started_at) else {
249 + return 0;
250 + };
251 + let end = match finished_at {
252 + Some(f) => chrono::DateTime::parse_from_rfc3339(f)
253 + .map(|d| d.with_timezone(&Utc))
254 + .unwrap_or_else(|_| Utc::now()),
255 + None => Utc::now(),
256 + };
257 + (end - start.with_timezone(&Utc)).num_seconds().max(0)
258 + }
259 +
260 + /// The summary of the oldest `failed` host-tier gate row for `version`, if any —
261 + /// used by the build pipeline to populate `failure_summary` on a red pipeline.
262 + /// Reads the typed `outcome_json` so the stored headline matches what the TUI renders.
263 + /// NOTE(review): matched by version only; a rebuilt version may surface an earlier run's row — confirm.
264 + pub async fn first_failed_gate_summary(pool: &SqlitePool, version: &Version) -> Option<String> {
265 + let row = sqlx::query(
266 + "SELECT gate_kind, outcome_json FROM gate_runs
267 + WHERE tier = 'host' AND version = ? AND status = 'failed'
268 + ORDER BY id ASC LIMIT 1",
269 + )
270 + .bind(version.to_string())
271 + .fetch_optional(pool)
272 + .await
273 + .ok()
274 + .flatten()?;
275 + let kind: String = row.get("gate_kind");
276 + let outcome_json: Option<String> = row.get("outcome_json");
277 + let summary = outcome_json
278 + .and_then(|s| serde_json::from_str::<crate::outcome::GateOutcome>(&s).ok())
279 + .map(|o| match o.status {
280 + crate::outcome::GateStatus::Failed { failure } => failure.summary(),
281 + other => format!("{:?}", other),
282 + })
283 + .unwrap_or_else(|| "gate failed".to_string());
284 + Some(format!("{kind}: {summary}"))
285 + }
286 +
287 + #[cfg(test)]
288 + mod tests {
289 + use super::*;
290 + use sqlx::sqlite::SqlitePoolOptions;
291 +
292 + async fn pool() -> SqlitePool {
293 + let pool = SqlitePoolOptions::new()
294 + .max_connections(1)
295 + .connect("sqlite::memory:")
296 + .await
297 + .unwrap();
298 + crate::db::migrate(&pool).await.unwrap();
299 + pool
300 + }
301 +
302 + #[tokio::test]
303 + async fn create_then_get_roundtrips_building() {
304 + let pool = pool().await;
305 + let id = create(&pool, "abc1234").await.unwrap();
306 + let v = get(&pool, id).await.unwrap().expect("run exists");
307 + assert_eq!(v.sha, "abc1234");
308 + assert_eq!(v.result, "building");
309 + assert_eq!(v.phase, "queued");
310 + assert!(v.version.is_none());
311 + assert!(v.gates.is_empty());
312 + assert!(v.failure_summary.is_none());
313 + }
314 +
315 + #[tokio::test]
316 + async fn phase_and_version_advance_then_pass() {
317 + let pool = pool().await;
318 + let id = create(&pool, "abc1234").await.unwrap();
319 + set_phase(&pool, id, Phase::Compiling).await.unwrap();
320 + let ver: Version = "0.10.2".parse().unwrap();
321 + set_version(&pool, id, &ver).await.unwrap();
322 + mark_passed(&pool, id).await.unwrap();
323 +
324 + let v = get(&pool, id).await.unwrap().unwrap();
325 + assert_eq!(v.result, "passed");
326 + assert_eq!(v.phase, "done");
327 + assert_eq!(v.version.as_deref(), Some("0.10.2"));
328 + assert!(v.finished_at.is_some());
329 + }
330 +
331 + #[tokio::test]
332 + async fn first_terminal_write_wins() {
333 + let pool = pool().await;
334 + let id = create(&pool, "abc1234").await.unwrap();
335 + mark_failed(&pool, id, "error[E0063]: missing field user_pages_host").await.unwrap();
336 + // A later pass attempt (e.g. the task catch racing a build-step error)
337 + // must not overwrite the recorded failure.
338 + mark_passed(&pool, id).await.unwrap();
339 + // And a second failure summary doesn't clobber the first.
340 + mark_failed(&pool, id, "something else").await.unwrap();
341 +
342 + let v = get(&pool, id).await.unwrap().unwrap();
343 + assert_eq!(v.result, "failed");
344 + assert_eq!(v.failure_summary.as_deref(), Some("error[E0063]: missing field user_pages_host"));
345 + }
346 +
347 + #[tokio::test]
348 + async fn phase_write_after_terminal_is_noop() {
349 + let pool = pool().await;
350 + let id = create(&pool, "abc1234").await.unwrap();
351 + mark_passed(&pool, id).await.unwrap();
352 + set_phase(&pool, id, Phase::Gating).await.unwrap();
353 + let v = get(&pool, id).await.unwrap().unwrap();
354 + assert_eq!(v.phase, "done", "a late phase write must not move a finished run");
355 + }
356 +
357 + #[test]
358 + fn elapsed_seconds_uses_finished_when_present() {
359 + // Both timestamps present → exact span, no wall-clock dependency.
360 + let s = elapsed_seconds("2026-06-13T00:00:00Z", Some("2026-06-13T00:02:05Z"));
361 + assert_eq!(s, 125);
362 + // Unparseable start → 0, never a panic / negative.
363 + assert_eq!(elapsed_seconds("not-a-date", None), 0);
364 + }
365 +
366 + #[tokio::test]
367 + async fn latest_summary_reports_most_recent_run() {
368 + let pool = pool().await;
369 + assert!(latest_summary(&pool).await.unwrap().is_none());
370 + let _old = create(&pool, "old1234").await.unwrap();
371 + let new = create(&pool, "new5678").await.unwrap();
372 + set_phase(&pool, new, Phase::Compiling).await.unwrap();
373 + let sum = latest_summary(&pool).await.unwrap().expect("a run exists");
374 + assert_eq!(sum.run_id, new.0);
375 + assert_eq!(sum.sha, "new5678");
376 + assert_eq!(sum.phase, "compiling");
377 + assert_eq!(sum.result, "building");
378 + }
379 +
380 + #[tokio::test]
381 + async fn get_unknown_id_is_none() {
382 + let pool = pool().await;
383 + assert!(get(&pool, RunId(999)).await.unwrap().is_none());
384 + }
385 +
386 + #[tokio::test]
387 + async fn failure_summary_is_bounded() {
388 + let pool = pool().await;
389 + let id = create(&pool, "abc1234").await.unwrap();
390 + mark_failed(&pool, id, &"x".repeat(5_000)).await.unwrap();
391 + let v = get(&pool, id).await.unwrap().unwrap();
392 + assert!(v.failure_summary.unwrap().len() <= 600);
393 + }
394 + }
@@ -15,6 +15,13 @@ use tokio::task::AbortHandle;
15 15 /// constructing ssh/rsync invocations inline.
16 16 pub type ExecutorMap = HashMap<NodeId, Arc<dyn Executor>>;
17 17
18 + /// The in-flight build pipeline: its abort handle plus the `build_runs` row it
19 + /// drives. A newer `/rebuild` aborts the handle and settles the row `aborted`.
20 + pub struct ActiveBuild {
21 + pub handle: AbortHandle,
22 + pub run_id: crate::domain::RunId,
23 + }
24 +
18 25 #[derive(Clone)]
19 26 pub struct AppState {
20 27 pub pool: SqlitePool,
@@ -22,8 +29,10 @@ pub struct AppState {
22 29 pub cfg: Arc<Config>,
23 30 pub prom: PrometheusHandle,
24 31 /// Single-slot guard for the build pipeline. A new /rebuild aborts any
25 - /// in-flight build (cargo + gates) so the latest push always wins.
26 - pub active_build: Arc<Mutex<Option<AbortHandle>>>,
32 + /// in-flight build (cargo + gates) so the latest push always wins. Carries
33 + /// the run id alongside the handle so the aborting `/rebuild` can settle
34 + /// the superseded `build_runs` row as `aborted`.
35 + pub active_build: Arc<Mutex<Option<ActiveBuild>>>,
27 36 /// Serializes the deploy mutators (`/promote`, `/rollback`) so their
28 37 /// check -> deploy -> advance sequences never interleave. Without it two
29 38 /// concurrent promotes (or a promote racing a rollback) could deploy mixed
@@ -16,6 +16,12 @@ release_root = "/srv/sando"
16 16 scratch_db_url = "postgres:///sando_scratch?host=/var/run/postgresql"
17 17 bin_names = ["makenotwork", "mnw-admin"]
18 18 logs_root = "/srv/sando/logs"
19 + # Shared cargo target dir across per-sha worktrees. Without it every /rebuild
20 + # clean-compiles a fresh worktree (~10 min) even for a 1-line diff; with it the
21 + # incremental rebuild reuses the previous sha's compiled deps (1–2 min). Safe
22 + # because builds are serialized (a new /rebuild aborts the in-flight one). The
23 + # sando user must be able to write to it; cargo creates it if absent.
24 + cargo_target_dir = "/srv/sando/cargo-target"
19 25
20 26 # Non-binary content shipped as part of each release. Multiple entries can
21 27 # target the same `dst` (additive merge — used to build `docs/` from three
@@ -489,9 +489,14 @@ fn pass_note_short(n: &PassNote) -> String {
489 489
490 490 fn failure_short(f: &GateFailure) -> String {
491 491 match f {
492 - GateFailure::CargoTest { failed_count, first_failed: Some(name) } =>
492 + GateFailure::CargoTest { failed_count, first_panic: Some(p), .. } =>
493 + format!("{failed_count} test(s); panic: {p}"),
494 + GateFailure::CargoTest { failed_count, first_failed: Some(name), .. } =>
493 495 format!("{failed_count} test(s); first {name}"),
494 496 GateFailure::CargoTest { failed_count, .. } => format!("{failed_count} test(s) failed"),
497 + GateFailure::CompileError { error_count, first_error: Some(e) } =>
498 + format!("compile failed ({error_count}); {e}"),
499 + GateFailure::CompileError { error_count, .. } => format!("compile failed ({error_count})"),
495 500 GateFailure::MigrationDrift { migration } => format!("drift {migration}"),
496 501 GateFailure::MigrationModified { migration } => format!("modified {migration}"),
497 502 GateFailure::MigrationSqlError { migration, sqlstate: Some(s) } =>