max / makenotwork

sando: build-run observability, fast pre-gate, shared build cache

Close the D2/D2.1 deploy-reliability findings from the 2026-06-12/13 carousel + custom-pages deploys.

Build-run tracking (the headless-driver gap):
- build_runs table (migration 007) + RunId; /rebuild returns run_id
- GET /runs/{id} -> phase/result/gates/failure_summary; GET /runs/{id}/wait long-polls to completion
- /state carries a `build` summary so a poller sees in-flight/failed builds instead of a version frozen at the last success
- failure_summary carries the cause: classified compile error, first red gate's typed summary, or the anyhow chain

Gate speed + diagnosis:
- cargo_test runs `cargo test --no-run` as a fast pre-gate; typed CompileError surfaces error[Ennnn] immediately, cache-shared with the run
- classify_cargo_test extracts the first root-cause panic, skipping the "Once poisoned" cascade
- cargo_target_dir config: one shared CARGO_TARGET_DIR across worktrees so an incremental diff reuses the prior sha's deps (~10 min -> 1-2 min)
- /rebuild {} fetches upstream then resolves the branch HEAD
- stale-test-db cleanup excludes %template% so the harness reuses it

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Author: Max Johnson <me@maxj.phd> · 2026-06-13 22:41 UTC

Commit: 614c3e7c8a37d5880a734de1b64fd97bf632efbe

Parent: af2c94f

16 files changed, +1078 insertions, -80 deletions

M .gitignore +3

			@@ -54,3 +54,6 @@ audit_review.md
54	54		# sandod local state (regenerable)
55	55		sando/daemon/sando.db
56	56		sando/daemon/sando.db-*
	57	+	sando/daemon/work/
	58	+	sando/daemon/releases/
	59	+	sando/daemon/cargo-target/

M sando/README.md +4 -2

			@@ -102,8 +102,10 @@ curl -X POST http://127.0.0.1:7766/promote/a \
102	102
103	103		\| Method \| Path \| Body \| Purpose \|
104	104		\|--------\|------\|------\|---------\|
105		-	\| GET \| `/state` \| — \| Tier list + current/previous version + last gate outcomes \|
106		-	\| POST \| `/rebuild` \| `{sha?: string}` \| Force a build; if `sha` is absent, resolves the configured deploy branch. Aborts any in-flight build (latest wins). \|
	105	+	\| GET \| `/state` \| — \| Tier list + current/previous version + last gate outcomes, plus `build` (latest build run: phase/result/failure_summary/elapsed_s, `null` until first `/rebuild`) so a poller sees in-flight/failed builds, not a frozen version \|
	106	+	\| POST \| `/rebuild` \| `{sha?: string}` \| Force a build; if `sha` is absent, resolves the configured deploy branch. Aborts any in-flight build (latest wins). Returns `{accepted, sha, run_id}`. \|
	107	+	\| GET \| `/runs/{id}` \| — \| Build-status of the run a `/rebuild` returned: `{run_id, sha, version, phase, result, failure_summary, gates[], started_at, finished_at}`. The pollable resource for a non-TUI driver — `/state`'s version fields only reflect the last successful version, and its `build` field covers just the latest run. \|
	108	+	\| GET \| `/runs/{id}/wait` \| `?timeout_ms=` \| Long-poll: blocks until the run settles or `timeout_ms` (default 30s, cap 120s) elapses, then returns the same `RunView`. Fire `/rebuild` → block on `/wait`. \|
107	109		\| POST \| `/promote/{tier}` \| `{version?, hotfix?, reset_burn_in?}` \| Verify predecessor gates, deploy to tier nodes, advance state. `version` defaults to the predecessor tier's `current_version`. \|
108	110		\| POST \| `/rollback/{tier}` \| — \| Swap `current` symlink to `previous_version` on every node in the tier \|
109	111		\| POST \| `/confirm/{tier}` \| — \| Insert a passing `manual_confirm` gate row for the tier's `current_version`. Replaces hand-SQL. \|

A sando/daemon/migrations/007_build_runs.sql +26

		@@ -0,0 +1,26 @@
1	+	-- One row per `/rebuild` invocation: tracks a build attempt through its
2	+	-- lifecycle so a non-TUI caller can poll `GET /runs/{id}` for build status
3	+	-- instead of inferring it from `/state` — which only ever reflects the last
4	+	-- successful version and so reports stale-green for the whole duration of a
5	+	-- failing build (the 0.10.2 headless-deploy incident).
6	+	--
7	+	-- `version` is denormalized (no FK): a run may fail before a `versions` row
8	+	-- exists (fetch/checkout/compile error) and we still want its failure
9	+	-- recorded. `result` is the terminal verdict — 'building' until the pipeline
10	+	-- settles, then 'passed' \| 'failed' \| 'aborted'. `phase` is the in-flight
11	+	-- sub-state ('queued' \| 'fetching' \| 'compiling' \| 'staging' \| 'gating' \|
12	+	-- 'done'). Terminal transitions are guarded on `result = 'building'` so the
13	+	-- first writer (a build-step error, a gate failure, or the task-level catch)
14	+	-- wins and later writes are no-ops.
15	+	CREATE TABLE build_runs (
16	+	id INTEGER PRIMARY KEY AUTOINCREMENT,
17	+	sha TEXT NOT NULL,
18	+	version TEXT,
19	+	phase TEXT NOT NULL DEFAULT 'queued',
20	+	result TEXT NOT NULL DEFAULT 'building',
21	+	failure_summary TEXT,
22	+	started_at TEXT NOT NULL,
23	+	finished_at TEXT
24	+	);
25	+
26	+	CREATE INDEX build_runs_by_sha ON build_runs(sha);

M sando/daemon/sando-daemon.toml +4

			@@ -4,5 +4,9 @@ db_path = "./sando.db"
4	4		topology_path = "../sando.toml"
5	5		workdir = "./work"
6	6		release_root = "./releases"
	7	+	# Shared cargo target dir across per-sha worktrees — incremental rebuilds reuse
	8	+	# the previous sha's compiled deps instead of clean-compiling each fresh
	9	+	# worktree. Safe because builds are serialized. Omit for per-worktree target/.
	10	+	cargo_target_dir = "./cargo-target"
7	11		# Dropped and recreated on every migration_dry_run. Leave unset to skip.
8	12		scratch_db_url = "postgres://sando@127.0.0.1/sando_scratch"

M sando/daemon/src/build.rs +40 -13

			@@ -6,7 +6,7 @@
6	6
7	7		use crate::config::Config;
8	8		use crate::deploy;
9		-	use crate::domain::{GitSha, TierId, Version};
	9	+	use crate::domain::{GitSha, RunId, TierId, Version};
10	10		use crate::gates::{self, GateCtx};
11	11		use crate::git;
12	12		use crate::topology::Topology;
			@@ -34,10 +34,13 @@ pub async fn run(
34	34		topo: Arc<Topology>,
35	35		sha: GitSha,
36	36		events: crate::events::EventTx,
	37	+	run_id: RunId,
37	38		) -> Result<BuildArtifact> {
38	39		let worktree = cfg.workdir.join(sha.as_str());
39	40		let bare = PathBuf::from(&topo.repo.bare_path);
40	41
	42	+	crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Fetching).await.ok();
	43	+
41	44		// Pull-based ingestion: if an upstream remote is configured, fetch the
42	45		// deploy branch so a just-pushed sha is locally resolvable. A fetch
43	46		// failure is non-fatal — the sha may already be present from a prior
			@@ -59,6 +62,7 @@ pub async fn run(
59	62		let server_dir = worktree.join("server");
60	63		let version = read_pkg_version(&server_dir.join("Cargo.toml")).await
61	64		.with_context(\|\| format!("reading version from {}/Cargo.toml", server_dir.display()))?;
	65	+	crate::runs::set_version(&pool, run_id, &version).await.ok();
62	66
63	67		// sqlx compile-time query checking needs a live DB with the current schema.
64	68		// We point cargo at the scratch DB and prep it (drop public, re-migrate)
			@@ -70,6 +74,12 @@ pub async fn run(
70	74		.arg("--release")
71	75		.current_dir(&server_dir)
72	76		.kill_on_drop(true);
	77	+	// Shared build cache across per-sha worktrees: reuse one target dir so an
	78	+	// incremental diff doesn't clean-compile from scratch. Serialized builds
	79	+	// make this contention-free. Unset → cargo's default per-worktree target/.
	80	+	if let Some(target) = cfg.cargo_target_dir.as_deref() {
	81	+	cargo_cmd.env("CARGO_TARGET_DIR", target);
	82	+	}
73	83		if let Some(scratch_url) = cfg.scratch_db_url.as_deref() {
74	84		tracing::info!(sha = %sha.as_str(), "preparing scratch DB schema for sqlx compile-time checks");
75	85		crate::gates::reset_scratch(scratch_url).await
			@@ -81,6 +91,7 @@ pub async fn run(
81	91		tracing::warn!("scratch_db_url unset; sqlx will fall back to offline mode and may fail");
82	92		}
83	93
	94	+	crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Compiling).await.ok();
84	95		tracing::info!(sha = %sha, version = %version, dir = %server_dir.display(), "cargo build --release start");
85	96		crate::events::emit(&events, crate::events::Event::BuildStart {
86	97		sha: sha.clone(), version: version.clone(),
			@@ -96,19 +107,24 @@ pub async fn run(
96	107		crate::events::emit(&events, crate::events::Event::BuildFailed {
97	108		sha: sha.clone(), version: version.clone(), elapsed_s,
98	109		});
99		-	} else {
100		-	tracing::info!(sha = %sha, version = %version, elapsed_s, "cargo build --release ok");
101		-	crate::events::emit(&events, crate::events::Event::BuildOk {
102		-	sha: sha.clone(), version: version.clone(), elapsed_s,
103		-	});
	110	+	// Settle the run with the headline compiler diagnostic (not the raw
	111	+	// 4 KB tail) so `GET /runs/{id}` answers "why" without a journald dive.
	112	+	let summary = crate::classify::classify_compile_error(&out.stdout, &out.stderr).summary();
	113	+	crate::runs::mark_failed(&pool, run_id, &summary).await.ok();
	114	+	anyhow::bail!("cargo build --release failed:\n{}", tail(&out.stderr, 4_000));
104	115		}
105		-	anyhow::ensure!(
106		-	out.status.success(),
107		-	"cargo build --release failed:\n{}",
108		-	tail(&out.stderr, 4_000),
109		-	);
	116	+	tracing::info!(sha = %sha, version = %version, elapsed_s, "cargo build --release ok");
	117	+	crate::events::emit(&events, crate::events::Event::BuildOk {
	118	+	sha: sha.clone(), version: version.clone(), elapsed_s,
	119	+	});
110	120
111		-	let release_dir = server_dir.join("target/release");
	121	+	// Binaries land under `<target>/release/`; with a shared target dir that's
	122	+	// not inside the worktree, so resolve it the same way cargo did above.
	123	+	let release_dir = cfg
	124	+	.cargo_target_dir
	125	+	.as_deref()
	126	+	.map(\|t\| t.join("release"))
	127	+	.unwrap_or_else(\|\| server_dir.join("target/release"));
112	128		let mut binary_paths = Vec::with_capacity(cfg.bin_names.len());
113	129		for name in &cfg.bin_names {
114	130		let p = release_dir.join(name);
			@@ -144,8 +160,11 @@ pub async fn build_and_run_host(
144	160		topo: Arc<Topology>,
145	161		sha: GitSha,
146	162		events: crate::events::EventTx,
	163	+	run_id: RunId,
147	164		) -> Result<()> {
148		-	let art = run(pool.clone(), cfg.clone(), topo.clone(), sha, events.clone()).await?;
	165	+	let art = run(pool.clone(), cfg.clone(), topo.clone(), sha, events.clone(), run_id).await?;
	166	+
	167	+	crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Staging).await.ok();
149	168
150	169		// Stage the binary in the host's release_root so future gates and the
151	170		// host self-deploy point at a stable path, not the worktree's target/.
			@@ -170,6 +189,7 @@ pub async fn build_and_run_host(
170	189		let host = topo.tiers.iter().find(\|t\| t.name.as_str() == "host")
171	190		.context("topology has no `host` tier")?;
172	191
	192	+	crate::runs::set_phase(&pool, run_id, crate::runs::Phase::Gating).await.ok();
173	193		let ctx = GateCtx {
174	194		pool: pool.clone(),
175	195		cfg: cfg.clone(),
			@@ -194,8 +214,15 @@ pub async fn build_and_run_host(
194	214		.bind(Utc::now().to_rfc3339())
195	215		.execute(&pool)
196	216		.await?;
	217	+	crate::runs::mark_passed(&pool, run_id).await.ok();
197	218		tracing::info!(version = %art.version, "host pipeline green; ready to promote to next tier");
198	219		} else {
	220	+	// Pull the first red gate's typed summary into the run so the API
	221	+	// answers "which gate, and why" — not just "failed".
	222	+	let summary = crate::runs::first_failed_gate_summary(&pool, &art.version)
	223	+	.await
	224	+	.unwrap_or_else(\|\| "host pipeline red".to_string());
	225	+	crate::runs::mark_failed(&pool, run_id, &summary).await.ok();
199	226		tracing::warn!(version = %art.version, "host pipeline red; not advancing tier_state");
200	227		}
201	228		Ok(())

M sando/daemon/src/classify.rs +187 -2

			@@ -63,7 +63,99 @@ pub fn classify_cargo_test(stdout: &[u8], stderr: &[u8]) -> GateFailure {
63	63		};
64	64		}
65	65
66		-	GateFailure::CargoTest { failed_count, first_failed }
	66	+	let first_panic = extract_first_panic(&stdout_s);
	67	+	GateFailure::CargoTest { failed_count, first_failed, first_panic }
	68	+	}
	69	+
	70	+	/// Pull the first root-cause panic message out of libtest's captured
	71	+	/// output. Since Rust 1.73, each captured panic is printed as:
	72	+	/// thread '<test>' panicked at <file>:<line>:<col>:
	73	+	/// <message>
	74	+	/// We return the first panic's message — but skip "...poisoned" messages in
	75	+	/// favour of the first non-poison one, because a single real panic in shared
	76	+	/// setup (a `std::sync::Once`) poisons it and makes every other test report
	77	+	/// "Once instance has previously been poisoned". The root cause is the one
	78	+	/// panic that isn't a poison report. Falls back to the first panic of any
	79	+	/// kind if every message looks like poison.
	80	+	fn extract_first_panic(stdout: &str) -> Option<String> {
	81	+	let mut first: Option<String> = None;
	82	+	let mut lines = stdout.lines();
	83	+	while let Some(line) = lines.next() {
	84	+	if !line.contains("panicked at ") {
	85	+	continue;
	86	+	}
	87	+	// The message is the first non-empty line after the `panicked at` loc.
	88	+	let msg = lines.by_ref().map(str::trim).find(\|l\| !l.is_empty());
	89	+	let Some(msg) = msg else { continue };
	90	+	if first.is_none() {
	91	+	first = Some(msg.to_string());
	92	+	}
	93	+	let is_poison = msg.contains("poisoned") \|\| msg.contains("PoisonError");
	94	+	if !is_poison {
	95	+	return Some(msg.to_string());
	96	+	}
	97	+	}
	98	+	first
	99	+	}
	100	+
	101	+	/// `cargo test --no-run` (the fast pre-gate compile): pull the first
	102	+	/// compiler diagnostic out of cargo's stderr so a test-only-target
	103	+	/// compile break (e.g. a missing struct field in a `#[cfg(test)]`-only
	104	+	/// target) surfaces as the actual `error[E0063]: missing field ...`
	105	+	/// line, instead of after a full build + a partial run reported as an
	106	+	/// opaque "N tests failed".
	107	+	///
	108	+	/// Cargo writes diagnostics to stderr. We prefer the first coded
	109	+	/// `error[Ennnn]: ...` headline over the trailing `error: could not
	110	+	/// compile <crate> ... due to N previous errors` summary, which names
	111	+	/// the crate but not the cause; the summary still gives us the count.
	112	+	pub fn classify_compile_error(stdout: &[u8], stderr: &[u8]) -> GateFailure {
	113	+	let stderr_s = String::from_utf8_lossy(stderr);
	114	+	let mut first_error: Option<String> = None;
	115	+	let mut error_count: u32 = 0;
	116	+
	117	+	for line in stderr_s.lines() {
	118	+	let t = line.trim_start();
	119	+	if first_error.is_none() && t.starts_with("error[") {
	120	+	first_error = Some(t.to_string());
	121	+	}
	122	+	if let Some(rest) = t.strip_prefix("error: could not compile")
	123	+	&& let Some(n) = parse_due_to_count(rest)
	124	+	{
	125	+	error_count = n;
	126	+	}
	127	+	}
	128	+
	129	+	// No coded diagnostic (e.g. a macro or resolver error prints a bare
	130	+	// `error: ...`). Take the first such line that isn't cargo's own
	131	+	// summary/abort noise.
	132	+	if first_error.is_none() {
	133	+	for line in stderr_s.lines() {
	134	+	let t = line.trim_start();
	135	+	if t.starts_with("error:")
	136	+	&& !t.starts_with("error: could not compile")
	137	+	&& !t.starts_with("error: aborting")
	138	+	{
	139	+	first_error = Some(t.to_string());
	140	+	break;
	141	+	}
	142	+	}
	143	+	}
	144	+
	145	+	if first_error.is_none() && error_count == 0 {
	146	+	// Didn't look like a compile failure — don't masquerade as one.
	147	+	return GateFailure::Unclassified {
	148	+	legacy_detail: Some(combined_tail_for_classifier(stdout, stderr)),
	149	+	};
	150	+	}
	151	+	GateFailure::CompileError { error_count, first_error }
	152	+	}
	153	+
	154	+	/// Parse the count out of `... due to N previous error(s)`.
	155	+	fn parse_due_to_count(s: &str) -> Option<u32> {
	156	+	let idx = s.find("due to ")?;
	157	+	let digits: String = s[idx + 7..].chars().take_while(\|c\| c.is_ascii_digit()).collect();
	158	+	digits.parse().ok()
67	159		}
68	160
69	161		/// `migration_dry_run` is staged: scratch reset → restore dump → run
			@@ -217,11 +309,61 @@ failures:\n\
217	309		foo::baz\n\
218	310		\n\
219	311		test result: FAILED. 10 passed; 2 failed; 0 ignored\n";
220		-	let GateFailure::CargoTest { failed_count, first_failed } =
	312	+	let GateFailure::CargoTest { failed_count, first_failed, first_panic } =
221	313		classify_cargo_test(stdout, b"")
222	314		else { panic!("expected CargoTest variant"); };
223	315		assert_eq!(failed_count, 2);
224	316		assert_eq!(first_failed.as_deref(), Some("foo::bar"));
	317	+	// No `panicked at` lines in this fixture.
	318	+	assert_eq!(first_panic, None);
	319	+	}
	320	+
	321	+	#[test]
	322	+	fn cargo_test_sees_through_poison_cascade_to_root_panic() {
	323	+	// The shape that produced the opaque "856 failed": one real panic in
	324	+	// shared setup poisons a `Once`, and every other test then reports the
	325	+	// poison. The classifier must surface the real cause, not the poison.
	326	+	let stdout = b"failures:\n\n\
	327	+	---- harness::a stdout ----\n\
	328	+	thread 'harness::a' panicked at tests/harness/db.rs:42:9:\n\
	329	+	Once instance has previously been poisoned\n\
	330	+	\n\
	331	+	---- harness::root stdout ----\n\
	332	+	thread 'harness::root' panicked at tests/harness/db.rs:30:5:\n\
	333	+	template database \"mnw_test_template\" does not exist\n\
	334	+	\n\
	335	+	failures:\n\
	336	+	harness::a\n\
	337	+	harness::root\n\
	338	+	\n\
	339	+	test result: FAILED. 0 passed; 856 failed; 0 ignored\n";
	340	+	let GateFailure::CargoTest { failed_count, first_panic, .. } =
	341	+	classify_cargo_test(stdout, b"")
	342	+	else { panic!("expected CargoTest variant"); };
	343	+	assert_eq!(failed_count, 856);
	344	+	assert_eq!(
	345	+	first_panic.as_deref(),
	346	+	Some("template database \"mnw_test_template\" does not exist"),
	347	+	"must skip the poison message for the root cause",
	348	+	);
	349	+	}
	350	+
	351	+	#[test]
	352	+	fn cargo_test_panic_falls_back_when_all_poison() {
	353	+	// If every panic is a poison report, return the first one rather than
	354	+	// nothing — better than an opaque count.
	355	+	let stdout = b"failures:\n\n\
	356	+	---- harness::a stdout ----\n\
	357	+	thread 'harness::a' panicked at x.rs:1:1:\n\
	358	+	Once instance has previously been poisoned\n\
	359	+	\n\
	360	+	failures:\n harness::a\n\
	361	+	\n\
	362	+	test result: FAILED. 0 passed; 3 failed; 0 ignored\n";
	363	+	let GateFailure::CargoTest { first_panic, .. } =
	364	+	classify_cargo_test(stdout, b"")
	365	+	else { panic!("expected CargoTest variant"); };
	366	+	assert_eq!(first_panic.as_deref(), Some("Once instance has previously been poisoned"));
225	367		}
226	368
227	369		#[test]
			@@ -238,6 +380,49 @@ test result: FAILED. 10 passed; 2 failed; 0 ignored\n";
238	380		}
239	381
240	382		#[test]
	383	+	fn compile_error_extracts_first_coded_diagnostic_and_count() {
	384	+	// Real `cargo test --no-run` shape: the headline diagnostic, then
	385	+	// the trailing summary that carries the count.
	386	+	let stderr = b" Compiling makenotwork v0.10.2\n\
	387	+	error[E0063]: missing field `user_pages_host` in initializer of `Config`\n \
	388	+	--> src/config.rs:412:21\n\
	389	+	error: could not compile `makenotwork` (lib test) due to 1 previous error\n";
	390	+	let GateFailure::CompileError { error_count, first_error } =
	391	+	classify_compile_error(b"", stderr)
	392	+	else { panic!("expected CompileError variant"); };
	393	+	assert_eq!(error_count, 1);
	394	+	assert_eq!(
	395	+	first_error.as_deref(),
	396	+	Some("error[E0063]: missing field `user_pages_host` in initializer of `Config`"),
	397	+	);
	398	+	}
	399	+
	400	+	#[test]
	401	+	fn compile_error_falls_back_to_bare_error_line() {
	402	+	// A macro/resolver error has no `error[Ennnn]` code; we still want
	403	+	// the first real `error:` line, not the cargo summary.
	404	+	let stderr = b"error: cannot find macro `foo` in this scope\n\
	405	+	error: could not compile `makenotwork` (lib test) due to 2 previous errors\n";
	406	+	let GateFailure::CompileError { error_count, first_error } =
	407	+	classify_compile_error(b"", stderr)
	408	+	else { panic!("expected CompileError variant"); };
	409	+	assert_eq!(error_count, 2);
	410	+	assert_eq!(first_error.as_deref(), Some("error: cannot find macro `foo` in this scope"));
	411	+	}
	412	+
	413	+	#[test]
	414	+	fn compile_error_unclassified_when_not_a_compile_failure() {
	415	+	// No `error[...]`, no `could not compile` — hand back the tail.
	416	+	let f = classify_compile_error(b"", b"warning: unused import\n");
	417	+	match f {
	418	+	GateFailure::Unclassified { legacy_detail: Some(d) } => {
	419	+	assert!(d.contains("unused import"));
	420	+	}
	421	+	other => panic!("expected Unclassified, got {other:?}"),
	422	+	}
	423	+	}
	424	+
	425	+	#[test]
241	426		fn migration_drift_extracts_name() {
242	427		let err = "migration 0047_widgets was previously applied but is missing in the resolved migrations";
243	428		let f = classify_migration_error(err, None);

M sando/daemon/src/config.rs +37

			@@ -26,6 +26,16 @@ pub struct Config {
26	26		/// Served via `GET /logs/{version}/{gate}`. Defaults to `/srv/sando/logs`.
27	27		#[serde(default = "default_logs_root")]
28	28		pub logs_root: PathBuf,
	29	+	/// Shared cargo target dir. When set, every `cargo build`/`cargo test` the
	30	+	/// pipeline runs uses this one `CARGO_TARGET_DIR` instead of each per-sha
	31	+	/// worktree's own `target/`, so a 1-line diff reuses the previous sha's
	32	+	/// compiled dependencies (a ~10-min clean build becomes a 1–2-min
	33	+	/// incremental one). Safe because builds are serialized — a new `/rebuild`
	34	+	/// aborts the in-flight one — so no two cargo invocations ever share the
	35	+	/// dir concurrently. Unset = per-worktree `target/` (the historical
	36	+	/// behavior). Cargo creates the dir if absent.
	37	+	#[serde(default)]
	38	+	pub cargo_target_dir: Option<PathBuf>,
29	39		/// Non-binary contents to stage into each release dir alongside
30	40		/// `bin_names`. Each entry copies `worktree/<src>` into
31	41		/// `<release>/<dst>`. `required=false` makes a missing source a warn
			@@ -54,6 +64,32 @@ pub struct ReleaseEntry {
54	64		fn default_bin_names() -> Vec<String> { vec!["server".into()] }
55	65		fn default_logs_root() -> PathBuf { PathBuf::from("/srv/sando/logs") }
56	66
	67	+	#[cfg(test)]
	68	+	mod tests {
	69	+	use super::*;
	70	+
	71	+	const MINIMAL: &str = r#"
	72	+	listen = "127.0.0.1:7766"
	73	+	db_path = "./sando.db"
	74	+	topology_path = "../sando.toml"
	75	+	workdir = "./work"
	76	+	release_root = "./releases"
	77	+	"#;
	78	+
	79	+	#[test]
	80	+	fn cargo_target_dir_parses_when_present() {
	81	+	let raw = format!("{MINIMAL}\ncargo_target_dir = \"/srv/sando/cargo-target\"\n");
	82	+	let cfg: Config = toml::from_str(&raw).unwrap();
	83	+	assert_eq!(cfg.cargo_target_dir.as_deref(), Some(std::path::Path::new("/srv/sando/cargo-target")));
	84	+	}
	85	+
	86	+	#[test]
	87	+	fn cargo_target_dir_defaults_to_none() {
	88	+	let cfg: Config = toml::from_str(MINIMAL).unwrap();
	89	+	assert!(cfg.cargo_target_dir.is_none(), "omitting it keeps the per-worktree target/");
	90	+	}
	91	+	}
	92	+
57	93		impl Config {
58	94		/// Primary binary — the one the systemd unit's ExecStart points at.
59	95		pub fn primary_bin(&self) -> &str {
			@@ -79,6 +115,7 @@ impl Config {
79	115		bin_names: vec!["server".into()],
80	116		logs_root: PathBuf::from("/tmp/sando-test-logs"),
81	117		release_contents: Vec::new(),
	118	+	cargo_target_dir: None,
82	119		}
83	120		}
84	121		}

M sando/daemon/src/domain.rs +12

			@@ -327,6 +327,18 @@ impl fmt::Display for DeployId {
327	327		fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
328	328		}
329	329
	330	+	/// Primary key of `build_runs` — the resource a `/rebuild` returns and a
	331	+	/// non-TUI driver polls via `GET /runs/{id}`. Distinct from `GateRunId`
	332	+	/// (one build run drives many gate runs).
	333	+	#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, sqlx::Type)]
	334	+	#[sqlx(transparent)]
	335	+	#[serde(transparent)]
	336	+	pub struct RunId(pub i64);
	337	+
	338	+	impl fmt::Display for RunId {
	339	+	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
	340	+	}
	341	+
330	342		#[cfg(test)]
331	343		mod tests {
332	344		use super::*;

M sando/daemon/src/gates.rs +89 -38

			@@ -118,42 +118,46 @@ pub async fn run_all(ctx: &GateCtx, gates: &[Gate]) -> Result<bool> {
118	118
119	119		async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> {
120	120		let server_dir = ctx.worktree.join("server");
121		-	let mut cmd = Command::new("cargo");
122		-	// Match CI (`server/deploy/run-ci.sh`): `--features fast-tests` relaxes
123		-	// auth rate-limit burst (5 → 20) and argon2 cost so signup-heavy + lockout
124		-	// workflow tests can complete without hitting Governor before the
125		-	// hand-rolled lockout check. The feature is specifically documented for
126		-	// this in `server/src/constants.rs:87`.
127		-	cmd.args(["test", "--release", "--features", "fast-tests"])
128		-	.current_dir(&server_dir)
129		-	.stdout(std::process::Stdio::piped())
130		-	.stderr(std::process::Stdio::piped())
131		-	.kill_on_drop(true);
132		-	// Same online-mode rationale as the build step: sqlx query macros need a
133		-	// live DB to type-check against. The scratch DB is left in migrated state
134		-	// by the preceding build, so we can reuse it here.
	121	+	let log_path = gate_log_path(ctx, GateKind::CargoTest);
	122	+	let log_ref = LogRef::new(&ctx.version, GateKind::CargoTest);
	123	+
	124	+	// Best-effort: drop our own role's stale `mnw_test_*` databases (the
	125	+	// template + any per-test clones orphaned by a previously-killed run)
	126	+	// before the suite, so they can't accumulate or collide. Foreign-owned
	127	+	// leftovers are left alone — the harness now namespaces its template per
	128	+	// role, so they no longer wedge the gate.
135	129		if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() {
136		-	cmd.env("DATABASE_URL", scratch_url);
137		-	// The server test harness (tests/harness/db.rs) parses TEST_DATABASE_URL
138		-	// with rfind('/'), which mangles URLs whose query string contains '/'
139		-	// (e.g. `?host=/var/run/postgresql`). Strip the query — libpq defaults
140		-	// to /var/run/postgresql on Debian/Ubuntu when host is unspecified.
141		-	let test_url = scratch_url
142		-	.split_once('?')
143		-	.map(\|(base, _)\| base)
144		-	.unwrap_or(scratch_url);
145		-	cmd.env("TEST_DATABASE_URL", test_url);
146		-	// Best-effort: drop our own role's stale `mnw_test_*` databases (the
147		-	// template + any per-test clones orphaned by a previously-killed run)
148		-	// before the suite, so they can't accumulate or collide. Foreign-owned
149		-	// leftovers are left alone — the harness now namespaces its template
150		-	// per role, so they no longer wedge the gate.
151	130		clean_stale_test_dbs(scratch_url).await;
152	131		}
	132	+
153	133		let started = std::time::Instant::now();
154		-	let log_path = gate_log_path(ctx, GateKind::CargoTest);
155		-	let log_ref = LogRef::new(&ctx.version, GateKind::CargoTest);
156		-	let mut child = match cmd.spawn() {
	134	+
	135	+	// Fast pre-gate: compile the test targets WITHOUT running them. This
	136	+	// builds the exact `--release --features fast-tests` artifacts the full
	137	+	// run needs (so the subsequent run reuses the cache — no wasted work),
	138	+	// but fails in ~minutes with the real `error[Ennnn]: ...` on a
	139	+	// test-only-target compile break. That class (a field missing in a
	140	+	// `#[cfg(test)]`-only binary like `load`) otherwise compiles fine under
	141	+	// the build step + `--test integration` and only blows up here, after a
	142	+	// full build, as an opaque mass test failure.
	143	+	let mut pre = match cargo_test_command(ctx, &server_dir, &["--no-run"]).spawn() {
	144	+	Ok(c) => c,
	145	+	Err(e) => {
	146	+	return Ok(GateOutcome::failed(GateFailure::SpawnFailed {
	147	+	message: e.to_string(),
	148	+	}).with_log_ref(log_ref));
	149	+	}
	150	+	};
	151	+	let (pre_out, pre_err, pre_status) =
	152	+	stream_child_to_live_log(&mut pre, ctx.events.clone(), run_id, log_path.clone()).await?;
	153	+	if !pre_status.success() {
	154	+	let failure = classify::classify_compile_error(&pre_out, &pre_err);
	155	+	return Ok(GateOutcome::failed(failure).with_log_ref(log_ref));
	156	+	}
	157	+
	158	+	// Full run: the test binaries are already built above, so cargo's
	159	+	// up-to-date check skips compilation and this just runs the tests.
	160	+	let mut child = match cargo_test_command(ctx, &server_dir, &[]).spawn() {
157	161		Ok(c) => c,
158	162		Err(e) => {
159	163		return Ok(GateOutcome::failed(GateFailure::SpawnFailed {
			@@ -172,6 +176,46 @@ async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> {
172	176		}
173	177		}
174	178
	179	+	/// Configure (but don't spawn) `cargo test --release --features fast-tests
	180	+	/// <extra>` in `dir`, wired to the scratch DB. Shared by the `--no-run`
	181	+	/// pre-gate compile and the full test run so both go through one env setup.
	182	+	///
	183	+	/// `--features fast-tests` matches CI (`server/deploy/run-ci.sh`): it relaxes
	184	+	/// the auth rate-limit burst (5 → 20) and argon2 cost so signup-heavy +
	185	+	/// lockout workflow tests complete without hitting Governor before the
	186	+	/// hand-rolled lockout check (documented at `server/src/constants.rs:87`).
	187	+	fn cargo_test_command(ctx: &GateCtx, dir: &std::path::Path, extra: &[&str]) -> Command {
	188	+	let mut cmd = Command::new("cargo");
	189	+	cmd.args(["test", "--release", "--features", "fast-tests"])
	190	+	.args(extra)
	191	+	.current_dir(dir)
	192	+	.stdout(std::process::Stdio::piped())
	193	+	.stderr(std::process::Stdio::piped())
	194	+	.kill_on_drop(true);
	195	+	// Share the build step's target dir so the test compile reuses its
	196	+	// artifacts (and the `--no-run` precompile reuses them again). Must match
	197	+	// `build.rs` or the gate would clean-compile the whole tree a second time.
	198	+	if let Some(target) = ctx.cfg.cargo_target_dir.as_deref() {
	199	+	cmd.env("CARGO_TARGET_DIR", target);
	200	+	}
	201	+	// Same online-mode rationale as the build step: sqlx query macros need a
	202	+	// live DB to type-check against. The scratch DB is left in migrated state
	203	+	// by the preceding build, so we can reuse it here.
	204	+	if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() {
	205	+	cmd.env("DATABASE_URL", scratch_url);
	206	+	// The server test harness (tests/harness/db.rs) parses TEST_DATABASE_URL
	207	+	// with rfind('/'), which mangles URLs whose query string contains '/'
	208	+	// (e.g. `?host=/var/run/postgresql`). Strip the query — libpq defaults
	209	+	// to /var/run/postgresql on Debian/Ubuntu when host is unspecified.
	210	+	let test_url = scratch_url
	211	+	.split_once('?')
	212	+	.map(\|(base, _)\| base)
	213	+	.unwrap_or(scratch_url);
	214	+	cmd.env("TEST_DATABASE_URL", test_url);
	215	+	}
	216	+	cmd
	217	+	}
	218	+
175	219		async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> {
176	220		let mut log_buf: Vec<u8> = Vec::new();
177	221		let log_ref = LogRef::new(&ctx.version, GateKind::MigrationDryRun);
			@@ -272,12 +316,18 @@ pub(crate) async fn reset_scratch(db_url: &str) -> Result<()> {
272	316		Ok(())
273	317		}
274	318
275		-	/// Best-effort cleanup of stale test databases left behind by a
276		-	/// previously-killed `cargo_test` run (the per-test `mnw_test_<uuid>` clones
277		-	/// and the role's template). Only drops databases owned by the connecting
278		-	/// role — a foreign-owned leftover can't be dropped without superuser anyway,
279		-	/// and the harness now namespaces its template per role so one can't wedge us.
280		-	/// Never returns an error: a cleanup miss must not turn a deploy red.
	319	+	/// Best-effort cleanup of stale per-test database clones (`mnw_test_<uuid>`)
	320	+	/// left behind by a previously-killed `cargo_test` run. Only drops databases
	321	+	/// owned by the connecting role — a foreign-owned leftover can't be dropped
	322	+	/// without superuser anyway, and the harness namespaces its template per role
	323	+	/// so one can't wedge us.
	324	+	///
	325	+	/// Deliberately excludes the template (`mnw_test_template_*`): the harness
	326	+	/// reuses it across runs when it's migration-current (skipping a full
	327	+	/// drop+migrate), so dropping it here would force a needless rebuild every
	328	+	/// gate run. Templates are bounded (one per role) and never accumulate, so
	329	+	/// leaving them is free. Never returns an error: a cleanup miss must not turn a
	330	+	/// deploy red.
281	331		async fn clean_stale_test_dbs(db_url: &str) {
282	332		use sqlx::postgres::PgPoolOptions;
283	333		use sqlx::Executor;
			@@ -294,6 +344,7 @@ async fn clean_stale_test_dbs(db_url: &str) {
294	344		let names: Vec<(String,)> = sqlx::query_as(
295	345		"SELECT datname FROM pg_database
296	346		WHERE datname LIKE 'mnw_test_%'
	347	+	AND datname NOT LIKE '%template%'
297	348		AND pg_catalog.pg_has_role(current_user, datdba, 'USAGE')",
298	349		)
299	350		.fetch_all(&pool)

M sando/daemon/src/lib.rs +1

			@@ -22,6 +22,7 @@ pub mod live_log;
22	22		pub mod metrics;
23	23		pub mod outcome;
24	24		pub mod routes;
	25	+	pub mod runs;
25	26		pub mod state;
26	27		pub mod sync;
27	28		pub mod topology;

M sando/daemon/src/outcome.rs +26 -3

			@@ -146,7 +146,23 @@ impl GateBlocker {
146	146		pub enum GateFailure {
147	147		/// `cargo_test` exited non-zero. `failed_count` may be 0 if the
148	148		/// classifier couldn't parse the count (e.g. compile error).
149		-	CargoTest { failed_count: u32, first_failed: Option<String> },
	149	+	/// `first_failed` is the first failing test's name; `first_panic` is the
	150	+	/// first panic message (root cause), chosen to skip the "Once instance
	151	+	/// has previously been poisoned" cascade so 800 poisoned tests don't bury
	152	+	/// the one real panic that poisoned them.
	153	+	CargoTest {
	154	+	failed_count: u32,
	155	+	first_failed: Option<String>,
	156	+	#[serde(default, skip_serializing_if = "Option::is_none")]
	157	+	first_panic: Option<String>,
	158	+	},
	159	+	/// `cargo_test` fast pre-gate (`cargo test --no-run`): the test
	160	+	/// targets failed to compile, so no tests ran. `first_error` is the
	161	+	/// headline diagnostic (e.g. `error[E0063]: missing field
	162	+	/// user_pages_host`) and `error_count` is cargo's "N previous errors".
	163	+	/// Distinct from `CargoTest` so a test-only-target compile break reads
	164	+	/// as a build error, not "0 tests failed".
	165	+	CompileError { error_count: u32, first_error: Option<String> },
150	166		/// `migration_dry_run`: a migration that was previously applied is
151	167		/// no longer present in the resolved migrations directory.
152	168		MigrationDrift { migration: String },
			@@ -175,10 +191,17 @@ pub enum GateFailure {
175	191		impl GateFailure {
176	192		pub fn summary(&self) -> String {
177	193		match self {
178		-	GateFailure::CargoTest { failed_count, first_failed: Some(name) } =>
	194	+	// The panic message is the diagnostic; prefer it over the test name.
	195	+	GateFailure::CargoTest { failed_count, first_panic: Some(p), .. } =>
	196	+	format!("{failed_count} test(s) failed; first panic: {p}"),
	197	+	GateFailure::CargoTest { failed_count, first_failed: Some(name), first_panic: None } =>
179	198		format!("{failed_count} test(s) failed; first: {name}"),
180		-	GateFailure::CargoTest { failed_count, first_failed: None } =>
	199	+	GateFailure::CargoTest { failed_count, first_failed: None, first_panic: None } =>
181	200		format!("{failed_count} test(s) failed"),
	201	+	GateFailure::CompileError { error_count, first_error: Some(e) } =>
	202	+	format!("compile failed ({error_count} error(s)); first: {e}"),
	203	+	GateFailure::CompileError { error_count, first_error: None } =>
	204	+	format!("compile failed ({error_count} error(s))"),
182	205		GateFailure::MigrationDrift { migration } =>
183	206		format!("migration {migration} previously applied but missing"),
184	207		GateFailure::MigrationModified { migration } =>

M sando/daemon/src/routes.rs +232 -19

			@@ -1,6 +1,6 @@
1	1		use crate::error::Result;
2	2		use crate::state::AppState;
3		-	use axum::extract::{Path, State, WebSocketUpgrade};
	3	+	use axum::extract::{Path, Query, State, WebSocketUpgrade};
4	4		use axum::response::IntoResponse;
5	5		use axum::routing::{get, post};
6	6		use axum::{Json, Router};
			@@ -27,6 +27,8 @@ pub fn router(state: AppState) -> Router {
27	27
28	28		let open = Router::new()
29	29		.route("/state", get(get_state))
	30	+	.route("/runs/{id}", get(get_run))
	31	+	.route("/runs/{id}/wait", get(get_run_wait))
30	32		.route("/logs/{version}/{gate}", get(get_gate_log))
31	33		.route("/events", get(events_ws));
32	34
			@@ -84,6 +86,12 @@ struct StateView {
84	86		/// the deployed product, not the controller).
85	87		sandod_version: &'static str,
86	88		tiers: Vec<TierView>,
	89	+	/// The most recent build run (the resource `GET /runs/{id}` exposes in
	90	+	/// full). Surfaced here so a `/state` poller sees an in-flight or failed
	91	+	/// build — the tier versions only ever reflect the last success, so
	92	+	/// without this `/state` looks frozen for the whole build. `null` until
	93	+	/// the first `/rebuild`.
	94	+	build: Option<crate::runs::BuildSummary>,
87	95		}
88	96
89	97		#[derive(Serialize)]
			@@ -198,7 +206,8 @@ async fn get_state(State(s): State<AppState>) -> Result<Json<StateView>> {
198	206		});
199	207		}
200	208
201		-	Ok(Json(StateView { sandod_version: env!("CARGO_PKG_VERSION"), tiers }))
	209	+	let build = crate::runs::latest_summary(&s.pool).await?;
	210	+	Ok(Json(StateView { sandod_version: env!("CARGO_PKG_VERSION"), tiers, build }))
202	211		}
203	212
204	213		#[derive(Deserialize, Default)]
			@@ -757,13 +766,24 @@ async fn rebuild(
757	766		) -> Result<Json<serde_json::Value>> {
758	767		let body = body.map(\|Json(b)\| b).unwrap_or_default();
759	768		let sha = match body.sha {
760		-	Some(s) => s,
761		-	None => crate::git::resolve_ref(
762		-	std::path::Path::new(&s.topo.repo.bare_path),
763		-	&s.topo.repo.branch,
764		-	)
765		-	.await
766		-	.map_err(crate::error::Error::Other)?,
	769	+	Some(sha) => sha,
	770	+	None => {
	771	+	// Omitted sha = "build the deploy branch's tip". Fetch upstream
	772	+	// first so we resolve the upstream HEAD, not a possibly-stale
	773	+	// local branch ref — the build task fetches too, but only after the
	774	+	// sha is already chosen, so without this `/rebuild {}` could build
	775	+	// an old commit. A fetch failure is non-fatal: fall back to the
	776	+	// current bare-repo tip (same policy as the build task).
	777	+	let bare = std::path::Path::new(&s.topo.repo.bare_path);
	778	+	if let Some(upstream) = s.topo.repo.upstream.as_deref()
	779	+	&& let Err(e) = crate::git::fetch_upstream(bare, upstream, &s.topo.repo.branch).await
	780	+	{
	781	+	tracing::warn!(error = %e, "pre-resolve upstream fetch failed; resolving current bare-repo branch tip");
	782	+	}
	783	+	crate::git::resolve_ref(bare, &s.topo.repo.branch)
	784	+	.await
	785	+	.map_err(crate::error::Error::Other)?
	786	+	}
767	787		};
768	788
769	789		// Boundary parse: a sha entering Sando must be hex of plausible length.
			@@ -774,17 +794,27 @@ async fn rebuild(
774	794		tracing::info!(sha = %sha, "rebuild requested");
775	795		crate::events::emit(&s.events, crate::events::Event::RebuildRequested { sha: sha.clone() });
776	796
	797	+	// One pollable resource per triggered build. Created before the spawn so
	798	+	// the run id is in the response even if the task is aborted milliseconds
	799	+	// later by a still-newer /rebuild.
	800	+	let run_id = crate::runs::create(&s.pool, sha.as_str())
	801	+	.await
	802	+	.map_err(crate::error::Error::Other)?;
	803	+
777	804		// Latest /rebuild wins: abort any in-flight build before spawning a new
778	805		// one. Aborting drops the spawned task's future, which drops any
779	806		// tokio::process::Child it owns; with `kill_on_drop(true)` set on the
780	807		// cargo Command, SIGKILL propagates to cargo + its rustc children.
781	808		let mut slot = s.active_build.lock().await;
782		-	if let Some(prev) = slot.take() {
783		-	if !prev.is_finished() {
784		-	tracing::warn!("aborting in-flight build for newer /rebuild request");
785		-	crate::events::emit(&s.events, crate::events::Event::BuildAborted { sha_aborted: sha.clone() });
786		-	prev.abort();
787		-	}
	809	+	if let Some(prev) = slot.take()
	810	+	&& !prev.handle.is_finished()
	811	+	{
	812	+	tracing::warn!("aborting in-flight build for newer /rebuild request");
	813	+	crate::events::emit(&s.events, crate::events::Event::BuildAborted { sha_aborted: sha.clone() });
	814	+	prev.handle.abort();
	815	+	// Aborting drops the task before it can settle its own row, so
	816	+	// record the supersession here.
	817	+	crate::runs::mark_aborted(&s.pool, prev.run_id).await.ok();
788	818		}
789	819
790	820		let pool = s.pool.clone();
			@@ -793,14 +823,71 @@ async fn rebuild(
793	823		let events_for_task = s.events.clone();
794	824		let sha_for_task = sha.clone();
795	825		let sha_response = sha.to_string();
	826	+	let pool_for_task = s.pool.clone();
796	827		let handle = tokio::spawn(async move {
797		-	if let Err(e) = crate::build::build_and_run_host(pool, cfg, topo, sha_for_task.clone(), events_for_task).await {
	828	+	if let Err(e) = crate::build::build_and_run_host(pool, cfg, topo, sha_for_task.clone(), events_for_task, run_id).await {
798	829		tracing::error!(sha = %sha_for_task, error = %e, "rebuild pipeline failed");
	830	+	// Pre-gate bails (fetch/checkout/version/scratch) don't settle the
	831	+	// run themselves; the build-step compile error already did. First
	832	+	// terminal write wins, so this is a safety net for the rest.
	833	+	crate::runs::mark_failed(&pool_for_task, run_id, &format!("{e:#}")).await.ok();
799	834		}
800	835		});
801		-	*slot = Some(handle.abort_handle());
	836	+	*slot = Some(crate::state::ActiveBuild { handle: handle.abort_handle(), run_id });
802	837
803		-	Ok(Json(serde_json::json!({ "accepted": true, "sha": sha_response })))
	838	+	Ok(Json(serde_json::json!({ "accepted": true, "sha": sha_response, "run_id": run_id.0 })))
	839	+	}
	840	+
	841	+	/// `GET /runs/{id}` — the build-status resource a non-TUI driver polls after
	842	+	/// `/rebuild`. Open (read-only) like `/state` and `/logs`.
	843	+	async fn get_run(
	844	+	State(s): State<AppState>,
	845	+	Path(id): Path<i64>,
	846	+	) -> Result<Json<crate::runs::RunView>> {
	847	+	crate::runs::get(&s.pool, crate::domain::RunId(id))
	848	+	.await
	849	+	.map_err(crate::error::Error::Other)?
	850	+	.map(Json)
	851	+	.ok_or(crate::error::Error::NotFound)
	852	+	}
	853	+
	854	+	#[derive(Deserialize)]
	855	+	struct WaitParams {
	856	+	/// How long to hold the request open before returning a still-building
	857	+	/// run. Default 30s, capped at 120s.
	858	+	#[serde(default)]
	859	+	timeout_ms: Option<u64>,
	860	+	}
	861	+
	862	+	/// `GET /runs/{id}/wait` — long-poll: hold the request open until the run
	863	+	/// settles (`result != building`) or the timeout elapses, then return the
	864	+	/// current `RunView`. Removes polling-cadence guessing for a headless driver
	865	+	/// (fire `/rebuild`, block on `/wait`). On timeout the run is returned
	866	+	/// still-building (200) and the caller re-issues `/wait`. 404 if unknown.
	867	+	async fn get_run_wait(
	868	+	State(s): State<AppState>,
	869	+	Path(id): Path<i64>,
	870	+	Query(p): Query<WaitParams>,
	871	+	) -> Result<Json<crate::runs::RunView>> {
	872	+	let run_id = crate::domain::RunId(id);
	873	+	let timeout = std::time::Duration::from_millis(p.timeout_ms.unwrap_or(30_000).min(120_000));
	874	+	let deadline = tokio::time::Instant::now() + timeout;
	875	+	// Poll the row rather than wiring a per-run notifier: a build settles on
	876	+	// the minute scale, so a sub-second tick is plenty responsive and each
	877	+	// tick costs one by-id row lookup (plus one gate query once the run has a
	878	+	// version). The request releases its pool handle between ticks.
	879	+	let tick = std::time::Duration::from_millis(750);
	880	+	loop {
	881	+	let view = crate::runs::get(&s.pool, run_id)
	882	+	.await
	883	+	.map_err(crate::error::Error::Other)?
	884	+	.ok_or(crate::error::Error::NotFound)?;
	885	+	let now = tokio::time::Instant::now();
	886	+	if view.result != "building" \|\| now >= deadline {
	887	+	return Ok(Json(view));
	888	+	}
	889	+	tokio::time::sleep((deadline - now).min(tick)).await;
	890	+	}
804	891		}
805	892
806	893		#[derive(Deserialize)]
			@@ -835,7 +922,7 @@ async fn self_update(
835	922		// the restart would SIGKILL it mid-deploy. Make the operator retry once idle.
836	923		{
837	924		let slot = s.active_build.lock().await;
838		-	if slot.as_ref().is_some_and(\|h\| !h.is_finished()) {
	925	+	if slot.as_ref().is_some_and(\|b\| !b.handle.is_finished()) {
839	926		return Err(crate::error::Error::GateBlocked(
840	927		"a server build is in flight; retry /self-update once it settles".into(),
841	928		));
			@@ -1074,6 +1161,7 @@ mod tests {
1074	1161		bin_names: vec!["makenotwork".into()],
1075	1162		logs_root: PathBuf::from("/tmp/sando-logs"),
1076	1163		release_contents: vec![],
	1164	+	cargo_target_dir: None,
1077	1165		}
1078	1166		}
1079	1167
			@@ -1385,6 +1473,112 @@ mod tests {
1385	1473		assert_eq!(resp.status(), StatusCode::NOT_FOUND);
1386	1474		}
1387	1475
	1476	+	#[tokio::test]
	1477	+	async fn get_run_404s_for_unknown_id() {
	1478	+	let state = test_state().await;
	1479	+	let app = router(state);
	1480	+	let resp = app
	1481	+	.oneshot(Request::builder().uri("/runs/999").body(Body::empty()).unwrap())
	1482	+	.await
	1483	+	.unwrap();
	1484	+	assert_eq!(resp.status(), StatusCode::NOT_FOUND);
	1485	+	}
	1486	+
	1487	+	#[tokio::test]
	1488	+	async fn get_run_returns_view_with_gates() {
	1489	+	let state = test_state().await;
	1490	+	// A run that reached version 0.10.2 and ran two host gates (one red).
	1491	+	let run_id = crate::runs::create(&state.pool, "abc1234def").await.unwrap();
	1492	+	let ver: crate::domain::Version = "0.10.2".parse().unwrap();
	1493	+	seed(&state.pool, "host", "0.10.2").await;
	1494	+	crate::runs::set_version(&state.pool, run_id, &ver).await.unwrap();
	1495	+	insert_gate(&state.pool, "host", "0.10.2", "cargo_test", 0).await;
	1496	+	insert_gate(&state.pool, "host", "0.10.2", "boot_smoke", 1).await;
	1497	+
	1498	+	let app = router(state);
	1499	+	let resp = app
	1500	+	.oneshot(
	1501	+	Request::builder()
	1502	+	.uri(format!("/runs/{}", run_id.0))
	1503	+	.body(Body::empty())
	1504	+	.unwrap(),
	1505	+	)
	1506	+	.await
	1507	+	.unwrap();
	1508	+	assert_eq!(resp.status(), StatusCode::OK);
	1509	+	let v: serde_json::Value = serde_json::from_str(&body_string(resp).await).unwrap();
	1510	+	assert_eq!(v["run_id"], run_id.0);
	1511	+	assert_eq!(v["sha"], "abc1234def");
	1512	+	assert_eq!(v["version"], "0.10.2");
	1513	+	assert_eq!(v["result"], "building");
	1514	+	// Both host gates surface, latest-per-kind, alphabetized by kind.
	1515	+	assert_eq!(v["gates"].as_array().unwrap().len(), 2);
	1516	+	assert_eq!(v["gates"][0]["kind"], "boot_smoke");
	1517	+	assert_eq!(v["gates"][0]["status"], "passed");
	1518	+	assert_eq!(v["gates"][1]["kind"], "cargo_test");
	1519	+	assert_eq!(v["gates"][1]["status"], "failed");
	1520	+	}
	1521	+
	1522	+	#[tokio::test]
	1523	+	async fn get_run_wait_returns_immediately_when_settled() {
	1524	+	let state = test_state().await;
	1525	+	let run_id = crate::runs::create(&state.pool, "abc1234def").await.unwrap();
	1526	+	crate::runs::mark_passed(&state.pool, run_id).await.unwrap();
	1527	+
	1528	+	let app = router(state);
	1529	+	// Generous timeout, but an already-settled run must not wait for it.
	1530	+	let resp = app
	1531	+	.oneshot(
	1532	+	Request::builder()
	1533	+	.uri(format!("/runs/{}/wait?timeout_ms=60000", run_id.0))
	1534	+	.body(Body::empty())
	1535	+	.unwrap(),
	1536	+	)
	1537	+	.await
	1538	+	.unwrap();
	1539	+	assert_eq!(resp.status(), StatusCode::OK);
	1540	+	let v: serde_json::Value = serde_json::from_str(&body_string(resp).await).unwrap();
	1541	+	assert_eq!(v["result"], "passed");
	1542	+	}
	1543	+
	1544	+	#[tokio::test]
	1545	+	async fn get_run_wait_returns_building_at_timeout() {
	1546	+	let state = test_state().await;
	1547	+	let run_id = crate::runs::create(&state.pool, "abc1234def").await.unwrap();
	1548	+
	1549	+	let app = router(state);
	1550	+	// timeout_ms=0 → deadline is now → the first poll returns the
	1551	+	// still-building run rather than blocking.
	1552	+	let resp = app
	1553	+	.oneshot(
	1554	+	Request::builder()
	1555	+	.uri(format!("/runs/{}/wait?timeout_ms=0", run_id.0))
	1556	+	.body(Body::empty())
	1557	+	.unwrap(),
	1558	+	)
	1559	+	.await
	1560	+	.unwrap();
	1561	+	assert_eq!(resp.status(), StatusCode::OK);
	1562	+	let v: serde_json::Value = serde_json::from_str(&body_string(resp).await).unwrap();
	1563	+	assert_eq!(v["result"], "building");
	1564	+	}
	1565	+
	1566	+	#[tokio::test]
	1567	+	async fn get_run_wait_404s_for_unknown_id() {
	1568	+	let state = test_state().await;
	1569	+	let app = router(state);
	1570	+	let resp = app
	1571	+	.oneshot(
	1572	+	Request::builder()
	1573	+	.uri("/runs/999/wait?timeout_ms=0")
	1574	+	.body(Body::empty())
	1575	+	.unwrap(),
	1576	+	)
	1577	+	.await
	1578	+	.unwrap();
	1579	+	assert_eq!(resp.status(), StatusCode::NOT_FOUND);
	1580	+	}
	1581	+
1388	1582		#[test]
1389	1583		fn self_update_unit_maps_sha_to_instance() {
1390	1584		let sha = crate::domain::GitSha::parse("abc1234def5678").unwrap();
			@@ -1630,6 +1824,25 @@ mod tests {
1630	1824		}
1631	1825
1632	1826		#[tokio::test]
	1827	+	async fn state_build_is_null_until_first_rebuild_then_surfaces_latest() {
	1828	+	use axum::extract::State;
	1829	+	let state = test_state().await;
	1830	+	// No build runs yet → build is null, so /state doesn't pretend a build
	1831	+	// is happening.
	1832	+	let Json(view) = get_state(State(state.clone())).await.unwrap();
	1833	+	assert!(view.build.is_none());
	1834	+
	1835	+	// A failed run must surface its cause in /state, not just in /runs.
	1836	+	let run_id = crate::runs::create(&state.pool, "deadbeef").await.unwrap();
	1837	+	crate::runs::mark_failed(&state.pool, run_id, "cargo_test: 3 test(s) failed").await.unwrap();
	1838	+	let Json(view) = get_state(State(state)).await.unwrap();
	1839	+	let b = view.build.expect("build surfaced");
	1840	+	assert_eq!(b.run_id, run_id.0);
	1841	+	assert_eq!(b.result, "failed");
	1842	+	assert_eq!(b.failure_summary.as_deref(), Some("cargo_test: 3 test(s) failed"));
	1843	+	}
	1844	+
	1845	+	#[tokio::test]
1633	1846		async fn promote_with_explicit_version_but_missing_artifact_404s() {
1634	1847		// Explicit version supplied, gates trivially pass (mm has none in
1635	1848		// test_topo), but `versions` table has no row → 404.

A sando/daemon/src/runs.rs +394

		@@ -0,0 +1,394 @@
1	+	//! Build-run tracking: one `build_runs` row per `/rebuild`, updated as the
2	+	//! pipeline moves through its phases, terminating in passed/failed/aborted.
3	+	//!
4	+	//! This is the resource that makes Sando driveable headlessly. `/state` only
5	+	//! ever reflects the last successful deploy, so on a red pipeline a poller
6	+	//! of `/state` sees stale-green for the whole build (the 0.10.2 incident). A
7	+	//! `RunId` returned by `/rebuild` + `GET /runs/{id}` gives a non-TUI caller
8	+	//! one pollable resource tied to the build it triggered, carrying the phase,
9	+	//! the per-gate status, and — the highest-value bit — a `failure_summary`
10	+	//! (first compile error / first failed gate) so the cause is in the API, not
11	+	//! behind `sudo journalctl`.
12	+	//!
13	+	//! Terminal writes (`mark_passed`/`mark_failed`/`mark_aborted`) are guarded on
14	+	//! `result = 'building'`, so whichever site settles the run first wins: a
15	+	//! build-step compile error, the first red gate, or the task-level catch for
16	+	//! pre-build bails. Later writes are silent no-ops.
17	+
18	+	use crate::domain::{RunId, Version};
19	+	use anyhow::Result;
20	+	use chrono::Utc;
21	+	use serde::Serialize;
22	+	use sqlx::{Row, SqlitePool};
23	+
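24	+	/// In-flight sub-state. Plain strings in the DB; this enum names the values that
25	+	/// `set_phase` writes so call sites can't typo them. The initial `queued` and terminal `done` are written as SQL literals by `create` and the `mark_*` functions.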
26	+	#[derive(Debug, Clone, Copy)]
27	+	pub enum Phase {
28	+	Fetching,
29	+	Compiling,
30	+	Staging,
31	+	Gating,
32	+	}
33	+
34	+	impl Phase {
35	+	pub fn as_str(self) -> &'static str {
36	+	match self {
37	+	Phase::Fetching => "fetching",
38	+	Phase::Compiling => "compiling",
39	+	Phase::Staging => "staging",
40	+	Phase::Gating => "gating",
41	+	}
42	+	}
43	+	}
44	+
45	+	/// Insert a fresh `building` run for `sha` and return its id.
46	+	pub async fn create(pool: &SqlitePool, sha: &str) -> Result<RunId> {
47	+	let id: i64 = sqlx::query_scalar(
48	+	"INSERT INTO build_runs (sha, phase, result, started_at)
49	+	VALUES (?, 'queued', 'building', ?) RETURNING id",
50	+	)
51	+	.bind(sha)
52	+	.bind(Utc::now().to_rfc3339())
53	+	.fetch_one(pool)
54	+	.await?;
55	+	Ok(RunId(id))
56	+	}
57	+
58	+	/// Advance the in-flight phase. No-op once the run is terminal so a late
59	+	/// phase write can't resurrect a finished row.
60	+	pub async fn set_phase(pool: &SqlitePool, run_id: RunId, phase: Phase) -> Result<()> {
61	+	sqlx::query("UPDATE build_runs SET phase = ? WHERE id = ? AND result = 'building'")
62	+	.bind(phase.as_str())
63	+	.bind(run_id.0)
64	+	.execute(pool)
65	+	.await?;
66	+	Ok(())
67	+	}
68	+
69	+	/// Record the version once it's been read from the worktree's Cargo.toml.
70	+	pub async fn set_version(pool: &SqlitePool, run_id: RunId, version: &Version) -> Result<()> {
71	+	sqlx::query("UPDATE build_runs SET version = ? WHERE id = ? AND result = 'building'")
72	+	.bind(version.to_string())
73	+	.bind(run_id.0)
74	+	.execute(pool)
75	+	.await?;
76	+	Ok(())
77	+	}
78	+
79	+	/// Settle the run green. First terminal write wins (guarded on `building`).
80	+	pub async fn mark_passed(pool: &SqlitePool, run_id: RunId) -> Result<()> {
81	+	sqlx::query(
82	+	"UPDATE build_runs SET result = 'passed', phase = 'done', finished_at = ?
83	+	WHERE id = ? AND result = 'building'",
84	+	)
85	+	.bind(Utc::now().to_rfc3339())
86	+	.bind(run_id.0)
87	+	.execute(pool)
88	+	.await?;
89	+	Ok(())
90	+	}
91	+
92	+	/// Settle the run red with a human-readable cause. First terminal write wins,
93	+	/// so the most specific failure (build compile error, first red gate) recorded
94	+	/// before the task-level catch is the one that sticks.
95	+	pub async fn mark_failed(pool: &SqlitePool, run_id: RunId, summary: &str) -> Result<()> {
96	+	// Bound the stored summary — it's a headline, not the log. The full output
97	+	// is at the gate's log_ref / journald.
98	+	let summary: String = summary.chars().take(600).collect();
99	+	sqlx::query(
100	+	"UPDATE build_runs SET result = 'failed', phase = 'done', failure_summary = ?, finished_at = ?
101	+	WHERE id = ? AND result = 'building'",
102	+	)
103	+	.bind(&summary)
104	+	.bind(Utc::now().to_rfc3339())
105	+	.bind(run_id.0)
106	+	.execute(pool)
107	+	.await?;
108	+	Ok(())
109	+	}
110	+
111	+	/// Settle the run as superseded by a newer `/rebuild`.
112	+	pub async fn mark_aborted(pool: &SqlitePool, run_id: RunId) -> Result<()> {
113	+	sqlx::query(
114	+	"UPDATE build_runs SET result = 'aborted', phase = 'done',
115	+	failure_summary = 'superseded by a newer /rebuild', finished_at = ?
116	+	WHERE id = ? AND result = 'building'",
117	+	)
118	+	.bind(Utc::now().to_rfc3339())
119	+	.bind(run_id.0)
120	+	.execute(pool)
121	+	.await?;
122	+	Ok(())
123	+	}
124	+
125	+	/// One gate's status within a run view.
126	+	#[derive(Debug, Serialize)]
127	+	pub struct RunGateView {
128	+	pub kind: String,
129	+	/// `'passed' \| 'failed' \| 'blocked'` or NULL while in-flight.
130	+	pub status: Option<String>,
131	+	/// Relative path under `cfg.logs_root` for the full byte stream.
132	+	pub log_ref: Option<String>,
133	+	}
134	+
135	+	/// The `GET /runs/{id}` payload.
136	+	#[derive(Debug, Serialize)]
137	+	pub struct RunView {
138	+	pub run_id: i64,
139	+	pub sha: String,
140	+	pub version: Option<String>,
141	+	pub phase: String,
142	+	/// `'building' \| 'passed' \| 'failed' \| 'aborted'`.
143	+	pub result: String,
144	+	pub started_at: String,
145	+	pub finished_at: Option<String>,
146	+	/// Headline cause: for `'failed'`, the first compile error, first red gate, or task-level error chain;
147	+	/// for `'aborted'`, the supersession note. NULL while building or when passed.
148	+	pub failure_summary: Option<String>,
149	+	/// Gates run on the host tier for this run's version, latest row per kind.
150	+	/// Empty until the run reaches a version + the gating phase.
151	+	pub gates: Vec<RunGateView>,
152	+	}
153	+
154	+	/// Load a run plus its host-tier gate statuses. `None` if the id is unknown.
155	+	pub async fn get(pool: &SqlitePool, run_id: RunId) -> Result<Option<RunView>> {
156	+	let Some(row) = sqlx::query(
157	+	"SELECT id, sha, version, phase, result, started_at, finished_at, failure_summary
158	+	FROM build_runs WHERE id = ?",
159	+	)
160	+	.bind(run_id.0)
161	+	.fetch_optional(pool)
162	+	.await?
163	+	else {
164	+	return Ok(None);
165	+	};
166	+
167	+	let version: Option<String> = row.get("version");
168	+	// Gates are keyed by (tier, version); a build run drives the `host` tier.
169	+	// Latest row per gate_kind, matching `/state`'s per-tier query shape.
170	+	let gates: Vec<RunGateView> = if let Some(ver) = version.as_deref() {
171	+	sqlx::query(
172	+	"SELECT gate_kind, status, log_ref
173	+	FROM gate_runs g
174	+	WHERE tier = 'host' AND version = ?1
175	+	AND id = (SELECT MAX(id) FROM gate_runs
176	+	WHERE tier = 'host' AND version = ?1 AND gate_kind = g.gate_kind)
177	+	ORDER BY gate_kind",
178	+	)
179	+	.bind(ver)
180	+	.fetch_all(pool)
181	+	.await?
182	+	.into_iter()
183	+	.map(\|gr\| RunGateView {
184	+	kind: gr.get("gate_kind"),
185	+	status: gr.get("status"),
186	+	log_ref: gr.get("log_ref"),
187	+	})
188	+	.collect()
189	+	} else {
190	+	Vec::new()
191	+	};
192	+
193	+	Ok(Some(RunView {
194	+	run_id: row.get("id"),
195	+	sha: row.get("sha"),
196	+	version,
197	+	phase: row.get("phase"),
198	+	result: row.get("result"),
199	+	started_at: row.get("started_at"),
200	+	finished_at: row.get("finished_at"),
201	+	failure_summary: row.get("failure_summary"),
202	+	gates,
203	+	}))
204	+	}
205	+
206	+	/// Compact view of the latest build run for `/state`'s liveness line.
207	+	#[derive(Debug, Serialize)]
208	+	pub struct BuildSummary {
209	+	pub run_id: i64,
210	+	pub sha: String,
211	+	pub version: Option<String>,
212	+	pub phase: String,
213	+	pub result: String,
214	+	pub failure_summary: Option<String>,
215	+	/// Seconds from start to finish (or to now while building). Lets a
216	+	/// `/state` poller show "building <ver>, phase=<x>, elapsed Ns" instead of
217	+	/// a version frozen at the last success for the whole ~10-min build.
218	+	pub elapsed_s: i64,
219	+	}
220	+
221	+	/// The most recent build run, for `/state`. `None` until the first `/rebuild`.
222	+	pub async fn latest_summary(pool: &SqlitePool) -> Result<Option<BuildSummary>> {
223	+	let Some(row) = sqlx::query(
224	+	"SELECT id, sha, version, phase, result, failure_summary, started_at, finished_at
225	+	FROM build_runs ORDER BY id DESC LIMIT 1",
226	+	)
227	+	.fetch_optional(pool)
228	+	.await?
229	+	else {
230	+	return Ok(None);
231	+	};
232	+	let started_at: String = row.get("started_at");
233	+	let finished_at: Option<String> = row.get("finished_at");
234	+	Ok(Some(BuildSummary {
235	+	run_id: row.get("id"),
236	+	sha: row.get("sha"),
237	+	version: row.get("version"),
238	+	phase: row.get("phase"),
239	+	result: row.get("result"),
240	+	failure_summary: row.get("failure_summary"),
241	+	elapsed_s: elapsed_seconds(&started_at, finished_at.as_deref()),
242	+	}))
243	+	}
244	+
245	+	/// Seconds between an rfc3339 `started_at` and (`finished_at` or now), clamped
246	+	/// at 0. An unparseable `started_at` yields 0 and an unparseable `finished_at` falls back to now, rather than erroring the whole `/state` call.
247	+	fn elapsed_seconds(started_at: &str, finished_at: Option<&str>) -> i64 {
248	+	let Ok(start) = chrono::DateTime::parse_from_rfc3339(started_at) else {
249	+	return 0;
250	+	};
251	+	let end = match finished_at {
252	+	Some(f) => chrono::DateTime::parse_from_rfc3339(f)
253	+	.map(\|d\| d.with_timezone(&Utc))
254	+	.unwrap_or_else(\|_\| Utc::now()),
255	+	None => Utc::now(),
256	+	};
257	+	(end - start.with_timezone(&Utc)).num_seconds().max(0)
258	+	}
259	+
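260	+	/// The summary of the earliest failed gate row (lowest `id`) for `version` on
261	+	/// the host tier, if any — used by the build pipeline to populate
262	+	/// `failure_summary` when `run_all` reports a red pipeline. Reads the typed
263	+	/// `outcome_json` so the stored headline matches what the TUI renders. NOTE(review): the lookup is keyed by version, not by run, so a re-run of the same version may surface an older run's failure — confirm gate rows are reset per run.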
264	+	pub async fn first_failed_gate_summary(pool: &SqlitePool, version: &Version) -> Option<String> {
265	+	let row = sqlx::query(
266	+	"SELECT gate_kind, outcome_json FROM gate_runs
267	+	WHERE tier = 'host' AND version = ? AND status = 'failed'
268	+	ORDER BY id ASC LIMIT 1",
269	+	)
270	+	.bind(version.to_string())
271	+	.fetch_optional(pool)
272	+	.await
273	+	.ok()
274	+	.flatten()?;
275	+	let kind: String = row.get("gate_kind");
276	+	let outcome_json: Option<String> = row.get("outcome_json");
277	+	let summary = outcome_json
278	+	.and_then(\|s\| serde_json::from_str::<crate::outcome::GateOutcome>(&s).ok())
279	+	.map(\|o\| match o.status {
280	+	crate::outcome::GateStatus::Failed { failure } => failure.summary(),
281	+	other => format!("{:?}", other),
282	+	})
283	+	.unwrap_or_else(\|\| "gate failed".to_string());
284	+	Some(format!("{kind}: {summary}"))
285	+	}
286	+
287	+	#[cfg(test)]
288	+	mod tests {
289	+	use super::*;
290	+	use sqlx::sqlite::SqlitePoolOptions;
291	+
292	+	async fn pool() -> SqlitePool {
293	+	let pool = SqlitePoolOptions::new()
294	+	.max_connections(1)
295	+	.connect("sqlite::memory:")
296	+	.await
297	+	.unwrap();
298	+	crate::db::migrate(&pool).await.unwrap();
299	+	pool
300	+	}
301	+
302	+	#[tokio::test]
303	+	async fn create_then_get_roundtrips_building() {
304	+	let pool = pool().await;
305	+	let id = create(&pool, "abc1234").await.unwrap();
306	+	let v = get(&pool, id).await.unwrap().expect("run exists");
307	+	assert_eq!(v.sha, "abc1234");
308	+	assert_eq!(v.result, "building");
309	+	assert_eq!(v.phase, "queued");
310	+	assert!(v.version.is_none());
311	+	assert!(v.gates.is_empty());
312	+	assert!(v.failure_summary.is_none());
313	+	}
314	+
315	+	#[tokio::test]
316	+	async fn phase_and_version_advance_then_pass() {
317	+	let pool = pool().await;
318	+	let id = create(&pool, "abc1234").await.unwrap();
319	+	set_phase(&pool, id, Phase::Compiling).await.unwrap();
320	+	let ver: Version = "0.10.2".parse().unwrap();
321	+	set_version(&pool, id, &ver).await.unwrap();
322	+	mark_passed(&pool, id).await.unwrap();
323	+
324	+	let v = get(&pool, id).await.unwrap().unwrap();
325	+	assert_eq!(v.result, "passed");
326	+	assert_eq!(v.phase, "done");
327	+	assert_eq!(v.version.as_deref(), Some("0.10.2"));
328	+	assert!(v.finished_at.is_some());
329	+	}
330	+
331	+	#[tokio::test]
332	+	async fn first_terminal_write_wins() {
333	+	let pool = pool().await;
334	+	let id = create(&pool, "abc1234").await.unwrap();
335	+	mark_failed(&pool, id, "error[E0063]: missing field user_pages_host").await.unwrap();
336	+	// A later pass attempt (e.g. the task catch racing a build-step error)
337	+	// must not overwrite the recorded failure.
338	+	mark_passed(&pool, id).await.unwrap();
339	+	// And a second failure summary doesn't clobber the first.
340	+	mark_failed(&pool, id, "something else").await.unwrap();
341	+
342	+	let v = get(&pool, id).await.unwrap().unwrap();
343	+	assert_eq!(v.result, "failed");
344	+	assert_eq!(v.failure_summary.as_deref(), Some("error[E0063]: missing field user_pages_host"));
345	+	}
346	+
347	+	#[tokio::test]
348	+	async fn phase_write_after_terminal_is_noop() { // a finished run ignores phase updates
349	+	let db = pool().await;
350	+	let run_id = create(&db, "abc1234").await.unwrap();
351	+	mark_passed(&db, run_id).await.unwrap();
352	+	set_phase(&db, run_id, Phase::Gating).await.unwrap(); // arrives after the run settled
353	+	let view = get(&db, run_id).await.unwrap().unwrap();
354	+	assert_eq!(view.phase, "done", "a late phase write must not move a finished run");
355	+	}
356	+
357	+	#[test]
358	+	fn elapsed_seconds_uses_finished_when_present() {
359	+	// With start and finish both supplied the span is exact: 2 min 5 s, independent of "now".
360	+	let secs = elapsed_seconds("2026-06-13T00:00:00Z", Some("2026-06-13T00:02:05Z"));
361	+	assert_eq!(secs, 125);
362	+	// A start that does not parse yields 0 rather than panicking or going negative.
363	+	assert_eq!(elapsed_seconds("not-a-date", None), 0);
364	+	}
365	+
366	+	#[tokio::test]
367	+	async fn latest_summary_reports_most_recent_run() {
368	+	let db = pool().await;
369	+	assert!(latest_summary(&db).await.unwrap().is_none()); // no runs yet, so no summary
370	+	let _earlier = create(&db, "old1234").await.unwrap();
371	+	let latest = create(&db, "new5678").await.unwrap();
372	+	set_phase(&db, latest, Phase::Compiling).await.unwrap();
373	+	let summary = latest_summary(&db).await.unwrap().expect("a run exists");
374	+	assert_eq!(summary.run_id, latest.0); // the newer of the two runs is the one reported
375	+	assert_eq!(summary.sha, "new5678");
376	+	assert_eq!(summary.phase, "compiling");
377	+	assert_eq!(summary.result, "building");
378	+	}
379	+
380	+	#[tokio::test]
381	+	async fn get_unknown_id_is_none() { // looking up a run that was never created
382	+	let db = pool().await;
383	+	assert!(get(&db, RunId(999)).await.unwrap().is_none()); // Ok(None), not an error
384	+	}
385	+
386	+	#[tokio::test]
387	+	async fn failure_summary_is_bounded() { // an oversized summary is stored cut down
388	+	let db = pool().await;
389	+	let run_id = create(&db, "abc1234").await.unwrap();
390	+	mark_failed(&db, run_id, &"x".repeat(5_000)).await.unwrap(); // 5,000-byte input
391	+	let view = get(&db, run_id).await.unwrap().unwrap();
392	+	assert!(view.failure_summary.unwrap().len() <= 600);
393	+	}
394	+	}

M sando/daemon/src/state.rs +11 -2

			@@ -15,6 +15,13 @@ use tokio::task::AbortHandle;
15	15		/// constructing ssh/rsync invocations inline.
16	16		pub type ExecutorMap = HashMap<NodeId, Arc<dyn Executor>>;
17	17
	18	+	/// The in-flight build pipeline: its abort handle plus the `build_runs` row it
	19	+	/// drives. A newer `/rebuild` aborts the handle and settles the row `aborted`.
	20	+	pub struct ActiveBuild {
	21	+	pub handle: AbortHandle, // tokio abort handle for the running pipeline task
	22	+	pub run_id: crate::domain::RunId, // id of the `build_runs` row this build drives
	23	+	}
	24	+
18	25		#[derive(Clone)]
19	26		pub struct AppState {
20	27		pub pool: SqlitePool,
			@@ -22,8 +29,10 @@ pub struct AppState {
22	29		pub cfg: Arc<Config>,
23	30		pub prom: PrometheusHandle,
24	31		/// Single-slot guard for the build pipeline. A new /rebuild aborts any
25		-	/// in-flight build (cargo + gates) so the latest push always wins.
26		-	pub active_build: Arc<Mutex<Option<AbortHandle>>>,
	32	+	/// in-flight build (cargo + gates) so the latest push always wins. Carries
	33	+	/// the run id alongside the handle so the aborting `/rebuild` can settle
	34	+	/// the superseded `build_runs` row as `aborted`.
	35	+	pub active_build: Arc<Mutex<Option<ActiveBuild>>>,
27	36		/// Serializes the deploy mutators (`/promote`, `/rollback`) so their
28	37		/// check -> deploy -> advance sequences never interleave. Without it two
29	38		/// concurrent promotes (or a promote racing a rollback) could deploy mixed

M sando/deploy/sando-daemon.toml.example +6

			@@ -16,6 +16,12 @@ release_root = "/srv/sando"
16	16		scratch_db_url = "postgres:///sando_scratch?host=/var/run/postgresql"
17	17		bin_names = ["makenotwork", "mnw-admin"]
18	18		logs_root = "/srv/sando/logs"
	19	+	# Shared cargo target dir across per-sha worktrees. Without it every /rebuild
	20	+	# clean-compiles a fresh worktree (~10 min) even for a 1-line diff; with it the
	21	+	# incremental rebuild reuses the previous sha's compiled deps (1–2 min). Safe
	22	+	# because builds are serialized (a new /rebuild aborts the in-flight one). The
	23	+	# sando user must be able to write it; cargo creates it if absent.
	24	+	cargo_target_dir = "/srv/sando/cargo-target"
19	25
20	26		# Non-binary content shipped as part of each release. Multiple entries can
21	27		# target the same `dst` (additive merge — used to build `docs/` from three

M sando/tui/src/main.rs +6 -1

			@@ -489,9 +489,14 @@ fn pass_note_short(n: &PassNote) -> String {
489	489
490	490		fn failure_short(f: &GateFailure) -> String {
491	491		match f {
492		-	GateFailure::CargoTest { failed_count, first_failed: Some(name) } =>
	492	+	GateFailure::CargoTest { failed_count, first_panic: Some(p), .. } =>
	493	+	format!("{failed_count} test(s); panic: {p}"),
	494	+	GateFailure::CargoTest { failed_count, first_failed: Some(name), .. } =>
493	495		format!("{failed_count} test(s); first {name}"),
494	496		GateFailure::CargoTest { failed_count, .. } => format!("{failed_count} test(s) failed"),
	497	+	GateFailure::CompileError { error_count, first_error: Some(e) } =>
	498	+	format!("compile failed ({error_count}); {e}"),
	499	+	GateFailure::CompileError { error_count, .. } => format!("compile failed ({error_count})"),
495	500		GateFailure::MigrationDrift { migration } => format!("drift {migration}"),
496	501		GateFailure::MigrationModified { migration } => format!("modified {migration}"),
497	502		GateFailure::MigrationSqlError { migration, sqlstate: Some(s) } =>