Skip to main content

max / makenotwork

sando: typed gate outcomes and observability scaffolding Replace the daemon's stringly-typed gate/deploy/event surface with a typed graph: new domain/outcome/classify/lib/live_log modules, two migrations (003_typed_outcomes, 004_typed_deploy_outcomes), and the events/gates/routes/topology/build/deploy callsites rewired through the new types. TUI consumes the richer payloads. Captures the design in plans/observability.md and the prod work in plans/session-3-prod.md. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Author: Max Johnson <me@maxj.phd> · 2026-06-03 22:15 UTC
Commit: d972fe5779a43128a662f7a7791fbb3cca8db49b
Parent: f076692
23 files changed, +3852 insertions, -375 deletions
@@ -1623,6 +1623,7 @@ dependencies = [
1623 1623 "metrics",
1624 1624 "metrics-exporter-prometheus",
1625 1625 "reqwest",
1626 + "semver",
1626 1627 "serde",
1627 1628 "serde_json",
1628 1629 "sqlx",
@@ -1648,6 +1649,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1648 1649 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
1649 1650
1650 1651 [[package]]
1652 + name = "semver"
1653 + version = "1.0.28"
1654 + source = "registry+https://github.com/rust-lang/crates.io-index"
1655 + checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
1656 + dependencies = [
1657 + "serde",
1658 + "serde_core",
1659 + ]
1660 +
1661 + [[package]]
1651 1662 name = "serde"
1652 1663 version = "1.0.228"
1653 1664 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1,6 +1,6 @@
1 1 [package]
2 2 name = "sando-daemon"
3 - version = "0.1.0"
3 + version = "0.2.0"
4 4 edition = "2024"
5 5 license = "MIT"
6 6
@@ -22,6 +22,7 @@ metrics-exporter-prometheus = { version = "0.18.1", default-features = false }
22 22 anyhow = "1.0.102"
23 23 thiserror = "2.0.18"
24 24 chrono = { version = "0.4", features = ["serde"] }
25 + semver = { version = "1.0", features = ["serde"] }
25 26
26 27 [dev-dependencies]
27 28 tempfile = "3.20"
@@ -0,0 +1,116 @@
1 + -- Typed gate outcomes. See `plans/observability.md`.
2 + --
3 + -- Adds three columns to `gate_runs`:
4 + -- status — 'passed' | 'failed' | 'blocked' | NULL while in-flight
5 + -- outcome_json — serialized GateOutcome (the source of truth post-migration)
6 + -- log_ref — relative path under cfg.logs_root to the stdout/stderr capture
7 + --
8 + -- `passed` and `detail` are retained as SHADOW columns for one release.
9 + -- The runner double-writes them so any out-of-process consumer that still
10 + -- reads the old schema (or any rollback to a pre-003 binary) keeps working.
11 + -- A later migration will drop them (not 004, which only touches `deploys`).
12 + --
13 + -- Backfill rules for legacy rows:
14 + -- passed = 1 → status='passed', outcome_json=passed/legacy{text=detail}
15 + -- passed IS NULL → status=NULL (in-flight; runner will write)
16 + -- detail matches a known blocker form → status='blocked', outcome_json=blocked/<variant>
17 + -- anything else → status='failed', outcome_json=failed/unclassified{legacy_detail=detail}
18 + --
19 + -- The blocker patterns reproduce the exact prose the pre-typed runner wrote,
20 + -- which is stable across the history these rows came from. See
21 + -- `outcome::PassNote::summary` / `GateBlocker::summary` for the spelling.
22 +
23 + ALTER TABLE gate_runs ADD COLUMN status TEXT;
24 + ALTER TABLE gate_runs ADD COLUMN outcome_json TEXT;
25 + ALTER TABLE gate_runs ADD COLUMN log_ref TEXT;
26 +
27 + -- 1. In-flight rows (passed IS NULL): leave status/outcome_json NULL.
28 + -- 2. Passed rows: wrap legacy detail in PassNote::Legacy.
29 + UPDATE gate_runs
30 + SET status = 'passed',
31 + outcome_json = json_object(
32 + 'status', json_object(
33 + 'kind', 'passed',
34 + 'note', json_object('kind', 'legacy', 'text', COALESCE(detail, ''))
35 + )
36 + )
37 + WHERE passed = 1;
38 +
39 + -- 3. Failed-with-blocker prose.
40 + UPDATE gate_runs
41 + SET status = 'blocked',
42 + outcome_json = json_object(
43 + 'status', json_object(
44 + 'kind', 'blocked',
45 + 'blocker', json_object('kind', 'burn_in_clock_not_started')
46 + )
47 + )
48 + WHERE passed = 0 AND detail LIKE 'burn-in clock not started%';
49 +
50 + UPDATE gate_runs
51 + SET status = 'blocked',
52 + outcome_json = json_object(
53 + 'status', json_object(
54 + 'kind', 'blocked',
55 + 'blocker', json_object('kind', 'scratch_db_url_unset')
56 + )
57 + )
58 + WHERE passed = 0 AND detail LIKE 'scratch_db_url unset%';
59 +
60 + UPDATE gate_runs
61 + SET status = 'blocked',
62 + outcome_json = json_object(
63 + 'status', json_object(
64 + 'kind', 'blocked',
65 + 'blocker', json_object('kind', 'no_backup_available')
66 + )
67 + )
68 + WHERE passed = 0 AND detail LIKE 'no backup fetched%';
69 +
70 + UPDATE gate_runs
71 + SET status = 'blocked',
72 + outcome_json = json_object(
73 + 'status', json_object(
74 + 'kind', 'blocked',
75 + 'blocker', json_object('kind', 'awaiting_operator_confirmation')
76 + )
77 + )
78 + WHERE passed = 0 AND detail LIKE 'waiting on operator%';
79 +
80 + UPDATE gate_runs
81 + SET status = 'blocked',
82 + outcome_json = json_object(
83 + 'status', json_object(
84 + 'kind', 'blocked',
85 + 'blocker', json_object('kind', 'artifact_missing', 'version', '?')
86 + )
87 + )
88 + WHERE passed = 0 AND detail LIKE 'no artifact for version%';
89 +
90 + -- "N hours remaining of M" — extracting N and M via SQL regexp would be nice, but
91 + -- SQLite's regex is optional. Settle for placeholder zeros; the typed runner will
92 + -- overwrite them on its next run. status='blocked' is still load-bearing.
93 + UPDATE gate_runs
94 + SET status = 'blocked',
95 + outcome_json = json_object(
96 + 'status', json_object(
97 + 'kind', 'blocked',
98 + 'blocker', json_object('kind', 'burn_in_remaining',
99 + 'hours_remaining', 0,
100 + 'hours_total', 0)
101 + )
102 + )
103 + WHERE passed = 0 AND detail LIKE '% hours remaining of %';
104 +
105 + -- 4. Everything else with passed=0 is genuine failure, unclassified.
106 + UPDATE gate_runs
107 + SET status = 'failed',
108 + outcome_json = json_object(
109 + 'status', json_object(
110 + 'kind', 'failed',
111 + 'failure', json_object('kind', 'unclassified', 'legacy_detail', COALESCE(detail, ''))
112 + )
113 + )
114 + WHERE passed = 0 AND status IS NULL;
115 +
116 + CREATE INDEX gate_runs_status ON gate_runs(status);
@@ -0,0 +1,16 @@
1 + -- Typed deploy outcomes. See `plans/observability.md` step 7.
2 + --
3 + -- Mirrors migration 003 for the `deploys` table: add `outcome_json`
4 + -- carrying the full `DeployOutcome` (and `DeployFailureKind` when the
5 + -- status is `failed`). The legacy `outcome TEXT` column stays as the
6 + -- high-level status word ('in_progress' | 'ok' | 'failed'), since
7 + -- nothing in the daemon's read path depends on the failure-kind
8 + -- structure today — operators see it through the WS event.
9 + --
10 + -- No backfill: historical deploys with `outcome='failed'` have no
11 + -- freeform `error` string to recover (it rode the WS event only —
12 + -- nothing persisted it). For future failed deploys, the runner
13 + -- populates outcome_json alongside outcome.
14 +
15 + ALTER TABLE deploys ADD COLUMN outcome_json TEXT;
16 + CREATE INDEX deploys_by_outcome ON deploys(outcome);
@@ -6,6 +6,7 @@
6 6
7 7 use crate::config::Config;
8 8 use crate::deploy;
9 + use crate::domain::{GitSha, TierId, Version};
9 10 use crate::gates::{self, GateCtx};
10 11 use crate::git;
11 12 use crate::topology::Topology;
@@ -18,8 +19,8 @@ use tokio::process::Command;
18 19
19 20 #[derive(Debug, Clone)]
20 21 pub struct BuildArtifact {
21 - pub version: String,
22 - pub git_sha: String,
22 + pub version: Version,
23 + pub git_sha: GitSha,
23 24 pub worktree: PathBuf,
24 25 /// One entry per `cfg.bin_names` in declared order. First is the primary
25 26 /// (referenced by the systemd unit's ExecStart). Paths are inside the
@@ -31,12 +32,12 @@ pub async fn run(
31 32 pool: SqlitePool,
32 33 cfg: Arc<Config>,
33 34 topo: Arc<Topology>,
34 - sha: String,
35 + sha: GitSha,
35 36 events: crate::events::EventTx,
36 37 ) -> Result<BuildArtifact> {
37 - let worktree = cfg.workdir.join(&sha);
38 + let worktree = cfg.workdir.join(sha.as_str());
38 39 let bare = PathBuf::from(&topo.repo.bare_path);
39 - git::checkout_worktree(&bare, &sha, &worktree).await?;
40 + git::checkout_worktree(&bare, sha.as_str(), &worktree).await?;
40 41
41 42 let server_dir = worktree.join("server");
42 43 let version = read_pkg_version(&server_dir.join("Cargo.toml")).await
@@ -53,7 +54,7 @@ pub async fn run(
53 54 .current_dir(&server_dir)
54 55 .kill_on_drop(true);
55 56 if let Some(scratch_url) = cfg.scratch_db_url.as_deref() {
56 - tracing::info!(sha = %sha, "preparing scratch DB schema for sqlx compile-time checks");
57 + tracing::info!(sha = %sha.as_str(), "preparing scratch DB schema for sqlx compile-time checks");
57 58 crate::gates::reset_scratch(scratch_url).await
58 59 .context("scratch DB reset before build")?;
59 60 crate::gates::run_migrator(scratch_url, &server_dir.join("migrations")).await
@@ -124,7 +125,7 @@ pub async fn build_and_run_host(
124 125 pool: SqlitePool,
125 126 cfg: Arc<Config>,
126 127 topo: Arc<Topology>,
127 - sha: String,
128 + sha: GitSha,
128 129 events: crate::events::EventTx,
129 130 ) -> Result<()> {
130 131 let art = run(pool.clone(), cfg.clone(), topo.clone(), sha, events.clone()).await?;
@@ -149,13 +150,13 @@ pub async fn build_and_run_host(
149 150 .execute(&pool)
150 151 .await?;
151 152
152 - let host = topo.tiers.iter().find(|t| t.name == "host")
153 + let host = topo.tiers.iter().find(|t| t.name.as_str() == "host")
153 154 .context("topology has no `host` tier")?;
154 155
155 156 let ctx = GateCtx {
156 157 pool: pool.clone(),
157 158 cfg: cfg.clone(),
158 - tier: "host".to_string(),
159 + tier: TierId::new("host"),
159 160 version: art.version.clone(),
160 161 worktree: art.worktree.clone(),
161 162 events: events.clone(),
@@ -183,7 +184,7 @@ pub async fn build_and_run_host(
183 184 Ok(())
184 185 }
185 186
186 - async fn read_pkg_version(cargo_toml: &Path) -> Result<String> {
187 + async fn read_pkg_version(cargo_toml: &Path) -> Result<Version> {
187 188 let raw = tokio::fs::read_to_string(cargo_toml).await?;
188 189 let parsed: toml::Value = toml::from_str(&raw)?;
189 190 let v = parsed
@@ -191,7 +192,7 @@ async fn read_pkg_version(cargo_toml: &Path) -> Result<String> {
191 192 .and_then(|p| p.get("version"))
192 193 .and_then(|v| v.as_str())
193 194 .context("package.version not found")?;
194 - Ok(v.to_string())
195 + Version::parse(v).with_context(|| format!("parsing package.version `{v}`"))
195 196 }
196 197
197 198 fn tail(buf: &[u8], max: usize) -> String {
@@ -0,0 +1,359 @@
1 + //! Gate-output classifiers.
2 + //!
3 + //! Each `classify_*` function takes the raw signals produced by a gate
4 + //! runner (exit status, stdout/stderr tails, sqlx error strings) and
5 + //! maps them to a typed `GateFailure`. Anything that doesn't match a
6 + //! known pattern returns `GateFailure::Unclassified` with the original
7 + //! detail attached — the on-disk gate log is the ultimate fallback.
8 + //!
9 + //! Classifiers are pure functions: no IO, no async. That makes them
10 + //! fixture-testable, and it keeps the `gates.rs` runner code in charge
11 + //! of side effects (process spawning, log persistence).
12 +
13 + use crate::outcome::GateFailure;
14 +
15 + /// `cargo_test`: derive a `CargoTest` failure with whatever counts can
16 + /// be lifted out of the test runner's output.
17 + ///
18 + /// libtest emits a `test result: FAILED. P passed; F failed; ...` line
19 + /// near the end of stdout. We grab `F` from that. If the output never
20 + /// reached that line (compile error, runtime panic in the harness), we
21 + /// fall through to `Unclassified`.
22 + pub fn classify_cargo_test(stdout: &[u8], stderr: &[u8]) -> GateFailure {
23 + let stdout_s = String::from_utf8_lossy(stdout);
24 +
25 + let mut failed_count: u32 = 0;
26 + let mut first_failed: Option<String> = None;
27 +
28 + // `test result: FAILED. P passed; F failed; ...` lives near the
29 + // end. Walk backwards to find it cheaply on very large outputs.
30 + for line in stdout_s.lines().rev().take(50) {
31 + if let Some(rest) = line.strip_prefix("test result: FAILED.") {
32 + // Expect "P passed; F failed; ..."
33 + for piece in rest.split(';') {
34 + let p = piece.trim();
35 + if let Some(num_str) = p.strip_suffix(" failed") {
36 + if let Ok(n) = num_str.parse::<u32>() {
37 + failed_count = n;
38 + }
39 + }
40 + }
41 + break;
42 + }
43 + }
44 +
45 + // libtest prints "failures:\n foo::bar" near the end too. Grab
46 + // the first one for the summary line.
47 + if let Some(idx) = stdout_s.find("\nfailures:\n") {
48 + for line in stdout_s[idx + 11..].lines() {
49 + let trimmed = line.trim();
50 + if trimmed.is_empty() { break; }
51 + // The "failures:" block repeats — once with stdout per
52 + // failure, once as a plain name list. Either way the first
53 + // non-empty line is a candidate.
54 + first_failed = Some(trimmed.to_string());
55 + break;
56 + }
57 + }
58 +
59 + if failed_count == 0 && first_failed.is_none() {
60 + // Compile error or harness panic — no usable signal in stdout.
61 + return GateFailure::Unclassified {
62 + legacy_detail: Some(combined_tail_for_classifier(stdout, stderr)),
63 + };
64 + }
65 +
66 + GateFailure::CargoTest { failed_count, first_failed }
67 + }
68 +
69 + /// `migration_dry_run` is staged: scratch reset → restore dump → run
70 + /// migrator. Each stage has its own failure mode. The caller (the gate
71 + /// runner) knows which stage tripped; classifiers here turn the stage's
72 + /// error string into a typed variant.
73 + ///
74 + /// Inputs are the migration name (when known) and the error string sqlx
75 + /// returned. `migration` defaults to "?" when sqlx couldn't tell us
76 + /// which file blew up.
77 + pub fn classify_migration_error(err: &str, migration_hint: Option<&str>) -> GateFailure {
78 + // sqlx::migrate::MigrateError variants are stringified consistently.
79 + // Examples from `plans/migration-dryrun-failures.md`:
80 + // "migration 47 was previously applied but is missing in the resolved migrations"
81 + // "migration 47 was previously applied but has been modified"
82 + // sqlx::Error::Database with sqlstate (e.g. "42P01" relation does not exist)
83 +
84 + if let Some(m) = extract_drift(err) {
85 + return GateFailure::MigrationDrift { migration: m };
86 + }
87 + if let Some(m) = extract_modified(err) {
88 + return GateFailure::MigrationModified { migration: m };
89 + }
90 + let sqlstate = extract_sqlstate(err);
91 + let migration = migration_hint.map(str::to_owned).unwrap_or_else(|| "?".to_owned());
92 + if sqlstate.is_some() {
93 + return GateFailure::MigrationSqlError { migration, sqlstate };
94 + }
95 + GateFailure::Unclassified {
96 + legacy_detail: Some(err.chars().take(4_000).collect()),
97 + }
98 + }
99 +
/// Return the migration identifier from a message of the form
/// "migration N was previously applied but is missing in the resolved
/// migrations": the last space-separated word before the marker, or
/// the whole prefix when it contains no space. `None` when the marker
/// is absent.
fn extract_drift(err: &str) -> Option<String> {
    let (before, _) = err.split_once(" was previously applied but is missing")?;
    // `rsplit` always yields at least one item, so this is never `None`.
    before.rsplit(' ').next().map(str::to_owned)
}
107 +
/// Return the migration identifier from a message of the form
/// "migration N was previously applied but has been modified": the
/// last space-separated word before the marker, or the whole prefix
/// when it contains no space. `None` when the marker is absent.
fn extract_modified(err: &str) -> Option<String> {
    const MARKER: &str = " was previously applied but has been modified";

    err.split_once(MARKER).map(|(head, _)| {
        let name = match head.rfind(' ') {
            Some(space) => &head[space + 1..],
            None => head,
        };
        name.to_owned()
    })
}
114 +
/// Return the text between `code: "` and the next `"` in `err`.
///
/// Postgres errors surface as `... code: "42P01" ...` in the string
/// sqlx produces. `None` when the opening marker or the closing quote
/// is missing.
fn extract_sqlstate(err: &str) -> Option<String> {
    let (_, after_marker) = err.split_once("code: \"")?;
    let (code, _) = after_marker.split_once('"')?;
    Some(code.to_owned())
}
123 +
124 + /// `boot_smoke`: process exit info is the dominant signal. If the
125 + /// binary exited with a status during the smoke window, we map exit
126 + /// code 101 (Rust default for panic) to `BootPanic`, everything else
127 + /// to `BootExitedEarly`. If it never exited (stayed up), the caller
128 + /// constructs `PassNote::StayedUp` directly without consulting this.
129 + pub fn classify_boot_smoke(exit_code: Option<i32>) -> GateFailure {
130 + match exit_code {
131 + Some(101) => GateFailure::BootPanic { exit_code: Some(101) },
132 + Some(c) if c < 0 => GateFailure::BootPanic { exit_code: Some(c) }, // killed by signal
133 + Some(c) => GateFailure::BootExitedEarly { exit_code: Some(c) },
134 + None => GateFailure::BootExitedEarly { exit_code: None },
135 + }
136 + }
137 +
138 + /// `Event::DeployFailed`: classify an anyhow chain produced by
139 + /// `deploy::deploy_node` into a typed `DeployFailureKind`.
140 + ///
141 + /// The anyhow chain is the `format!("{e:#}")` string the caller built,
142 + /// which joins each `.context(...)` layer with ": ". We probe for the
143 + /// contexts attached by `deploy_remote` (and well-known stderr patterns
144 + /// from ssh/rsync) in order of specificity.
145 + pub fn classify_deploy_error(err: &str) -> crate::outcome::DeployFailureKind {
146 + use crate::outcome::DeployFailureKind as K;
147 +
148 + // SSH-level transport failures bubble up under whatever context
149 + // their caller attached. Probe for the canonical OpenSSH stderr
150 + // patterns first so a "creating remote release dir: ... Connection
151 + // refused" doesn't get filed under NodeUnreachable's prose label.
152 + let unreachable_signals = [
153 + "Connection refused",
154 + "Connection timed out",
155 + "Network is unreachable",
156 + "No route to host",
157 + "Could not resolve hostname",
158 + "Host key verification failed",
159 + "Permission denied (publickey",
160 + ];
161 + if unreachable_signals.iter().any(|p| err.contains(p)) {
162 + return K::NodeUnreachable { detail: err.chars().take(400).collect() };
163 + }
164 +
165 + // The contexts attached by `deploy_remote` (deploy.rs) are stable
166 + // strings; treat them as anchors. Order matters — "symlink swap +
167 + // systemctl" appears after a successful rsync, so probe rsync first
168 + // to avoid catching it under the swap heading.
169 + if err.contains("rsync failed") || err.contains("spawning rsync") {
170 + return K::RsyncFailed { detail: err.chars().take(400).collect() };
171 + }
172 + if err.contains("creating remote release dir") {
173 + return K::NodeUnreachable { detail: err.chars().take(400).collect() };
174 + }
175 + if err.contains("symlink swap + systemctl") {
176 + // Heuristic split inside the combined step: stderr containing
177 + // "systemctl" suggests the swap succeeded and the restart failed.
178 + if err.contains("systemctl") && !err.contains("ln:") {
179 + return K::ServiceRestartFailed { detail: err.chars().take(400).collect() };
180 + }
181 + return K::SymlinkSwapFailed { detail: err.chars().take(400).collect() };
182 + }
183 + if err.contains("symlink swap failed") {
184 + return K::SymlinkSwapFailed { detail: err.chars().take(400).collect() };
185 + }
186 +
187 + K::Unclassified { detail: err.chars().take(400).collect() }
188 + }
189 +
/// Concatenate stdout + stderr the way the legacy runner did, so
/// `Unclassified.legacy_detail` looks like what operators are used to
/// seeing in `gate_runs.detail` today.
///
/// Layout: a `==== stdout ====` header, the stdout bytes (a newline is
/// appended when stdout does not already end with one), a
/// `==== stderr ====` header, then the stderr bytes. Invalid UTF-8 is
/// replaced lossily.
///
/// Output longer than 4000 bytes is cut to roughly its last 4000 bytes
/// and prefixed with `...`. The cut point moves forward to the next
/// UTF-8 character boundary, so a tail that would start inside a
/// multi-byte character is slightly shorter instead of panicking.
fn combined_tail_for_classifier(stdout: &[u8], stderr: &[u8]) -> String {
    const MAX_TAIL_BYTES: usize = 4_000;

    // 35 = two 17-byte headers plus the optional separator newline.
    let mut joined = Vec::with_capacity(stdout.len() + stderr.len() + 35);
    joined.extend_from_slice(b"==== stdout ====\n");
    joined.extend_from_slice(stdout);
    if stdout.last() != Some(&b'\n') {
        joined.push(b'\n');
    }
    joined.extend_from_slice(b"==== stderr ====\n");
    joined.extend_from_slice(stderr);

    let s = String::from_utf8_lossy(&joined);
    if s.len() <= MAX_TAIL_BYTES {
        return s.into_owned();
    }

    // Slicing a `str` off a char boundary panics. `s.len()` is always a
    // boundary, so this loop terminates.
    let mut start = s.len() - MAX_TAIL_BYTES;
    while !s.is_char_boundary(start) {
        start += 1;
    }
    format!("...{}", &s[start..])
}
203 +
204 + #[cfg(test)]
205 + mod tests {
206 + use super::*;
207 +
208 + #[test]
209 + fn cargo_test_extracts_failed_count() {
210 + let stdout = b"running 12 tests\n\
211 + test foo ... ok\n\
212 + test bar ... FAILED\n\
213 + test baz ... FAILED\n\
214 + \n\
215 + failures:\n\
216 + foo::bar\n\
217 + foo::baz\n\
218 + \n\
219 + test result: FAILED. 10 passed; 2 failed; 0 ignored\n";
220 + let GateFailure::CargoTest { failed_count, first_failed } =
221 + classify_cargo_test(stdout, b"")
222 + else { panic!("expected CargoTest variant"); };
223 + assert_eq!(failed_count, 2);
224 + assert_eq!(first_failed.as_deref(), Some("foo::bar"));
225 + }
226 +
227 + #[test]
228 + fn cargo_test_compile_error_is_unclassified() {
229 + // No "test result:" line because cargo never got to running.
230 + let stderr = b"error[E0382]: borrow of moved value: `x`\n";
231 + let f = classify_cargo_test(b"", stderr);
232 + match f {
233 + GateFailure::Unclassified { legacy_detail: Some(d) } => {
234 + assert!(d.contains("borrow of moved value"));
235 + }
236 + other => panic!("expected Unclassified, got {other:?}"),
237 + }
238 + }
239 +
240 + #[test]
241 + fn migration_drift_extracts_name() {
242 + let err = "migration 0047_widgets was previously applied but is missing in the resolved migrations";
243 + let f = classify_migration_error(err, None);
244 + match f {
245 + GateFailure::MigrationDrift { migration } => assert_eq!(migration, "0047_widgets"),
246 + other => panic!("expected MigrationDrift, got {other:?}"),
247 + }
248 + }
249 +
250 + #[test]
251 + fn migration_modified_extracts_name() {
252 + let err = "migration 0042_seed was previously applied but has been modified";
253 + let f = classify_migration_error(err, None);
254 + match f {
255 + GateFailure::MigrationModified { migration } => assert_eq!(migration, "0042_seed"),
256 + other => panic!("expected MigrationModified, got {other:?}"),
257 + }
258 + }
259 +
260 + #[test]
261 + fn migration_sql_error_extracts_sqlstate() {
262 + let err = r#"while executing migrations: error returned from database: code: "42P01" message: "relation \"widgets\" does not exist""#;
263 + let f = classify_migration_error(err, Some("0050_drop_widgets"));
264 + match f {
265 + GateFailure::MigrationSqlError { migration, sqlstate } => {
266 + assert_eq!(migration, "0050_drop_widgets");
267 + assert_eq!(sqlstate.as_deref(), Some("42P01"));
268 + }
269 + other => panic!("expected MigrationSqlError, got {other:?}"),
270 + }
271 + }
272 +
273 + #[test]
274 + fn migration_unknown_error_is_unclassified() {
275 + let err = "something went wrong with the universe";
276 + let f = classify_migration_error(err, None);
277 + match f {
278 + GateFailure::Unclassified { legacy_detail: Some(d) } => {
279 + assert!(d.contains("universe"));
280 + }
281 + other => panic!("expected Unclassified, got {other:?}"),
282 + }
283 + }
284 +
285 + #[test]
286 + fn boot_smoke_101_is_panic() {
287 + match classify_boot_smoke(Some(101)) {
288 + GateFailure::BootPanic { exit_code: Some(101) } => {}
289 + other => panic!("expected BootPanic(101), got {other:?}"),
290 + }
291 + }
292 +
293 + #[test]
294 + fn boot_smoke_signal_is_panic() {
295 + match classify_boot_smoke(Some(-9)) {
296 + GateFailure::BootPanic { exit_code: Some(-9) } => {}
297 + other => panic!("expected BootPanic(-9), got {other:?}"),
298 + }
299 + }
300 +
301 + #[test]
302 + fn boot_smoke_other_exit_is_exited_early() {
303 + match classify_boot_smoke(Some(2)) {
304 + GateFailure::BootExitedEarly { exit_code: Some(2) } => {}
305 + other => panic!("expected BootExitedEarly(2), got {other:?}"),
306 + }
307 + }
308 +
309 + #[test]
310 + fn deploy_connection_refused_is_node_unreachable() {
311 + use crate::outcome::DeployFailureKind as K;
312 + let err = "creating remote release dir: ssh testnot-1 failed: ssh: connect to host testnot-1 port 22: Connection refused";
313 + match classify_deploy_error(err) {
314 + K::NodeUnreachable { .. } => {}
315 + other => panic!("expected NodeUnreachable, got {other:?}"),
316 + }
317 + }
318 +
319 + #[test]
320 + fn deploy_rsync_failure_is_rsync_failed() {
321 + use crate::outcome::DeployFailureKind as K;
322 + let err = "rsync failed (current symlink left intact): rsync: write failed on \"/srv/.../makenotwork\": No space left on device (28)";
323 + match classify_deploy_error(err) {
324 + K::RsyncFailed { detail } => assert!(detail.contains("No space left")),
325 + other => panic!("expected RsyncFailed, got {other:?}"),
326 + }
327 + }
328 +
329 + #[test]
330 + fn deploy_systemctl_failure_is_service_restart_failed() {
331 + use crate::outcome::DeployFailureKind as K;
332 + // The combined "swap + restart" step where stderr mentions systemctl.
333 + let err = "symlink swap + systemctl reload-or-restart: ssh testnot-1 failed: Failed to restart makenotwork.service: Unit makenotwork.service failed to start";
334 + match classify_deploy_error(err) {
335 + K::ServiceRestartFailed { .. } => {}
336 + other => panic!("expected ServiceRestartFailed, got {other:?}"),
337 + }
338 + }
339 +
340 + #[test]
341 + fn deploy_ln_failure_is_symlink_swap_failed() {
342 + use crate::outcome::DeployFailureKind as K;
343 + let err = "symlink swap + systemctl reload-or-restart: ssh testnot-1 failed: ln: failed to create symbolic link: Permission denied";
344 + match classify_deploy_error(err) {
345 + K::SymlinkSwapFailed { .. } => {}
346 + other => panic!("expected SymlinkSwapFailed, got {other:?}"),
347 + }
348 + }
349 +
350 + #[test]
351 + fn deploy_unknown_is_unclassified() {
352 + use crate::outcome::DeployFailureKind as K;
353 + let err = "something went wrong in a way we did not anticipate";
354 + match classify_deploy_error(err) {
355 + K::Unclassified { detail } => assert!(detail.contains("anticipate")),
356 + other => panic!("expected Unclassified, got {other:?}"),
357 + }
358 + }
359 + }
@@ -66,4 +66,19 @@ impl Config {
66 66 .with_context(|| format!("reading daemon config at {path}"))?;
67 67 Ok(toml::from_str(&raw)?)
68 68 }
69 +
70 + #[cfg(test)]
71 + pub fn for_tests() -> Self {
72 + Self {
73 + listen: "127.0.0.1:0".into(),
74 + db_path: PathBuf::from(":memory:"),
75 + topology_path: PathBuf::from("/tmp/sando-test-topology.toml"),
76 + workdir: PathBuf::from("/tmp/sando-test-workdir"),
77 + release_root: PathBuf::from("/tmp/sando-test-release-root"),
78 + scratch_db_url: None,
79 + bin_names: vec!["server".into()],
80 + logs_root: PathBuf::from("/tmp/sando-test-logs"),
81 + release_contents: Vec::new(),
82 + }
83 + }
69 84 }
@@ -38,10 +38,10 @@ const RELEASES_TO_KEEP: usize = 5;
38 38
39 39 pub async fn deploy_local(
40 40 release_root: &Path,
41 - version: &str,
41 + version: &crate::domain::Version,
42 42 binaries: &[PathBuf],
43 43 ) -> Result<PathBuf> {
44 - let release_dir = release_root.join("releases").join(version);
44 + let release_dir = release_root.join("releases").join(version.to_string());
45 45 tokio::fs::create_dir_all(&release_dir).await?;
46 46 for binary in binaries {
47 47 let name = binary.file_name()
@@ -272,7 +272,7 @@ mod tests {
272 272
273 273 let staged = deploy_local(
274 274 &release_root,
275 - "0.8.12",
275 + &"0.8.12".parse().unwrap(),
276 276 &[primary.clone(), admin.clone()],
277 277 )
278 278 .await
@@ -303,10 +303,10 @@ mod tests {
303 303 let release_root = root.join("rr");
304 304 tokio::fs::create_dir_all(&release_root).await.unwrap();
305 305
306 - deploy_local(&release_root, "0.1.0", &[bin.clone()]).await.unwrap();
306 + deploy_local(&release_root, &"0.1.0".parse().unwrap(), &[bin.clone()]).await.unwrap();
307 307 // Rewrite source then deploy 0.2.0.
308 308 tokio::fs::write(&bin, b"V2").await.unwrap();
309 - deploy_local(&release_root, "0.2.0", &[bin.clone()]).await.unwrap();
309 + deploy_local(&release_root, &"0.2.0".parse().unwrap(), &[bin.clone()]).await.unwrap();
310 310
311 311 // Both versions present on disk.
312 312 assert!(release_root.join("releases/0.1.0/server").exists());
@@ -0,0 +1,423 @@
1 + //! Domain types — the vocabulary every other module speaks.
2 + //!
3 + //! These newtypes replace string-typed fields across the daemon, schema,
4 + //! WS payloads, and TUI. Construction is the boundary parse: a `Version`
5 + //! exists because some byte sequence at the edge of the process passed
6 + //! semver validation; downstream code is freed from re-validating it.
7 + //!
8 + //! All types implement `Display`, `Serialize`, `Deserialize`, and
9 + //! `sqlx::Type<Sqlite>` (and `FromStr`, except the row ids), so they round-trip
10 + //! through events, JSON responses, and SQLite columns without per-site conversion.
11 + //!
12 + //! See `plans/observability.md` for the architecture this is the first
13 + //! step of.
14 +
15 + // Step 1 landed as pure addition; events.rs and gates.rs now consume these
16 + // types. Remove the allow once no item in this module is left unused.
17 + #![allow(dead_code)]
18 +
19 + use serde::{Deserialize, Serialize};
20 + use sqlx::Sqlite;
21 + use std::fmt;
22 + use std::str::FromStr;
23 +
24 + // ---------------------------------------------------------------------
25 + // String-backed identifiers
26 + // ---------------------------------------------------------------------
27 +
28 + /// A tier in the deploy topology (e.g. "host", "a", "b").
29 + ///
30 + /// Construction does no cross-validation against the loaded `Topology` —
31 + /// that is the responsibility of `Topology::load`, which mints the
32 + /// canonical `TierId` set. Use `TierId::new` only at boundaries (config
33 + /// load, deserialization of inbound requests).
34 + #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, sqlx::Type)]
35 + #[sqlx(transparent)]
36 + #[serde(transparent)]
37 + pub struct TierId(String);
38 +
39 + impl TierId {
40 + pub fn new(s: impl Into<String>) -> Self { Self(s.into()) }
41 + pub fn as_str(&self) -> &str { &self.0 }
42 + }
43 +
44 + impl fmt::Display for TierId {
45 + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
46 + }
47 +
48 + impl FromStr for TierId {
49 + type Err = std::convert::Infallible;
50 + fn from_str(s: &str) -> Result<Self, Self::Err> { Ok(Self(s.to_owned())) }
51 + }
52 +
53 + impl From<&str> for TierId {
54 + fn from(s: &str) -> Self { Self(s.to_owned()) }
55 + }
56 +
57 + impl From<String> for TierId {
58 + fn from(s: String) -> Self { Self(s) }
59 + }
60 +
61 + /// A node name within a tier (e.g. "alpha-west-1").
62 + #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, sqlx::Type)]
63 + #[sqlx(transparent)]
64 + #[serde(transparent)]
65 + pub struct NodeId(String);
66 +
67 + impl NodeId {
68 + pub fn new(s: impl Into<String>) -> Self { Self(s.into()) }
69 + pub fn as_str(&self) -> &str { &self.0 }
70 + }
71 +
72 + impl fmt::Display for NodeId {
73 + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
74 + }
75 +
76 + impl FromStr for NodeId {
77 + type Err = std::convert::Infallible;
78 + fn from_str(s: &str) -> Result<Self, Self::Err> { Ok(Self(s.to_owned())) }
79 + }
80 +
81 + impl From<&str> for NodeId {
82 + fn from(s: &str) -> Self { Self(s.to_owned()) }
83 + }
84 +
85 + impl From<String> for NodeId {
86 + fn from(s: String) -> Self { Self(s) }
87 + }
88 +
89 + // ---------------------------------------------------------------------
90 + // Version (semver)
91 + // ---------------------------------------------------------------------
92 +
93 + /// Server semver (e.g. `0.9.6`). Parsed once at the build step; stored
94 + /// as TEXT in the schema.
95 + #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
96 + #[serde(try_from = "String", into = "String")]
97 + pub struct Version(semver::Version);
98 +
99 + #[derive(Debug, thiserror::Error)]
100 + #[error("invalid semver `{input}`: {source}")]
101 + pub struct VersionParseError {
102 + pub input: String,
103 + #[source]
104 + pub source: semver::Error,
105 + }
106 +
107 + impl Version {
108 + pub fn parse(s: &str) -> Result<Self, VersionParseError> {
109 + semver::Version::parse(s)
110 + .map(Self)
111 + .map_err(|e| VersionParseError { input: s.to_owned(), source: e })
112 + }
113 + pub fn as_inner(&self) -> &semver::Version { &self.0 }
114 + }
115 +
116 + impl fmt::Display for Version {
117 + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
118 + }
119 +
120 + impl FromStr for Version {
121 + type Err = VersionParseError;
122 + fn from_str(s: &str) -> Result<Self, Self::Err> { Self::parse(s) }
123 + }
124 +
125 + impl TryFrom<String> for Version {
126 + type Error = VersionParseError;
127 + fn try_from(s: String) -> Result<Self, Self::Error> { Self::parse(&s) }
128 + }
129 +
130 + impl From<Version> for String {
131 + fn from(v: Version) -> Self { v.0.to_string() }
132 + }
133 +
134 + impl sqlx::Type<Sqlite> for Version {
135 + fn type_info() -> <Sqlite as sqlx::Database>::TypeInfo { <String as sqlx::Type<Sqlite>>::type_info() }
136 + fn compatible(ty: &<Sqlite as sqlx::Database>::TypeInfo) -> bool { <String as sqlx::Type<Sqlite>>::compatible(ty) }
137 + }
138 +
139 + impl<'q> sqlx::Encode<'q, Sqlite> for Version {
140 + fn encode_by_ref(
141 + &self,
142 + buf: &mut <Sqlite as sqlx::Database>::ArgumentBuffer<'q>,
143 + ) -> Result<sqlx::encode::IsNull, sqlx::error::BoxDynError> {
144 + <String as sqlx::Encode<Sqlite>>::encode(self.0.to_string(), buf)
145 + }
146 + }
147 +
148 + impl<'r> sqlx::Decode<'r, Sqlite> for Version {
149 + fn decode(
150 + value: <Sqlite as sqlx::Database>::ValueRef<'r>,
151 + ) -> Result<Self, sqlx::error::BoxDynError> {
152 + let s = <String as sqlx::Decode<Sqlite>>::decode(value)?;
153 + Ok(Version::parse(&s)?)
154 + }
155 + }
156 +
157 + // ---------------------------------------------------------------------
158 + // Git sha
159 + // ---------------------------------------------------------------------
160 +
161 + /// A git commit sha, normalized to lowercase hex. `parse` accepts any
162 + /// 7-40 hex-character form, so the stored value may be an abbreviated
163 + /// sha; resolving a short form to a unique commit happens at the call
164 + /// site, not in this type — this type only enforces shape.
165 + #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
166 + #[serde(try_from = "String", into = "String")]
167 + pub struct GitSha(String);
168 +
169 + #[derive(Debug, thiserror::Error)]
170 + pub enum GitShaParseError {
171 + #[error("git sha `{0}` is not 7-40 hex chars")]
172 + BadShape(String),
173 + }
174 +
175 + impl GitSha {
176 + pub fn parse(s: &str) -> Result<Self, GitShaParseError> {
177 + let len = s.len();
178 + let ok = (7..=40).contains(&len) && s.bytes().all(|b| b.is_ascii_hexdigit());
179 + if ok { Ok(Self(s.to_ascii_lowercase())) } else { Err(GitShaParseError::BadShape(s.to_owned())) }
180 + }
181 + pub fn as_str(&self) -> &str { &self.0 }
182 + /// Best-effort 7-char prefix for display.
183 + pub fn short(&self) -> &str { &self.0[..self.0.len().min(7)] }
184 + }
185 +
186 + impl fmt::Display for GitSha {
187 + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
188 + }
189 +
190 + impl FromStr for GitSha {
191 + type Err = GitShaParseError;
192 + fn from_str(s: &str) -> Result<Self, Self::Err> { Self::parse(s) }
193 + }
194 +
195 + impl TryFrom<String> for GitSha {
196 + type Error = GitShaParseError;
197 + fn try_from(s: String) -> Result<Self, Self::Error> { Self::parse(&s) }
198 + }
199 +
200 + impl From<GitSha> for String {
201 + fn from(g: GitSha) -> Self { g.0 }
202 + }
203 +
204 + impl sqlx::Type<Sqlite> for GitSha {
205 + fn type_info() -> <Sqlite as sqlx::Database>::TypeInfo { <String as sqlx::Type<Sqlite>>::type_info() }
206 + fn compatible(ty: &<Sqlite as sqlx::Database>::TypeInfo) -> bool { <String as sqlx::Type<Sqlite>>::compatible(ty) }
207 + }
208 +
209 + impl<'q> sqlx::Encode<'q, Sqlite> for GitSha {
210 + fn encode_by_ref(
211 + &self,
212 + buf: &mut <Sqlite as sqlx::Database>::ArgumentBuffer<'q>,
213 + ) -> Result<sqlx::encode::IsNull, sqlx::error::BoxDynError> {
214 + <String as sqlx::Encode<Sqlite>>::encode(self.0.clone(), buf)
215 + }
216 + }
217 +
218 + impl<'r> sqlx::Decode<'r, Sqlite> for GitSha {
219 + fn decode(
220 + value: <Sqlite as sqlx::Database>::ValueRef<'r>,
221 + ) -> Result<Self, sqlx::error::BoxDynError> {
222 + let s = <String as sqlx::Decode<Sqlite>>::decode(value)?;
223 + Ok(GitSha::parse(&s)?)
224 + }
225 + }
226 +
227 + // ---------------------------------------------------------------------
228 + // Gate kind
229 + // ---------------------------------------------------------------------
230 +
231 + /// The discriminant of `topology::Gate`. `Gate` carries gate parameters
232 + /// (e.g. `BurnIn { hours }`); `GateKind` is the identifier we use in
233 + /// events, schema columns, and the TUI. They were the same type before;
234 + /// splitting them is what lets a gate's parameters evolve without
235 + /// touching the wire/schema vocabulary.
236 + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
237 + #[serde(rename_all = "snake_case")]
238 + pub enum GateKind {
239 + CargoTest,
240 + MigrationDryRun,
241 + BootSmoke,
242 + BurnIn,
243 + ManualConfirm,
244 + }
245 +
246 + impl GateKind {
247 + pub fn as_str(self) -> &'static str {
248 + match self {
249 + GateKind::CargoTest => "cargo_test",
250 + GateKind::MigrationDryRun => "migration_dry_run",
251 + GateKind::BootSmoke => "boot_smoke",
252 + GateKind::BurnIn => "burn_in",
253 + GateKind::ManualConfirm => "manual_confirm",
254 + }
255 + }
256 + }
257 +
258 + #[derive(Debug, thiserror::Error)]
259 + #[error("unknown gate kind `{0}`")]
260 + pub struct GateKindParseError(pub String);
261 +
262 + impl FromStr for GateKind {
263 + type Err = GateKindParseError;
264 + fn from_str(s: &str) -> Result<Self, Self::Err> {
265 + match s {
266 + "cargo_test" => Ok(GateKind::CargoTest),
267 + "migration_dry_run" => Ok(GateKind::MigrationDryRun),
268 + "boot_smoke" => Ok(GateKind::BootSmoke),
269 + "burn_in" => Ok(GateKind::BurnIn),
270 + "manual_confirm" => Ok(GateKind::ManualConfirm),
271 + other => Err(GateKindParseError(other.to_owned())),
272 + }
273 + }
274 + }
275 +
276 + impl fmt::Display for GateKind {
277 + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(self.as_str()) }
278 + }
279 +
280 + impl sqlx::Type<Sqlite> for GateKind {
281 + fn type_info() -> <Sqlite as sqlx::Database>::TypeInfo { <String as sqlx::Type<Sqlite>>::type_info() }
282 + fn compatible(ty: &<Sqlite as sqlx::Database>::TypeInfo) -> bool { <String as sqlx::Type<Sqlite>>::compatible(ty) }
283 + }
284 +
285 + impl<'q> sqlx::Encode<'q, Sqlite> for GateKind {
286 + fn encode_by_ref(
287 + &self,
288 + buf: &mut <Sqlite as sqlx::Database>::ArgumentBuffer<'q>,
289 + ) -> Result<sqlx::encode::IsNull, sqlx::error::BoxDynError> {
290 + <String as sqlx::Encode<Sqlite>>::encode(self.as_str().to_owned(), buf)
291 + }
292 + }
293 +
294 + impl<'r> sqlx::Decode<'r, Sqlite> for GateKind {
295 + fn decode(
296 + value: <Sqlite as sqlx::Database>::ValueRef<'r>,
297 + ) -> Result<Self, sqlx::error::BoxDynError> {
298 + let s = <String as sqlx::Decode<Sqlite>>::decode(value)?;
299 + Ok(GateKind::from_str(&s)?)
300 + }
301 + }
302 +
303 + // ---------------------------------------------------------------------
304 + // Row ids
305 + // ---------------------------------------------------------------------
306 +
307 + /// Primary key of `gate_runs`. Carried through `GateStart` → `GateLogChunk`
308 + /// → `GateDone` so client-side correlation is trivial.
309 + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, sqlx::Type)]
310 + #[sqlx(transparent)]
311 + #[serde(transparent)]
312 + pub struct GateRunId(pub i64);
313 +
314 + impl fmt::Display for GateRunId {
315 + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
316 + }
317 +
318 + /// Primary key of `deploys`.
319 + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, sqlx::Type)]
320 + #[sqlx(transparent)]
321 + #[serde(transparent)]
322 + pub struct DeployId(pub i64);
323 +
324 + impl fmt::Display for DeployId {
325 + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) }
326 + }
327 +
328 + #[cfg(test)]
329 + mod tests {
330 + use super::*;
331 +
332 + #[test]
333 + fn tier_id_round_trips_through_json() {
334 + let t = TierId::new("host");
335 + let s = serde_json::to_string(&t).unwrap();
336 + assert_eq!(s, "\"host\"");
337 + let back: TierId = serde_json::from_str(&s).unwrap();
338 + assert_eq!(t, back);
339 + }
340 +
341 + #[test]
342 + fn version_parses_and_displays() {
343 + let v: Version = "0.9.6".parse().unwrap();
344 + assert_eq!(v.to_string(), "0.9.6");
345 + assert!("not-a-version".parse::<Version>().is_err());
346 + }
347 +
348 + #[test]
349 + fn version_json_is_string_form() {
350 + let v: Version = "1.2.3-rc.1".parse().unwrap();
351 + let s = serde_json::to_string(&v).unwrap();
352 + assert_eq!(s, "\"1.2.3-rc.1\"");
353 + let back: Version = serde_json::from_str(&s).unwrap();
354 + assert_eq!(v, back);
355 + }
356 +
357 + #[test]
358 + fn git_sha_accepts_short_and_full() {
359 + assert!(GitSha::parse("abc1234").is_ok());
360 + assert!(GitSha::parse("0123456789abcdef0123456789abcdef01234567").is_ok());
361 + // length out of range
362 + assert!(GitSha::parse("abc").is_err());
363 + assert!(GitSha::parse(&"a".repeat(41)).is_err());
364 + // non-hex
365 + assert!(GitSha::parse("zzzzzzz").is_err());
366 + }
367 +
368 + #[test]
369 + fn git_sha_short_truncates_safely() {
370 + let s = GitSha::parse("abc1234").unwrap();
371 + assert_eq!(s.short(), "abc1234");
372 + let long = GitSha::parse("0123456789abcdef0123456789abcdef01234567").unwrap();
373 + assert_eq!(long.short(), "0123456");
374 + }
375 +
376 + #[test]
377 + fn git_sha_normalizes_to_lowercase() {
378 + let s = GitSha::parse("ABCdef1").unwrap();
379 + assert_eq!(s.as_str(), "abcdef1");
380 + }
381 +
382 + #[test]
383 + fn gate_kind_round_trips_through_json() {
384 + // serde_json uses #[serde(rename_all = "snake_case")] — verify the
385 + // shape the TUI's `format_event` already consumes is preserved.
386 + let k = GateKind::MigrationDryRun;
387 + let s = serde_json::to_string(&k).unwrap();
388 + assert_eq!(s, "\"migration_dry_run\"");
389 + let back: GateKind = serde_json::from_str(&s).unwrap();
390 + assert_eq!(k, back);
391 + }
392 +
393 + #[test]
394 + fn gate_kind_as_str_matches_serde_form() {
395 + // The legacy `gates::kind_str` helper produced strings the TUI
396 + // matched on. Locking in that our serde form matches those exactly
397 + // so step 3 (events use the types) doesn't change the wire shape.
398 + for k in [
399 + GateKind::CargoTest,
400 + GateKind::MigrationDryRun,
401 + GateKind::BootSmoke,
402 + GateKind::BurnIn,
403 + GateKind::ManualConfirm,
404 + ] {
405 + let via_serde: String = serde_json::from_str::<String>(
406 + &serde_json::to_string(&k).unwrap(),
407 + )
408 + .unwrap();
409 + assert_eq!(via_serde, k.as_str());
410 + }
411 + }
412 +
413 + #[test]
414 + fn gate_kind_from_str_rejects_unknown() {
415 + assert!("not_a_gate".parse::<GateKind>().is_err());
416 + }
417 +
418 + #[test]
419 + fn gate_run_id_serializes_as_number() {
420 + let id = GateRunId(42);
421 + assert_eq!(serde_json::to_string(&id).unwrap(), "42");
422 + }
423 + }
@@ -5,8 +5,10 @@
5 5 //! subscribes to the bus and forwards each envelope to the connected TUI as
6 6 //! a JSON text frame.
7 7
8 + use crate::domain::{GateKind, GateRunId, GitSha, NodeId, TierId, Version};
9 + use crate::outcome::{DeployFailureKind, GateOutcome};
8 10 use chrono::{DateTime, Utc};
9 - use serde::Serialize;
11 + use serde::{Deserialize, Serialize};
10 12 use tokio::sync::broadcast;
11 13
12 14 /// Capacity of the broadcast channel. Slow subscribers that fall behind by
@@ -16,32 +18,65 @@ pub const CAPACITY: usize = 256;
16 18
17 19 pub type EventTx = broadcast::Sender<EventEnvelope>;
18 20
19 - #[derive(Clone, Debug, Serialize)]
21 + #[derive(Clone, Debug, Serialize, Deserialize)]
20 22 pub struct EventEnvelope {
21 23 pub at: DateTime<Utc>,
22 24 #[serde(flatten)]
23 25 pub event: Event,
24 26 }
25 27
26 - #[derive(Clone, Debug, Serialize)]
28 + #[derive(Clone, Debug, Serialize, Deserialize)]
27 29 #[serde(tag = "kind", rename_all = "snake_case")]
28 30 pub enum Event {
29 31 /// A /rebuild was accepted (post-receive hook or operator).
30 - RebuildRequested { sha: String },
32 + RebuildRequested { sha: GitSha },
31 33 /// A previous in-flight build was aborted because a newer /rebuild arrived.
32 - BuildAborted { sha_aborted: String },
33 - BuildStart { sha: String, version: String },
34 - BuildOk { sha: String, version: String, elapsed_s: u64 },
35 - BuildFailed { sha: String, version: String, elapsed_s: u64 },
36 - GateStart { tier: String, version: String, gate: String },
37 - GateDone { tier: String, version: String, gate: String, passed: bool },
38 - DeployStart { tier: String, node: String, version: String },
39 - DeployOk { tier: String, node: String, version: String },
40 - DeployFailed { tier: String, node: String, version: String, error: String },
41 - PromoteComplete { tier: String, version: String },
42 - Rollback { tier: String, from: String, to: String },
34 + BuildAborted { sha_aborted: GitSha },
35 + BuildStart { sha: GitSha, version: Version },
36 + BuildOk { sha: GitSha, version: Version, elapsed_s: u64 },
37 + BuildFailed { sha: GitSha, version: Version, elapsed_s: u64 },
38 + GateStart {
39 + run_id: GateRunId,
40 + tier: TierId,
41 + version: Version,
42 + gate: GateKind,
43 + },
44 + /// Chunk of combined stdout+stderr from a gate that's currently running.
45 + /// `run_id` correlates back to the `GateStart` for the same gate run; the
46 + /// TUI uses it to group chunks if it wants a per-run buffer. `seq` is a
47 + /// monotonic counter scoped to one run (resets across runs). `text` is a
48 + /// UTF-8-lossy slice of bytes — chunks reflect tokio read boundaries, not
49 + /// line boundaries; the on-disk log named by the matching `GateDone`'s
50 + /// `outcome.log_ref` is the full, byte-exact stream.
51 + GateLogChunk {
52 + run_id: GateRunId,
53 + seq: u32,
54 + text: String,
55 + },
56 + /// `passed` is a shadow field. `outcome` is the source of truth and
57 + /// carries classification, blocker variants, and the log_ref.
58 + GateDone {
59 + run_id: GateRunId,
60 + tier: TierId,
61 + version: Version,
62 + gate: GateKind,
63 + passed: bool,
64 + outcome: GateOutcome,
65 + },
66 + DeployStart { tier: TierId, node: NodeId, version: Version },
67 + DeployOk { tier: TierId, node: NodeId, version: Version },
68 + DeployFailed {
69 + tier: TierId,
70 + node: NodeId,
71 + version: Version,
72 + failure: DeployFailureKind,
73 + },
74 + PromoteComplete { tier: TierId, version: Version },
75 + Rollback { tier: TierId, from: Version, to: Version },
76 + /// `source` is an ssh URL, kept freeform on purpose — it's a transport
77 + /// detail, not a domain identifier.
43 78 BackupFetched { source: String, byte_size: i64 },
44 - ManualConfirm { tier: String, version: String },
79 + ManualConfirm { tier: TierId, version: Version },
45 80 }
46 81
47 82 pub fn channel() -> EventTx {
@@ -67,7 +102,7 @@ mod tests {
67 102 // to `.unwrap()` someday, every build/deploy site will start
68 103 // crashing.
69 104 let tx = channel();
70 - emit(&tx, Event::RebuildRequested { sha: "abc".into() });
105 + emit(&tx, Event::RebuildRequested { sha: GitSha::parse("abc1234").unwrap() });
71 106 emit(&tx, Event::BackupFetched { source: "x".into(), byte_size: 1 });
72 107 }
73 108
@@ -75,12 +110,15 @@ mod tests {
75 110 async fn emit_reaches_a_subscriber() {
76 111 let tx = channel();
77 112 let mut rx = tx.subscribe();
78 - emit(&tx, Event::PromoteComplete { tier: "a".into(), version: "0.8.12".into() });
113 + emit(&tx, Event::PromoteComplete {
114 + tier: TierId::new("a"),
115 + version: "0.8.12".parse().unwrap(),
116 + });
79 117 let env = rx.recv().await.expect("envelope");
80 118 match env.event {
81 119 Event::PromoteComplete { tier, version } => {
82 - assert_eq!(tier, "a");
83 - assert_eq!(version, "0.8.12");
120 + assert_eq!(tier.as_str(), "a");
121 + assert_eq!(version.to_string(), "0.8.12");
84 122 }
85 123 _ => panic!("wrong event kind"),
86 124 }
@@ -93,9 +131,10 @@ mod tests {
93 131 let env = EventEnvelope {
94 132 at: Utc::now(),
95 133 event: Event::GateStart {
96 - tier: "host".into(),
97 - version: "0.8.12".into(),
98 - gate: "cargo_test".into(),
134 + run_id: GateRunId(42),
135 + tier: TierId::new("host"),
136 + version: "0.8.12".parse().unwrap(),
137 + gate: GateKind::CargoTest,
99 138 },
100 139 };
101 140 let s = serde_json::to_string(&env).unwrap();
@@ -115,7 +154,9 @@ mod tests {
115 154 let tx = channel();
116 155 let mut rx = tx.subscribe();
117 156 for i in 0..(CAPACITY + 10) {
118 - emit(&tx, Event::RebuildRequested { sha: format!("{i}") });
157 + // 7+ hex chars satisfy GitSha::parse; pad i into that shape.
158 + let sha = GitSha::parse(&format!("{i:0>7x}")).unwrap();
159 + emit(&tx, Event::RebuildRequested { sha });
119 160 }
120 161 let err = rx.recv().await.expect_err("expected Lagged");
121 162 match err {
@@ -3,8 +3,12 @@
3 3 //! human-readable reason). Outcomes are persisted to `gate_runs` so /state
4 4 //! and the TUI can show them.
5 5
6 + use crate::classify;
6 7 use crate::config::Config;
8 + use crate::domain::{GateKind, GateRunId, TierId, Version};
7 9 use crate::events::{self, Event, EventTx};
10 + use crate::live_log::LiveLog;
11 + use crate::outcome::{GateBlocker, GateFailure, GateOutcome, LogRef, PassNote};
8 12 use crate::topology::Gate;
9 13 use anyhow::Result;
10 14 use chrono::Utc;
@@ -17,22 +21,16 @@ use tokio::process::Command;
17 21 pub struct GateCtx {
18 22 pub pool: SqlitePool,
19 23 pub cfg: Arc<Config>,
20 - pub tier: String,
21 - pub version: String,
24 + pub tier: TierId,
25 + pub version: Version,
22 26 pub worktree: PathBuf,
23 27 pub events: EventTx,
24 28 }
25 29
26 - #[derive(Debug, Clone)]
27 - pub struct GateOutcome {
28 - pub passed: bool,
29 - pub detail: Option<String>,
30 - }
31 -
32 30 /// Run a single gate end-to-end: insert the in-flight row, execute the gate,
33 31 /// update the row with the outcome. Returns the outcome for the caller.
34 32 pub async fn run(ctx: &GateCtx, gate: &Gate) -> Result<GateOutcome> {
35 - let kind = kind_str(gate);
33 + let kind = gate.kind();
36 34 let started_at = Utc::now().to_rfc3339();
37 35
38 36 let id: i64 = sqlx::query_scalar(
@@ -45,46 +43,63 @@ pub async fn run(ctx: &GateCtx, gate: &Gate) -> Result<GateOutcome> {
45 43 .bind(&started_at)
46 44 .fetch_one(&ctx.pool)
47 45 .await?;
46 + let run_id = GateRunId(id);
48 47
49 - tracing::info!(tier = %ctx.tier, version = %ctx.version, gate = kind, "gate start");
48 + tracing::info!(
49 + run_id = %run_id, tier = %ctx.tier, version = %ctx.version, gate = %kind,
50 + "gate start",
51 + );
50 52 events::emit(&ctx.events, Event::GateStart {
53 + run_id,
51 54 tier: ctx.tier.clone(),
52 55 version: ctx.version.clone(),
53 - gate: kind.into(),
56 + gate: kind,
54 57 });
55 58
56 59 let outcome = match gate {
57 - Gate::CargoTest => cargo_test(ctx).await,
60 + Gate::CargoTest => cargo_test(ctx, run_id).await,
58 61 Gate::MigrationDryRun => migration_dry_run(ctx).await,
59 - Gate::BootSmoke => boot_smoke(ctx).await,
62 + Gate::BootSmoke => boot_smoke(ctx, run_id).await,
60 63 Gate::BurnIn { hours } => burn_in(ctx, *hours).await,
61 64 Gate::ManualConfirm => manual_confirm(ctx).await,
62 65 };
63 66
64 - let outcome = outcome.unwrap_or_else(|e| GateOutcome {
65 - passed: false,
66 - detail: Some(format!("gate runner errored: {e}")),
67 - });
67 + let outcome = outcome.unwrap_or_else(|e| GateOutcome::failed(GateFailure::Unclassified {
68 + legacy_detail: Some(format!("gate runner errored: {e}")),
69 + }));
68 70
71 + // Migration 003 added status/outcome_json/log_ref. Until migration 004 drops
72 + // the legacy `passed`/`detail` columns we double-write them for old readers.
73 + // The serialize-failure fallback is built with `json!` so it stays valid JSON.
74 + let outcome_json = serde_json::to_string(&outcome)
75 + .unwrap_or_else(|e| serde_json::json!({ "_serialize_error": e.to_string() }).to_string());
69 76 sqlx::query(
70 - "UPDATE gate_runs SET finished_at = ?, passed = ?, detail = ? WHERE id = ?",
77 + "UPDATE gate_runs
78 + SET finished_at = ?, passed = ?, detail = ?,
79 + status = ?, outcome_json = ?, log_ref = ?
80 + WHERE id = ?",
71 81 )
72 82 .bind(Utc::now().to_rfc3339())
73 - .bind(outcome.passed as i64)
74 - .bind(outcome.detail.as_deref())
83 + .bind(outcome.legacy_passed() as i64)
84 + .bind(outcome.legacy_detail())
85 + .bind(outcome.status_str())
86 + .bind(&outcome_json)
87 + .bind(outcome.log_ref.as_ref().map(|l| l.as_str()))
75 88 .bind(id)
76 89 .execute(&ctx.pool)
77 90 .await?;
78 91
79 92 tracing::info!(
80 - tier = %ctx.tier, version = %ctx.version, gate = kind,
81 - passed = outcome.passed, "gate done",
93 + tier = %ctx.tier, version = %ctx.version, gate = %kind,
94 + status = outcome.status_str(), "gate done",
82 95 );
83 96 events::emit(&ctx.events, Event::GateDone {
97 + run_id,
84 98 tier: ctx.tier.clone(),
85 99 version: ctx.version.clone(),
86 - gate: kind.into(),
87 - passed: outcome.passed,
100 + gate: kind,
101 + passed: outcome.legacy_passed(),
102 + outcome: outcome.clone(),
88 103 });
89 104
90 105 Ok(outcome)
@@ -99,26 +114,16 @@ pub async fn run_all(ctx: &GateCtx, gates: &[Gate]) -> Result<bool> {
99 114 let mut all_ok = true;
100 115 for g in gates {
101 116 let o = run(ctx, g).await?;
102 - if !o.passed {
117 + if !o.legacy_passed() {
103 118 all_ok = false;
104 119 }
105 120 }
106 121 Ok(all_ok)
107 122 }
108 123
109 - fn kind_str(g: &Gate) -> &'static str {
110 - match g {
111 - Gate::CargoTest => "cargo_test",
112 - Gate::MigrationDryRun => "migration_dry_run",
113 - Gate::BootSmoke => "boot_smoke",
114 - Gate::BurnIn { .. } => "burn_in",
115 - Gate::ManualConfirm => "manual_confirm",
116 - }
117 - }
118 -
119 124 // ---- individual gate runners ----
120 125
121 - async fn cargo_test(ctx: &GateCtx) -> Result<GateOutcome> {
126 + async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> {
122 127 let server_dir = ctx.worktree.join("server");
123 128 let mut cmd = Command::new("cargo");
124 129 // Match CI (`server/deploy/run-ci.sh`): `--features fast-tests` relaxes
@@ -128,6 +133,8 @@ async fn cargo_test(ctx: &GateCtx) -> Result<GateOutcome> {
128 133 // this in `server/src/constants.rs:87`.
129 134 cmd.args(["test", "--release", "--features", "fast-tests"])
130 135 .current_dir(&server_dir)
136 + .stdout(std::process::Stdio::piped())
137 + .stderr(std::process::Stdio::piped())
131 138 .kill_on_drop(true);
132 139 // Same online-mode rationale as the build step: sqlx query macros need a
133 140 // live DB to type-check against. The scratch DB is left in migrated state
@@ -144,27 +151,42 @@ async fn cargo_test(ctx: &GateCtx) -> Result<GateOutcome> {
144 151 .unwrap_or(scratch_url);
145 152 cmd.env("TEST_DATABASE_URL", test_url);
146 153 }
147 - let out = cmd.output().await?;
148 - persist_gate_log(ctx, "cargo_test", &out.stdout, &out.stderr).await;
149 - Ok(GateOutcome {
150 - passed: out.status.success(),
151 - detail: Some(combined_tail(&out.stdout, &out.stderr, 4_000)),
152 - })
154 + let started = std::time::Instant::now();
155 + let log_path = gate_log_path(ctx, GateKind::CargoTest);
156 + let log_ref = LogRef::new(&ctx.version, GateKind::CargoTest);
157 + let mut child = match cmd.spawn() {
158 + Ok(c) => c,
159 + Err(e) => {
160 + return Ok(GateOutcome::failed(GateFailure::SpawnFailed {
161 + message: e.to_string(),
162 + }).with_log_ref(log_ref));
163 + }
164 + };
165 + let (stdout_buf, stderr_buf, status) =
166 + stream_child_to_live_log(&mut child, ctx.events.clone(), run_id, log_path).await?;
167 + let duration_s = started.elapsed().as_secs() as u32;
168 + if status.success() {
169 + Ok(GateOutcome::passed(PassNote::TestsPassed { duration_s }).with_log_ref(log_ref))
170 + } else {
171 + let failure = classify::classify_cargo_test(&stdout_buf, &stderr_buf);
172 + Ok(GateOutcome::failed(failure).with_log_ref(log_ref))
173 + }
153 174 }
154 175
155 176 async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> {
156 177 let mut log_buf: Vec<u8> = Vec::new();
178 + let log_ref = LogRef::new(&ctx.version, GateKind::MigrationDryRun);
157 179 let finish = |outcome: GateOutcome, buf: Vec<u8>| async move {
158 - persist_gate_log(ctx, "migration_dry_run", &buf, &[]).await;
180 + persist_gate_log(ctx, GateKind::MigrationDryRun, &buf, &[]).await;
159 181 outcome
160 182 };
161 183
162 184 let Some(db_url) = ctx.cfg.scratch_db_url.as_deref() else {
163 185 log_buf.extend_from_slice(b"scratch_db_url unset in daemon config\n");
164 - return Ok(finish(GateOutcome {
165 - passed: false,
166 - detail: Some("scratch_db_url unset in daemon config".into()),
167 - }, log_buf).await);
186 + return Ok(finish(
187 + GateOutcome::blocked(GateBlocker::ScratchDbUrlUnset).with_log_ref(log_ref),
188 + log_buf,
189 + ).await);
168 190 };
169 191
170 192 let backup: Option<(String,)> = sqlx::query_as(
@@ -174,23 +196,29 @@ async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> {
174 196 .await?;
175 197 let Some((backup_path,)) = backup else {
176 198 log_buf.extend_from_slice(b"no backup fetched; call /backup/fetch first\n");
177 - return Ok(finish(GateOutcome {
178 - passed: false,
179 - detail: Some("no backup fetched; call /backup/fetch first".into()),
180 - }, log_buf).await);
199 + return Ok(finish(
200 + GateOutcome::blocked(GateBlocker::NoBackupAvailable).with_log_ref(log_ref),
201 + log_buf,
202 + ).await);
181 203 };
182 204
183 - log_buf.extend_from_slice(format!("---- reset_scratch ----\n").as_bytes());
205 + log_buf.extend_from_slice(b"---- reset_scratch ----\n");
184 206 if let Err(e) = reset_scratch(db_url).await {
185 207 let msg = format!("scratch reset: {e}");
186 208 log_buf.extend_from_slice(msg.as_bytes());
187 - return Ok(finish(GateOutcome { passed: false, detail: Some(msg) }, log_buf).await);
209 + return Ok(finish(
210 + GateOutcome::failed(GateFailure::RestoreFailed { reason: msg }).with_log_ref(log_ref),
211 + log_buf,
212 + ).await);
188 213 }
189 214 log_buf.extend_from_slice(format!("---- restore_dump ({backup_path}) ----\n").as_bytes());
190 215 if let Err(e) = restore_dump(db_url, &backup_path, &mut log_buf).await {
191 216 let msg = format!("restore: {e}");
192 217 log_buf.extend_from_slice(msg.as_bytes());
193 - return Ok(finish(GateOutcome { passed: false, detail: Some(msg) }, log_buf).await);
218 + return Ok(finish(
219 + GateOutcome::failed(GateFailure::RestoreFailed { reason: msg }).with_log_ref(log_ref),
220 + log_buf,
221 + ).await);
194 222 }
195 223
196 224 let migrations_dir = ctx.worktree.join("server").join("migrations");
@@ -199,12 +227,20 @@ async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> {
199 227 Ok(()) => {
200 228 let detail = format!("restored {backup_path} + migrated");
201 229 log_buf.extend_from_slice(detail.as_bytes());
202 - Ok(finish(GateOutcome { passed: true, detail: Some(detail) }, log_buf).await)
230 + Ok(finish(
231 + GateOutcome::passed(PassNote::Migrated { backup_path: backup_path.clone() })
232 + .with_log_ref(log_ref),
233 + log_buf,
234 + ).await)
203 235 }
204 236 Err(e) => {
205 237 let err_s = e.to_string();
206 238 log_buf.extend_from_slice(err_s.as_bytes());
207 - Ok(finish(GateOutcome { passed: false, detail: Some(tail(err_s.as_bytes(), 4_000)) }, log_buf).await)
239 + let failure = classify::classify_migration_error(&err_s, None);
240 + Ok(finish(
241 + GateOutcome::failed(failure).with_log_ref(log_ref),
242 + log_buf,
243 + ).await)
208 244 }
209 245 }
210 246 }
@@ -271,7 +307,7 @@ fn shell_escape(s: &str) -> String {
271 307 format!("'{}'", s.replace('\'', "'\\''"))
272 308 }
273 309
274 - async fn boot_smoke(ctx: &GateCtx) -> Result<GateOutcome> {
310 + async fn boot_smoke(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> {
275 311 let bin: Option<(String,)> = sqlx::query_as(
276 312 "SELECT artifact_path FROM versions WHERE version = ?",
277 313 )
@@ -279,7 +315,9 @@ async fn boot_smoke(ctx: &GateCtx) -> Result<GateOutcome> {
279 315 .fetch_optional(&ctx.pool)
280 316 .await?;
281 317 let Some((bin,)) = bin else {
282 - return Ok(GateOutcome { passed: false, detail: Some("no artifact for version".into()) });
318 + return Ok(GateOutcome::blocked(GateBlocker::ArtifactMissing {
319 + version: ctx.version.clone(),
320 + }));
283 321 };
284 322
285 323 // Lowest-bar smoke: start the binary and verify it stays up for a few
@@ -302,34 +340,32 @@ async fn boot_smoke(ctx: &GateCtx) -> Result<GateOutcome> {
302 340 if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() {
303 341 cmd.env("DATABASE_URL", scratch_url);
304 342 }
343 + let log_path = gate_log_path(ctx, GateKind::BootSmoke);
344 + let log_ref = LogRef::new(&ctx.version, GateKind::BootSmoke);
305 345 let mut child = match cmd.spawn() {
306 346 Ok(c) => c,
307 347 Err(e) => {
308 - persist_gate_log(ctx, "boot_smoke", format!("spawn: {e}").as_bytes(), &[]).await;
309 - return Ok(GateOutcome { passed: false, detail: Some(format!("spawn: {e}")) });
348 + // Spawn failures get a one-off log line via LiveLog so the
349 + // on-disk file still exists for `GET /logs/...`.
350 + let mut log = LiveLog::open(ctx.events.clone(), run_id, log_path).await;
351 + log.write_chunk(format!("spawn: {e}\n").as_bytes()).await;
352 + log.close().await;
353 + return Ok(GateOutcome::failed(GateFailure::SpawnFailed {
354 + message: e.to_string(),
355 + }).with_log_ref(log_ref));
310 356 }
311 357 };
312 358
313 - // Drain stdout/stderr concurrently into in-memory buffers so the log
314 - // captures whatever the binary printed before exiting (or being killed
315 - // after the 3s smoke window). Without piping + draining, output goes to
316 - // sandod's own stdio and is lost.
317 - let mut stdout_h = child.stdout.take();
318 - let mut stderr_h = child.stderr.take();
319 - let stdout_task = tokio::spawn(async move {
320 - let mut buf = Vec::new();
321 - if let Some(s) = stdout_h.as_mut() {
322 - let _ = s.read_to_end(&mut buf).await;
323 - }
324 - buf
325 - });
326 - let stderr_task = tokio::spawn(async move {
327 - let mut buf = Vec::new();
328 - if let Some(s) = stderr_h.as_mut() {
329 - let _ = s.read_to_end(&mut buf).await;
330 - }
331 - buf
332 - });
359 + // The boot smoke window is 3s. Drain stdout/stderr concurrently through
360 + // a shared LiveLog sink so the operator sees panics/log lines stream in
361 + // real time before the kill, AND the on-disk log gets the full byte
362 + // stream for post-mortem reads. The drainers exit when their pipe
363 + // closes — which happens when the child exits naturally or after kill.
364 + let log = std::sync::Arc::new(tokio::sync::Mutex::new(
365 + LiveLog::open(ctx.events.clone(), run_id, log_path).await,
366 + ));
367 + let stdout_task = tokio::spawn(stream_into_log(child.stdout.take(), log.clone()));
368 + let stderr_task = tokio::spawn(stream_into_log(child.stderr.take(), log.clone()));
333 369
334 370 tokio::time::sleep(std::time::Duration::from_secs(3)).await;
335 371
@@ -337,22 +373,79 @@ async fn boot_smoke(ctx: &GateCtx) -> Result<GateOutcome> {
337 373 if exit.is_none() {
338 374 let _ = child.kill().await;
339 375 }
340 - // Both read tasks complete once the child's stdio is closed (kill closes
341 - // the pipes via the dropped Child on exit-after-kill).
342 - let stdout_buf = stdout_task.await.unwrap_or_default();
343 - let stderr_buf = stderr_task.await.unwrap_or_default();
344 - persist_gate_log(ctx, "boot_smoke", &stdout_buf, &stderr_buf).await;
376 + // The boot_smoke classifier looks at exit code only — the streamed
377 + // bytes already landed in the live log and the on-disk file for the
378 + // post-mortem reader. Await both drainers so they finish before close.
379 + let _ = stdout_task.await;
380 + let _ = stderr_task.await;
381 + // Unique owner of the Arc at this point (both tasks dropped their clones).
382 + if let Ok(mutex) = std::sync::Arc::try_unwrap(log) {
383 + mutex.into_inner().close().await;
384 + }
345 385
346 386 match exit {
347 - Some(status) => Ok(GateOutcome {
348 - passed: false,
349 - detail: Some(format!(
350 - "binary exited early: {status}\n{}",
351 - combined_tail(&stdout_buf, &stderr_buf, 4_000),
352 - )),
353 - }),
354 - None => Ok(GateOutcome { passed: true, detail: Some("stayed up for 3s".into()) }),
387 + Some(status) => {
388 + let failure = classify::classify_boot_smoke(status.code());
389 + Ok(GateOutcome::failed(failure).with_log_ref(log_ref))
390 + }
391 + None => Ok(GateOutcome::passed(PassNote::StayedUp { duration_s: 3 })
392 + .with_log_ref(log_ref)),
393 + }
394 + }
395 +
396 + /// Drain `stream` into the shared `LiveLog` (which forwards each chunk to
397 + /// the on-disk log file AND broadcasts a `GateLogChunk` event), and return
398 + /// the concatenated bytes so the classifier can still operate on the full
399 + /// output post-hoc.
400 + async fn stream_into_log<R>(
401 + stream: Option<R>,
402 + log: std::sync::Arc<tokio::sync::Mutex<LiveLog>>,
403 + ) -> Vec<u8>
404 + where
405 + R: tokio::io::AsyncRead + Unpin + Send + 'static,
406 + {
407 + let mut total = Vec::new();
408 + let Some(mut s) = stream else { return total };
409 + let mut buf = [0u8; 4096];
410 + loop {
411 + match s.read(&mut buf).await {
412 + Ok(0) => break,
413 + Err(_) => break,
414 + Ok(n) => {
415 + total.extend_from_slice(&buf[..n]);
416 + log.lock().await.write_chunk(&buf[..n]).await;
417 + }
418 + }
355 419 }
420 + total
421 + }
422 +
423 + /// Drain an already-spawned child's stdout/stderr through a `LiveLog`, wait
424 + /// for it to exit, and return the separate stdout/stderr buffers plus the
425 + /// exit status. Shared by `cargo_test` (no deadline) and ad-hoc callers —
426 + /// `boot_smoke` rolls its own variant because of its 3s kill window.
427 + async fn stream_child_to_live_log(
428 + child: &mut tokio::process::Child,
429 + events: EventTx,
430 + run_id: GateRunId,
431 + log_path: PathBuf,
432 + ) -> Result<(Vec<u8>, Vec<u8>, std::process::ExitStatus)> {
433 + let log = std::sync::Arc::new(tokio::sync::Mutex::new(
434 + LiveLog::open(events, run_id, log_path).await,
435 + ));
436 + let stdout_task = tokio::spawn(stream_into_log(child.stdout.take(), log.clone()));
437 + let stderr_task = tokio::spawn(stream_into_log(child.stderr.take(), log.clone()));
438 + let status = child.wait().await?;
439 + let stdout_buf = stdout_task.await.unwrap_or_default();
440 + let stderr_buf = stderr_task.await.unwrap_or_default();
441 + if let Ok(mutex) = std::sync::Arc::try_unwrap(log) {
442 + mutex.into_inner().close().await;
443 + }
444 + Ok((stdout_buf, stderr_buf, status))
445 + }
446 +
447 + fn gate_log_path(ctx: &GateCtx, gate: GateKind) -> PathBuf {
448 + ctx.cfg.logs_root.join(ctx.version.to_string()).join(format!("{}.log", gate.as_str()))
356 449 }
357 450
358 451 async fn burn_in(ctx: &GateCtx, hours: u32) -> Result<GateOutcome> {
@@ -367,19 +460,19 @@ async fn burn_in(ctx: &GateCtx, hours: u32) -> Result<GateOutcome> {
367 460 .await?
368 461 .flatten();
369 462 let Some(started) = started else {
370 - return Ok(GateOutcome { passed: false, detail: Some("burn-in clock not started".into()) });
463 + return Ok(GateOutcome::blocked(GateBlocker::BurnInClockNotStarted));
371 464 };
372 465 let started = chrono::DateTime::parse_from_rfc3339(&started)?.with_timezone(&Utc);
373 466 let elapsed = Utc::now() - started;
374 467 let needed = chrono::Duration::hours(hours as i64);
375 468 if elapsed >= needed {
376 - Ok(GateOutcome { passed: true, detail: Some(format!("{} hours elapsed", elapsed.num_hours())) })
469 + Ok(GateOutcome::passed(PassNote::BurnInElapsed { hours: elapsed.num_hours() as u32 }))
377 470 } else {
378 - let remaining = needed - elapsed;
379 - Ok(GateOutcome {
380 - passed: false,
381 - detail: Some(format!("{} hours remaining of {hours}", remaining.num_hours())),
382 - })
471 + let remaining = (needed - elapsed).num_hours().max(0) as u32;
472 + Ok(GateOutcome::blocked(GateBlocker::BurnInRemaining {
473 + hours_remaining: remaining,
474 + hours_total: hours,
475 + }))
383 476 }
384 477 }
385 478
@@ -387,40 +480,24 @@ async fn manual_confirm(ctx: &GateCtx) -> Result<GateOutcome> {
387 480 // Pass iff a row in gate_runs exists with passed=1 for this (tier, version, manual_confirm)
388 481 // that was inserted out-of-band by an operator action. Since the harness inserts the
389 482 // in-flight row itself, look for a prior confirmation row.
390 - let prior: Option<i64> = sqlx::query_scalar(
391 - "SELECT COUNT(*) FROM gate_runs
392 - WHERE tier = ? AND version = ? AND gate_kind = 'manual_confirm' AND passed = 1",
483 + let prior_at: Option<String> = sqlx::query_scalar(
484 + "SELECT finished_at FROM gate_runs
485 + WHERE tier = ? AND version = ? AND gate_kind = 'manual_confirm' AND passed = 1
486 + ORDER BY id DESC LIMIT 1",
393 487 )
394 488 .bind(&ctx.tier)
395 489 .bind(&ctx.version)
396 490 .fetch_optional(&ctx.pool)
397 491 .await?;
398 - let passed = prior.unwrap_or(0) > 0;
399 - Ok(GateOutcome {
400 - passed,
401 - detail: if passed { None } else { Some("waiting on operator confirmation".into()) },
402 - })
403 - }
404 -
405 - fn tail(buf: &[u8], max: usize) -> String {
406 - let s = String::from_utf8_lossy(buf);
407 - if s.len() <= max { s.into_owned() } else { format!("...{}", &s[s.len() - max..]) }
408 - }
409 -
410 - /// Tail combining stdout + stderr. Phase A doesn't preserve true interleaving
411 - /// (the streams were captured separately); we concatenate stderr after stdout
412 - /// so the failing test output (cargo writes it to stdout) is no longer hidden
413 - /// behind the 4KB stderr tail. Full unsliced output lives in the on-disk log.
414 - fn combined_tail(stdout: &[u8], stderr: &[u8], max: usize) -> String {
415 - let mut joined = Vec::with_capacity(stdout.len() + stderr.len() + 32);
416 - joined.extend_from_slice(b"==== stdout ====\n");
417 - joined.extend_from_slice(stdout);
418 - if !stdout.last().is_some_and(|b| *b == b'\n') {
419 - joined.push(b'\n');
492 + match prior_at {
493 + Some(at_str) => {
494 + let at = chrono::DateTime::parse_from_rfc3339(&at_str)
495 + .map(|d| d.with_timezone(&Utc))
496 + .unwrap_or_else(|_| Utc::now());
497 + Ok(GateOutcome::passed(PassNote::OperatorConfirmed { at }))
498 + }
499 + None => Ok(GateOutcome::blocked(GateBlocker::AwaitingOperatorConfirmation)),
420 500 }
421 - joined.extend_from_slice(b"==== stderr ====\n");
422 - joined.extend_from_slice(stderr);
Lines truncated
@@ -0,0 +1,27 @@
1 + //! sando-daemon as a library.
2 + //!
3 + //! Exposes every module so the `sandod` binary (in `src/main.rs`) and
4 + //! the `sando` TUI (in `../tui`) can share wire-facing types — events,
5 + //! outcomes, domain newtypes — by import rather than duplication.
6 + //!
7 + //! External consumers should only need `domain`, `outcome`, and `events`;
8 + //! the rest are exposed because the bin re-uses them via this crate.
9 +
10 + pub mod backup;
11 + pub mod build;
12 + pub mod classify;
13 + pub mod config;
14 + pub mod db;
15 + pub mod deploy;
16 + pub mod domain;
17 + pub mod error;
18 + pub mod events;
19 + pub mod gates;
20 + pub mod git;
21 + pub mod live_log;
22 + pub mod metrics;
23 + pub mod outcome;
24 + pub mod routes;
25 + pub mod state;
26 + pub mod sync;
27 + pub mod topology;
@@ -0,0 +1,160 @@
1 + //! Live gate-log sink.
2 + //!
3 + //! Wraps the on-disk per-run log file with chunk broadcasting. Gate
4 + //! runners (`cargo_test`, `boot_smoke`) push their stdout/stderr through
5 + //! `LiveLog::write_chunk` as it arrives; the sink:
6 + //! 1. appends to the on-disk file at `<logs_root>/<version>/<gate>.log`
7 + //! so the post-mortem `GET /logs/...` route still has the full byte
8 + //! stream;
9 + //! 2. broadcasts a `GateLogChunk` event with a UTF-8-lossy slice of the
10 + //! same bytes, so the TUI sees the tail in real time.
11 + //!
12 + //! Chunks reflect tokio read boundaries — they are NOT line-aligned, and a
13 + //! UTF-8 character split across chunks is broadcast as U+FFFD. Consumers that
14 + //! want lines must reassemble; the on-disk log preserves the exact bytes.
15 +
16 + use crate::domain::GateRunId;
17 + use crate::events::{self, Event, EventTx};
18 + use std::path::{Path, PathBuf};
19 + use tokio::fs::File;
20 + use tokio::io::AsyncWriteExt;
21 +
22 + pub struct LiveLog {
23 + file: Option<File>,
24 + /// Kept for diagnostic logging when file IO is unavailable.
25 + path: PathBuf,
26 + events: EventTx,
27 + run_id: GateRunId,
28 + seq: u32,
29 + }
30 +
31 + impl LiveLog {
32 + /// Open the log file for append-streaming. Creates parent directories
33 + /// as needed. If the file can't be opened, the sink degrades to
34 + /// "broadcast only" — chunks still go out as `GateLogChunk` events;
35 + /// the missing on-disk log is logged as a warning. This matches the
36 + /// pre-step-6 invariant: a broken log dir doesn't turn a passing gate
37 + /// red.
38 + pub async fn open(events: EventTx, run_id: GateRunId, path: PathBuf) -> Self {
39 + let file = open_for_append(&path).await;
40 + Self { file, path, events, run_id, seq: 0 }
41 + }
42 +
43 + /// Append `bytes` to the on-disk log and broadcast a `GateLogChunk`.
44 + /// The broadcast goes out even if the disk write fails — operators
45 + /// watching live still see the chunk.
46 + pub async fn write_chunk(&mut self, bytes: &[u8]) {
47 + if bytes.is_empty() { return; }
48 + if let Some(f) = self.file.as_mut() {
49 + if let Err(e) = f.write_all(bytes).await {
50 + tracing::warn!(error = %e, path = %self.path.display(), "live log write failed");
51 + self.file = None;
52 + }
53 + }
54 + let text = String::from_utf8_lossy(bytes).into_owned();
55 + events::emit(&self.events, Event::GateLogChunk {
56 + run_id: self.run_id,
57 + seq: self.seq,
58 + text,
59 + });
60 + self.seq = self.seq.saturating_add(1);
61 + }
62 +
63 + /// Flush and close the file. Best-effort: errors are logged.
64 + pub async fn close(mut self) {
65 + if let Some(mut f) = self.file.take() {
66 + if let Err(e) = f.flush().await {
67 + tracing::warn!(error = %e, path = %self.path.display(), "live log flush failed");
68 + }
69 + }
70 + }
71 +
72 + pub fn run_id(&self) -> GateRunId { self.run_id }
73 + pub fn chunks_emitted(&self) -> u32 { self.seq }
74 + }
75 +
76 + async fn open_for_append(path: &Path) -> Option<File> {
77 + if let Some(parent) = path.parent() {
78 + if let Err(e) = tokio::fs::create_dir_all(parent).await {
79 + tracing::warn!(error = %e, dir = %parent.display(), "could not create gate log dir");
80 + return None;
81 + }
82 + }
83 + match tokio::fs::OpenOptions::new()
84 + .create(true)
85 + .append(true)
86 + .open(path)
87 + .await
88 + {
89 + Ok(f) => Some(f),
90 + Err(e) => {
91 + tracing::warn!(error = %e, path = %path.display(), "could not open gate log file");
92 + None
93 + }
94 + }
95 + }
96 +
97 + #[cfg(test)]
98 + mod tests {
99 + use super::*;
100 + use crate::events::EventEnvelope;
101 +
102 + #[tokio::test]
103 + async fn write_chunk_emits_event_and_appends_to_file() {
104 + let dir = tempfile::tempdir().unwrap();
105 + let path = dir.path().join("nested/test.log");
106 + let events = events::channel();
107 + let mut rx = events.subscribe();
108 + let mut log = LiveLog::open(events.clone(), GateRunId(7), path.clone()).await;
109 + log.write_chunk(b"hello ").await;
110 + log.write_chunk(b"world\n").await;
111 + log.close().await;
112 +
113 + let on_disk = tokio::fs::read_to_string(&path).await.unwrap();
114 + assert_eq!(on_disk, "hello world\n");
115 +
116 + let mut chunks: Vec<String> = Vec::new();
117 + while let Ok(env) = rx.try_recv() {
118 + if let Event::GateLogChunk { run_id, seq, text } = env.event {
119 + assert_eq!(run_id, GateRunId(7));
120 + assert_eq!(seq as usize, chunks.len());
121 + chunks.push(text);
122 + }
123 + }
124 + assert_eq!(chunks, vec!["hello ".to_string(), "world\n".to_string()]);
125 + }
126 +
127 + #[tokio::test]
128 + async fn write_chunk_emits_even_when_file_cannot_be_opened() {
129 + // Point at a path whose parent is a regular file — create_dir_all
130 + // will fail. The broadcast must still fire.
131 + let dir = tempfile::tempdir().unwrap();
132 + let blocker = dir.path().join("blocker");
133 + tokio::fs::write(&blocker, b"i am a file, not a dir").await.unwrap();
134 + let path = blocker.join("inside.log"); // parent is a file
135 + let events = events::channel();
136 + let mut rx = events.subscribe();
137 + let mut log = LiveLog::open(events.clone(), GateRunId(1), path).await;
138 + log.write_chunk(b"streamed despite no file\n").await;
139 + log.close().await;
140 +
141 + let env: EventEnvelope = rx.try_recv().unwrap();
142 + match env.event {
143 + Event::GateLogChunk { text, .. } => assert_eq!(text, "streamed despite no file\n"),
144 + _ => panic!("expected GateLogChunk"),
145 + }
146 + }
147 +
148 + #[tokio::test]
149 + async fn empty_chunk_is_noop() {
150 + let dir = tempfile::tempdir().unwrap();
151 + let path = dir.path().join("empty.log");
152 + let events = events::channel();
153 + let mut rx = events.subscribe();
154 + let mut log = LiveLog::open(events.clone(), GateRunId(9), path).await;
155 + log.write_chunk(b"").await;
156 + assert_eq!(log.chunks_emitted(), 0);
157 + assert!(rx.try_recv().is_err());
158 + log.close().await;
159 + }
160 + }
@@ -1,23 +1,9 @@
1 1 use anyhow::Result;
2 + use sando_daemon::{config, db, events, git, metrics, routes, state, sync, topology};
2 3 use std::net::SocketAddr;
3 4 use std::path::Path;
4 5 use std::sync::Arc;
5 6
6 - mod backup;
7 - mod build;
8 - mod config;
9 - mod db;
10 - mod deploy;
11 - mod error;
12 - mod events;
13 - mod gates;
14 - mod git;
15 - mod metrics;
16 - mod routes;
17 - mod state;
18 - mod sync;
19 - mod topology;
20 -
21 7 #[tokio::main]
22 8 async fn main() -> Result<()> {
23 9 tracing_subscriber::fmt()
@@ -27,9 +13,10 @@ async fn main() -> Result<()> {
27 13 .with_writer(std::io::stderr)
28 14 .with_env_filter(
29 15 tracing_subscriber::EnvFilter::try_from_default_env()
30 - // bin target name is `sandod`, NOT the package name `sando-daemon` —
31 - // `module_path!()` uses the binary's crate name, so events come from `sandod::*`.
32 - .unwrap_or_else(|_| "sandod=info,tower_http=info".into()),
16 + // Modules live under the library crate `sando_daemon` (since
17 + // the step-5 lib/bin split). `sandod` is kept for any
18 + // top-level events that originate in main.rs itself.
19 + .unwrap_or_else(|_| "sando_daemon=info,sandod=info,tower_http=info".into()),
33 20 )
34 21 .init();
35 22
@@ -0,0 +1,386 @@
1 + //! Typed gate outcomes.
2 + //!
3 + //! Replaces the `(passed: bool, detail: Option<String>)` pair on
4 + //! `GateOutcome`. The point is to push failure classification into the
5 + //! type itself: a `GateFailure::MigrationDrift { migration }` is what it
6 + //! says, not a string the operator has to parse. See
7 + //! `plans/observability.md` for the full argument.
8 + //!
9 + //! The variants here describe what the gate runner actually observed.
10 + //! Mapping raw process output (stderr tails, exit codes) to these
11 + //! variants is the classifier's job — `classify.rs`.
12 +
13 + use crate::domain::{GateKind, Version};
14 + use chrono::{DateTime, Utc};
15 + use serde::{Deserialize, Serialize};
16 +
17 + /// A gate's result, persisted to `gate_runs.outcome_json` and emitted
18 + /// over WS in `GateDone`.
19 + #[derive(Debug, Clone, Serialize, Deserialize)]
20 + pub struct GateOutcome {
21 + pub status: GateStatus,
22 + /// Relative path under `cfg.logs_root` to the persisted stdout/stderr
23 + /// for this run. `None` for gates that don't produce process output
24 + /// (burn_in, manual_confirm).
25 + #[serde(skip_serializing_if = "Option::is_none", default)]
26 + pub log_ref: Option<LogRef>,
27 + }
28 +
29 + impl GateOutcome {
30 + pub fn passed(note: PassNote) -> Self {
31 + Self { status: GateStatus::Passed { note }, log_ref: None }
32 + }
33 + pub fn failed(failure: GateFailure) -> Self {
34 + Self { status: GateStatus::Failed { failure }, log_ref: None }
35 + }
36 + pub fn blocked(blocker: GateBlocker) -> Self {
37 + Self { status: GateStatus::Blocked { blocker }, log_ref: None }
38 + }
39 + pub fn with_log_ref(mut self, log_ref: LogRef) -> Self {
40 + self.log_ref = Some(log_ref);
41 + self
42 + }
43 +
44 + /// Shadow column: until migration 004 drops `passed`, every write
45 + /// also populates the legacy boolean. `Blocked` reads as failing
46 + /// because gates that are blocked have not satisfied the pipeline.
47 + pub fn legacy_passed(&self) -> bool {
48 + matches!(self.status, GateStatus::Passed { .. })
49 + }
50 +
51 + /// Shadow column: human-readable single-line summary for the legacy
52 + /// `gate_runs.detail` column. Goes away when migration 004 drops it.
53 + pub fn legacy_detail(&self) -> String {
54 + match &self.status {
55 + GateStatus::Passed { note } => note.summary(),
56 + GateStatus::Failed { failure } => failure.summary(),
57 + GateStatus::Blocked { blocker } => blocker.summary(),
58 + }
59 + }
60 +
61 + /// The high-level status word for the `gate_runs.status` column.
62 + pub fn status_str(&self) -> &'static str {
63 + match self.status {
64 + GateStatus::Passed { .. } => "passed",
65 + GateStatus::Failed { .. } => "failed",
66 + GateStatus::Blocked { .. } => "blocked",
67 + }
68 + }
69 + }
70 +
71 + #[derive(Debug, Clone, Serialize, Deserialize)]
72 + #[serde(tag = "kind", rename_all = "snake_case")]
73 + pub enum GateStatus {
74 + /// Gate ran and succeeded. The note carries gate-specific evidence
75 + /// (e.g. `TestsPassed { duration_s }`).
76 + Passed { note: PassNote },
77 + /// Gate ran and failed. Two-layer tag: outer `kind = "failed"`, inner
78 + /// `failure.kind` names the classified variant. If no classifier
79 + /// matched, that's `unclassified`.
80 + Failed { failure: GateFailure },
81 + /// Gate cannot run yet. Burn-in clock not started, scratch DB not
82 + /// configured, backup missing — pre-conditions the operator can fix
83 + /// out of band. Distinguished from `Failed` so the TUI can render
84 + /// these yellow rather than red.
85 + Blocked { blocker: GateBlocker },
86 + }
87 +
88 + #[derive(Debug, Clone, Serialize, Deserialize)]
89 + #[serde(tag = "kind", rename_all = "snake_case")]
90 + pub enum PassNote {
91 + /// `boot_smoke` — the binary stayed up for the smoke window.
92 + StayedUp { duration_s: u32 },
93 + /// `burn_in` — the configured number of hours have elapsed since
94 + /// the gate's clock started.
95 + BurnInElapsed { hours: u32 },
96 + /// `migration_dry_run` — scratch DB restored from `backup_path` and
97 + /// every migration ran without error.
98 + Migrated { backup_path: String },
99 + /// `cargo_test` — `cargo test --release` exited 0.
100 + TestsPassed { duration_s: u32 },
101 + /// `manual_confirm` — an operator inserted a passing row out-of-band.
102 + OperatorConfirmed { at: DateTime<Utc> },
103 + /// Legacy rows backfilled from the pre-typed schema. Carries the
104 + /// original `detail` string so nothing is lost.
105 + Legacy { text: String },
106 + }
107 +
108 + impl PassNote {
109 + pub fn summary(&self) -> String {
110 + match self {
111 + PassNote::StayedUp { duration_s } => format!("stayed up for {duration_s}s"),
112 + PassNote::BurnInElapsed { hours } => format!("{hours} hours elapsed"),
113 + PassNote::Migrated { backup_path } => format!("restored {backup_path} + migrated"),
114 + PassNote::TestsPassed { duration_s } => format!("tests passed in {duration_s}s"),
115 + PassNote::OperatorConfirmed { at } => format!("operator confirmed at {at}"),
116 + PassNote::Legacy { text } => text.clone(),
117 + }
118 + }
119 + }
120 +
121 + #[derive(Debug, Clone, Serialize, Deserialize)]
122 + #[serde(tag = "kind", rename_all = "snake_case")]
123 + pub enum GateBlocker {
124 + /// `burn_in`: the tier's `tier_state.burn_in_started_at` is NULL.
125 + BurnInClockNotStarted,
126 + /// `burn_in`: clock running but not enough time elapsed yet.
127 + BurnInRemaining { hours_remaining: u32, hours_total: u32 },
128 + /// `manual_confirm`: no out-of-band passing row exists for this
129 + /// (tier, version).
130 + AwaitingOperatorConfirmation,
131 + /// `migration_dry_run`: no row in `backups` to restore from.
132 + NoBackupAvailable,
133 + /// `migration_dry_run` / `boot_smoke` / `cargo_test`: daemon config
134 + /// has no `scratch_db_url`.
135 + ScratchDbUrlUnset,
136 + /// `boot_smoke`: no `artifact_path` in `versions` for this version.
137 + ArtifactMissing { version: Version },
138 + }
139 +
140 + impl GateBlocker {
141 + pub fn summary(&self) -> String {
142 + match self {
143 + GateBlocker::BurnInClockNotStarted => "burn-in clock not started".into(),
144 + GateBlocker::BurnInRemaining { hours_remaining, hours_total } =>
145 + format!("{hours_remaining} hours remaining of {hours_total}"),
146 + GateBlocker::AwaitingOperatorConfirmation => "waiting on operator confirmation".into(),
147 + GateBlocker::NoBackupAvailable => "no backup fetched; call /backup/fetch first".into(),
148 + GateBlocker::ScratchDbUrlUnset => "scratch_db_url unset in daemon config".into(),
149 + GateBlocker::ArtifactMissing { version } => format!("no artifact for version {version}"),
150 + }
151 + }
152 + }
153 +
154 + #[derive(Debug, Clone, Serialize, Deserialize)]
155 + #[serde(tag = "kind", rename_all = "snake_case")]
156 + pub enum GateFailure {
157 + /// `cargo_test` exited non-zero. `failed_count` may be 0 if the
158 + /// classifier couldn't parse the count (e.g. compile error).
159 + CargoTest { failed_count: u32, first_failed: Option<String> },
160 + /// `migration_dry_run`: a migration that was previously applied is
161 + /// no longer present in the resolved migrations directory.
162 + MigrationDrift { migration: String },
163 + /// `migration_dry_run`: a migration that was previously applied has
164 + /// been modified (checksum mismatch).
165 + MigrationModified { migration: String },
166 + /// `migration_dry_run`: postgres rejected a migration's SQL.
167 + MigrationSqlError { migration: String, sqlstate: Option<String> },
168 + /// `migration_dry_run`: scratch DB reset or dump restore failed.
169 + RestoreFailed { reason: String },
170 + /// `boot_smoke`: binary exited with a non-zero status during the
171 + /// smoke window. Most likely a panic; `exit_code` carries the OS
172 + /// status when one is available.
173 + BootPanic { exit_code: Option<i32> },
174 + /// `boot_smoke`: binary exited 0 before the smoke window elapsed.
175 + BootExitedEarly { exit_code: Option<i32> },
176 + /// `cargo_test` / `boot_smoke`: tokio could not spawn the child.
177 + SpawnFailed { message: String },
178 + /// Gate took longer than the configured ceiling.
179 + Timeout { gate: GateKind, after_s: u32 },
180 + /// Classifier could not match the output to any known variant. The
181 + /// `log_ref` on the enclosing `GateOutcome` is the diagnostic path.
182 + Unclassified { legacy_detail: Option<String> },
183 + }
184 +
185 + impl GateFailure {
186 + pub fn summary(&self) -> String {
187 + match self {
188 + GateFailure::CargoTest { failed_count, first_failed: Some(name) } =>
189 + format!("{failed_count} test(s) failed; first: {name}"),
190 + GateFailure::CargoTest { failed_count, first_failed: None } =>
191 + format!("{failed_count} test(s) failed"),
192 + GateFailure::MigrationDrift { migration } =>
193 + format!("migration {migration} previously applied but missing"),
194 + GateFailure::MigrationModified { migration } =>
195 + format!("migration {migration} previously applied but modified"),
196 + GateFailure::MigrationSqlError { migration, sqlstate: Some(s) } =>
197 + format!("migration {migration} sql error ({s})"),
198 + GateFailure::MigrationSqlError { migration, sqlstate: None } =>
199 + format!("migration {migration} sql error"),
200 + GateFailure::RestoreFailed { reason } => format!("restore: {reason}"),
201 + GateFailure::BootPanic { exit_code: Some(c) } => format!("binary panicked: exit {c}"),
202 + GateFailure::BootPanic { exit_code: None } => "binary panicked".into(),
203 + GateFailure::BootExitedEarly { exit_code: Some(c) } => format!("binary exited early: exit {c}"),
204 + GateFailure::BootExitedEarly { exit_code: None } => "binary exited early".into(),
205 + GateFailure::SpawnFailed { message } => format!("spawn: {message}"),
206 + GateFailure::Timeout { gate, after_s } => format!("{gate} timed out after {after_s}s"),
207 + GateFailure::Unclassified { legacy_detail: Some(d) } => d.clone(),
208 + GateFailure::Unclassified { legacy_detail: None } => "unclassified failure".into(),
209 + }
210 + }
211 + }
212 +
213 + // ---------------------------------------------------------------------
214 + // Deploy outcomes (step 7)
215 + // ---------------------------------------------------------------------
216 +
217 + /// Typed outcome of one node-deploy attempt. Stored as `outcome_json` in
218 + /// the `deploys` table and emitted in `Event::DeployFailed` so consumers
219 + /// can distinguish a node-unreachable error (operator: check the box)
220 + /// from rsync mid-transfer corruption (operator: check disk/network).
221 + #[derive(Debug, Clone, Serialize, Deserialize)]
222 + pub struct DeployOutcome {
223 + pub status: DeployStatus,
224 + }
225 +
226 + impl DeployOutcome {
227 + pub fn ok() -> Self { Self { status: DeployStatus::Ok } }
228 + pub fn failed(failure: DeployFailureKind) -> Self {
229 + Self { status: DeployStatus::Failed { failure } }
230 + }
231 + pub fn in_progress() -> Self { Self { status: DeployStatus::InProgress } }
232 +
233 + /// `'in_progress' | 'ok' | 'failed'` — the value of the legacy
234 + /// `deploys.outcome` column.
235 + pub fn status_str(&self) -> &'static str {
236 + match self.status {
237 + DeployStatus::InProgress => "in_progress",
238 + DeployStatus::Ok => "ok",
239 + DeployStatus::Failed { .. } => "failed",
240 + }
241 + }
242 + }
243 +
244 + #[derive(Debug, Clone, Serialize, Deserialize)]
245 + #[serde(tag = "kind", rename_all = "snake_case")]
246 + pub enum DeployStatus {
247 + InProgress,
248 + Ok,
249 + Failed { failure: DeployFailureKind },
250 + }
251 +
252 + #[derive(Debug, Clone, Serialize, Deserialize)]
253 + #[serde(tag = "kind", rename_all = "snake_case")]
254 + pub enum DeployFailureKind {
255 + /// SSH to the node failed before any state changed. Typically a dead
256 + /// host, network partition, or stale known_hosts.
257 + NodeUnreachable { detail: String },
258 + /// rsync exited non-zero mid-transfer. The on-target release dir may
259 + /// be partially populated, but the `current` symlink is untouched.
260 + RsyncFailed { detail: String },
261 + /// Files copied successfully but the atomic symlink swap step
262 + /// failed. The new release is on disk; the service is still running
263 + /// the old one.
264 + SymlinkSwapFailed { detail: String },
265 + /// Symlink swapped but `systemctl reload-or-restart` returned
266 + /// non-zero. The new code is current but the service may have
267 + /// crashed on startup.
268 + ServiceRestartFailed { detail: String },
269 + /// Classifier couldn't match the error to a known variant. The full
270 + /// anyhow chain is in `detail`.
271 + Unclassified { detail: String },
272 + }
273 +
274 + impl DeployFailureKind {
275 + pub fn summary(&self) -> String {
276 + match self {
277 + DeployFailureKind::NodeUnreachable { detail } => format!("node unreachable: {detail}"),
278 + DeployFailureKind::RsyncFailed { detail } => format!("rsync: {detail}"),
279 + DeployFailureKind::SymlinkSwapFailed { detail } => format!("symlink swap: {detail}"),
280 + DeployFailureKind::ServiceRestartFailed { detail } => format!("service restart: {detail}"),
281 + DeployFailureKind::Unclassified { detail } => detail.chars().take(200).collect(),
282 + }
283 + }
284 + }
285 +
286 + /// Pointer to the on-disk gate log: a path relative to `cfg.logs_root`
287 + /// of the form `<version>/<gate_kind>.log`. Stored in `gate_runs.log_ref`
288 + /// and surfaced in `/state` so the TUI/operator can request the full
289 + /// tail via `GET /logs/<version>/<gate>` only when needed.
290 + #[derive(Debug, Clone, Serialize, Deserialize)]
291 + #[serde(transparent)]
292 + pub struct LogRef(pub String);
293 +
294 + impl LogRef {
295 + pub fn new(version: &Version, gate: GateKind) -> Self {
296 + Self(format!("{}/{}.log", version, gate.as_str()))
297 + }
298 + pub fn as_str(&self) -> &str { &self.0 }
299 + }
300 +
301 + impl std::fmt::Display for LogRef {
302 + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) }
303 + }
304 +
305 + #[cfg(test)]
306 + mod tests {
307 + use super::*;
308 +
309 + #[test]
310 + fn outcome_serialization_is_two_layer_tagged() {
311 + let o = GateOutcome::failed(GateFailure::MigrationDrift {
312 + migration: "0047_widgets".into(),
313 + });
314 + let v: serde_json::Value = serde_json::to_value(&o).unwrap();
315 + assert_eq!(v["status"]["kind"], "failed");
316 + assert_eq!(v["status"]["failure"]["kind"], "migration_drift");
317 + assert_eq!(v["status"]["failure"]["migration"], "0047_widgets");
318 + }
319 +
320 + #[test]
321 + fn outcome_round_trips_through_json() {
322 + let o = GateOutcome::passed(PassNote::TestsPassed { duration_s: 42 });
323 + let s = serde_json::to_string(&o).unwrap();
324 + let back: GateOutcome = serde_json::from_str(&s).unwrap();
325 + assert!(back.legacy_passed());
326 + assert_eq!(back.status_str(), "passed");
327 + }
328 +
329 + #[test]
330 + fn blocked_legacy_passed_is_false() {
331 + let o = GateOutcome::blocked(GateBlocker::BurnInClockNotStarted);
332 + assert!(!o.legacy_passed());
333 + assert_eq!(o.status_str(), "blocked");
334 + }
335 +
336 + #[test]
337 + fn legacy_detail_summaries_match_pre_typed_strings() {
338 + // The pre-typed gate runner wrote specific prose for each
339 + // pass/blocker case. The summary impl is what populates the
340 + // shadow `detail` column during the migration-003 shadow period
341 + // so old consumers (TUI, /state) keep working unchanged.
342 + assert_eq!(
343 + GateOutcome::blocked(GateBlocker::BurnInClockNotStarted).legacy_detail(),
344 + "burn-in clock not started",
345 + );
346 + assert_eq!(
347 + GateOutcome::blocked(GateBlocker::ScratchDbUrlUnset).legacy_detail(),
348 + "scratch_db_url unset in daemon config",
349 + );
350 + assert_eq!(
351 + GateOutcome::blocked(GateBlocker::NoBackupAvailable).legacy_detail(),
352 + "no backup fetched; call /backup/fetch first",
353 + );
354 + assert_eq!(
355 + GateOutcome::blocked(GateBlocker::AwaitingOperatorConfirmation).legacy_detail(),
356 + "waiting on operator confirmation",
357 + );
358 + assert_eq!(
359 + GateOutcome::blocked(GateBlocker::BurnInRemaining { hours_remaining: 47, hours_total: 168 })
360 + .legacy_detail(),
361 + "47 hours remaining of 168",
362 + );
363 + assert_eq!(
364 + GateOutcome::passed(PassNote::StayedUp { duration_s: 3 }).legacy_detail(),
365 + "stayed up for 3s",
366 + );
367 + }
368 +
369 + #[test]
370 + fn log_ref_construction_matches_disk_layout() {
371 + let v: Version = "0.9.6".parse().unwrap();
372 + let lr = LogRef::new(&v, GateKind::CargoTest);
373 + assert_eq!(lr.as_str(), "0.9.6/cargo_test.log");
374 + }
375 +
376 + #[test]
377 + fn unclassified_preserves_legacy_detail() {
378 + let o = GateOutcome::failed(GateFailure::Unclassified {
379 + legacy_detail: Some("binary exited early: exit status: 101\n==== stdout ====\n...".into()),
380 + });
381 + let v: serde_json::Value = serde_json::to_value(&o).unwrap();
382 + assert_eq!(v["status"]["failure"]["kind"], "unclassified");
383 + assert!(v["status"]["failure"]["legacy_detail"]
384 + .as_str().unwrap().contains("exit status: 101"));
385 + }
386 + }
@@ -43,9 +43,22 @@ struct TierView {
43 43 #[derive(Serialize)]
44 44 struct GateView {
45 45 kind: String, // gate kind name; presumably the `gate_kind` column — the assignment is outside this hunk, confirm
46 + /// Shadow field — kept until step 5 (TUI typed-event handling) so
47 + /// the current TUI keeps rendering pass/fail without consulting
48 + /// `outcome`. Goes away in migration 004.
46 49 passed: Option<bool>, // nullable integer column; any non-zero value maps to `true`
47 50 finished_at: Option<String>, // passed through from the row as text
51 + /// Shadow field — legacy free-form prose, superseded by `outcome.status`.
48 52 detail: Option<String>,
53 + /// `'passed' | 'failed' | 'blocked'` or NULL while in-flight. The
54 + /// TUI can rely on this to choose green/red/yellow rendering once
55 + /// step 5 lands; until then it falls back to `passed`.
56 + status: Option<String>,
57 + /// Full `GateOutcome` as an untyped JSON object, when present. sandod only parses the stored
58 + /// text into a `Value` (text that fails to parse becomes `None`); typed decoding is left to the consumer.
59 + outcome: Option<serde_json::Value>,
60 + /// Relative path under `cfg.logs_root` to the persisted stdout/stderr.
61 + log_ref: Option<String>,
49 62 }
50 63
51 64 async fn get_state(State(s): State<AppState>) -> Result<Json<StateView>> {
@@ -89,7 +102,7 @@ async fn get_state(State(s): State<AppState>) -> Result<Json<StateView>> {
89 102 let gates: Vec<GateView> = if let Some(ver) = gate_version.as_ref() {
90 103 // Most recent gate_runs row per gate_kind for (tier, ver).
91 104 sqlx::query(
92 - "SELECT gate_kind, passed, finished_at, detail
105 + "SELECT gate_kind, passed, finished_at, detail, status, outcome_json, log_ref
93 106 FROM gate_runs g
94 107 WHERE tier = ?1 AND version = ?2
95 108 AND id = (SELECT MAX(id) FROM gate_runs
@@ -106,6 +119,10 @@ async fn get_state(State(s): State<AppState>) -> Result<Json<StateView>> {
106 119 passed: gr.get::<Option<i64>, _>("passed").map(|v| v != 0),
107 120 finished_at: gr.get("finished_at"),
108 121 detail: gr.get("detail"),
122 + status: gr.get("status"),
123 + outcome: gr.get::<Option<String>, _>("outcome_json")
124 + .and_then(|s| serde_json::from_str(&s).ok()),
125 + log_ref: gr.get("log_ref"),
109 126 })
110 127 .collect()
111 128 } else {
@@ -146,6 +163,7 @@ async fn promote(
146 163 body: Option<Json<PromoteBody>>,
147 164 ) -> Result<Json<serde_json::Value>> {
148 165 let body = body.map(|Json(b)| b).unwrap_or_default();
166 + let tier = crate::domain::TierId::new(tier);
149 167 let idx = s.topo.tiers.iter().position(|t| t.name == tier)
150 168 .ok_or(crate::error::Error::NotFound)?;
151 169 if idx == 0 {
@@ -157,7 +175,7 @@ async fn promote(
157 175 let source = &s.topo.tiers[idx - 1];
158 176
159 177 // Resolve version: explicit if given, else the source tier's current.
160 - let version = match body.version {
178 + let version_str = match body.version {
161 179 Some(v) => v,
162 180 None => sqlx::query_scalar::<_, Option<String>>(
163 181 "SELECT current_version FROM tier_state WHERE tier = ?",
@@ -170,10 +188,12 @@ async fn promote(
170 188 format!("no version specified and tier {} has no current_version", source.name),
171 189 ))?,
172 190 };
191 + let version = crate::domain::Version::parse(&version_str)
192 + .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?;
173 193
174 194 // 1. Predecessor must have all of its gates green for this version (with
175 195 // optional hotfix override that skips burn_in).
176 - let pending = unsatisfied_gates(&s.pool, &source.name, &version, body.hotfix).await?;
196 + let pending = unsatisfied_gates(&s.pool, source.name.as_str(), &version_str, body.hotfix).await?;
177 197 if !pending.is_empty() {
178 198 return Err(crate::error::Error::GateBlocked(format!(
179 199 "{} gate(s) not satisfied on tier {}: {}",
@@ -208,30 +228,39 @@ async fn promote(
208 228 crate::events::emit(&s.events, crate::events::Event::DeployStart {
209 229 tier: target.name.clone(), node: node.name.clone(), version: version.clone(),
210 230 });
211 - let result = crate::deploy::deploy_node(node, &version, &staged_dir, s.cfg.primary_bin()).await;
231 + let result = crate::deploy::deploy_node(node, &version_str, &staged_dir, s.cfg.primary_bin()).await;
212 232 let finished = chrono::Utc::now().to_rfc3339();
213 - let (outcome, err_msg) = match &result {
214 - Ok(_) => ("ok", None),
215 - Err(e) => ("failed", Some(format!("{e:#}"))),
233 + let (outcome_obj, err_for_propagation) = match result {
234 + Ok(_) => (crate::outcome::DeployOutcome::ok(), None),
235 + Err(e) => {
236 + let msg = format!("{e:#}");
237 + let kind = crate::classify::classify_deploy_error(&msg);
238 + (crate::outcome::DeployOutcome::failed(kind), Some(e))
239 + }
216 240 };
241 + let outcome_json = serde_json::to_string(&outcome_obj)
242 + .unwrap_or_else(|e| format!("{{\"_serialize_error\":{e:?}}}"));
217 243 sqlx::query(
218 - "INSERT INTO deploys (version, tier, node, started_at, finished_at, outcome, hotfix, reset_burn_in)
219 - VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
244 + "INSERT INTO deploys (version, tier, node, started_at, finished_at, outcome, outcome_json, hotfix, reset_burn_in)
245 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
220 246 )
221 247 .bind(&version).bind(&target.name).bind(&node.name)
222 - .bind(&started).bind(&finished).bind(outcome)
248 + .bind(&started).bind(&finished).bind(outcome_obj.status_str())
249 + .bind(&outcome_json)
223 250 .bind(body.hotfix as i64).bind(body.reset_burn_in as i64)
224 251 .execute(&s.pool).await.map_err(crate::error::Error::Db)?;
225 - if let Err(e) = result {
226 - let msg = err_msg.unwrap_or_default();
252 + if let Some(e) = err_for_propagation {
253 + let crate::outcome::DeployStatus::Failed { failure } = outcome_obj.status else {
254 + unreachable!("err_for_propagation is Some iff status is Failed");
255 + };
227 256 tracing::error!(
228 257 tier = %target.name, node = %node.name, version = %version,
229 - error = %msg,
258 + failure = failure.summary(),
230 259 "deploy failed; current symlink left intact, tier_state not advanced"
231 260 );
232 261 crate::events::emit(&s.events, crate::events::Event::DeployFailed {
233 262 tier: target.name.clone(), node: node.name.clone(),
234 - version: version.clone(), error: msg,
263 + version: version.clone(), failure,
235 264 });
236 265 return Err(crate::error::Error::Other(e));
237 266 }
@@ -317,6 +346,7 @@ async fn rollback(
317 346 State(s): State<AppState>,
318 347 Path(tier): Path<String>,
319 348 ) -> Result<Json<serde_json::Value>> {
349 + let tier = crate::domain::TierId::new(tier);
320 350 let target = s.topo.tiers.iter().find(|t| t.name == tier)
321 351 .ok_or(crate::error::Error::NotFound)?;
322 352
@@ -325,11 +355,15 @@ async fn rollback(
325 355 )
326 356 .bind(&tier)
327 357 .fetch_optional(&s.pool).await.map_err(crate::error::Error::Db)?;
328 - let (Some(current), Some(previous)) = row.unwrap_or((None, None)) else {
358 + let (Some(current_str), Some(previous_str)) = row.unwrap_or((None, None)) else {
329 359 return Err(crate::error::Error::GateBlocked(
330 360 "no previous_version to roll back to".into(),
331 361 ));
332 362 };
363 + let current = crate::domain::Version::parse(&current_str)
364 + .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?;
365 + let previous = crate::domain::Version::parse(&previous_str)
366 + .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?;
333 367
334 368 let bin: Option<(String,)> = sqlx::query_as(
335 369 "SELECT artifact_path FROM versions WHERE version = ?",
@@ -347,7 +381,7 @@ async fn rollback(
347 381 .to_path_buf();
348 382
349 383 for node in &target.nodes {
350 - crate::deploy::deploy_node(node, &previous, &staged_dir, s.cfg.primary_bin())
384 + crate::deploy::deploy_node(node, &previous_str, &staged_dir, s.cfg.primary_bin())
351 385 .await
352 386 .map_err(crate::error::Error::Other)?;
353 387 }
@@ -395,6 +429,11 @@ async fn rebuild(
395 429 .map_err(crate::error::Error::Other)?,
396 430 };
397 431
432 + // Boundary parse: a sha entering Sando must be hex of plausible length.
433 + // The build pipeline downstream only ever sees `GitSha`.
434 + let sha = crate::domain::GitSha::parse(&sha)
435 + .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?;
436 +
398 437 tracing::info!(sha = %sha, "rebuild requested");
399 438 crate::events::emit(&s.events, crate::events::Event::RebuildRequested { sha: sha.clone() });
400 439
@@ -416,6 +455,7 @@ async fn rebuild(
416 455 let topo = s.topo.clone();
417 456 let events_for_task = s.events.clone();
418 457 let sha_for_task = sha.clone();
458 + let sha_response = sha.to_string();
419 459 let handle = tokio::spawn(async move {
420 460 if let Err(e) = crate::build::build_and_run_host(pool, cfg, topo, sha_for_task.clone(), events_for_task).await {
421 461 tracing::error!(sha = %sha_for_task, error = %e, "rebuild pipeline failed");
@@ -423,7 +463,7 @@ async fn rebuild(
423 463 });
424 464 *slot = Some(handle.abort_handle());
425 465
426 - Ok(Json(serde_json::json!({ "accepted": true, "sha": sha })))
466 + Ok(Json(serde_json::json!({ "accepted": true, "sha": sha_response })))
427 467 }
428 468
429 469 async fn confirm(
@@ -433,17 +473,20 @@ async fn confirm(
433 473 // Operator-driven satisfaction of a `manual_confirm` gate. Looks up the
434 474 // pending version (current MM version, or the tier's own if non-mm) and
435 475 // inserts a passing gate_runs row so /promote can advance.
476 + let tier = crate::domain::TierId::new(tier);
436 477 let target = s.topo.tiers.iter().find(|t| t.name == tier)
437 478 .ok_or(crate::error::Error::NotFound)?;
438 479
439 - let version: Option<String> = sqlx::query_scalar(
480 + let version_str: Option<String> = sqlx::query_scalar(
440 481 "SELECT current_version FROM tier_state WHERE tier = ?",
441 482 )
442 483 .bind(&target.name)
443 484 .fetch_optional(&s.pool).await.map_err(crate::error::Error::Db)?.flatten();
444 - let version = version.ok_or_else(|| crate::error::Error::GateBlocked(
485 + let version_str = version_str.ok_or_else(|| crate::error::Error::GateBlocked(
445 486 format!("tier {tier} has no current_version; nothing to confirm"),
446 487 ))?;
488 + let version = crate::domain::Version::parse(&version_str)
489 + .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?;
447 490
448 491 let now = chrono::Utc::now().to_rfc3339();
449 492 sqlx::query(
@@ -1,3 +1,4 @@
1 + use crate::domain::{GateKind, NodeId, TierId};
1 2 use anyhow::{Context, Result};
2 3 use serde::{Deserialize, Serialize};
3 4 use std::path::Path;
@@ -24,7 +25,7 @@ pub struct BackupConfig {
24 25
25 26 #[derive(Debug, Clone, Serialize, Deserialize)]
26 27 pub struct Tier {
27 - pub name: String,
28 + pub name: TierId,
28 29 #[serde(default)]
29 30 pub provisioned: bool,
30 31 pub gates: Vec<Gate>,
@@ -36,7 +37,7 @@ pub struct Tier {
36 37
37 38 #[derive(Debug, Clone, Serialize, Deserialize)]
38 39 pub struct Node {
39 - pub name: String,
40 + pub name: NodeId,
40 41 pub ssh_target: String,
41 42 pub release_root: String,
42 43 /// systemd unit name to reload-or-restart after the symlink swap.
@@ -74,6 +75,21 @@ pub enum Gate {
74 75 ManualConfirm,
75 76 }
76 77
78 + impl Gate {
79 + /// Returns this gate's `GateKind` discriminant — the identifier used in events,
80 + /// schema columns, and the TUI. Gate parameters (e.g. `BurnIn.hours`) stay with
81 + /// `Gate` and are not carried into `gate_runs` history.
82 + pub fn kind(&self) -> GateKind {
83 + match self { // no wildcard arm, so a new `Gate` variant fails to compile until it is mapped here
84 + Gate::CargoTest => GateKind::CargoTest,
85 + Gate::MigrationDryRun => GateKind::MigrationDryRun,
86 + Gate::BootSmoke => GateKind::BootSmoke,
87 + Gate::BurnIn { .. } => GateKind::BurnIn, // fields are deliberately dropped; only the kind is kept
88 + Gate::ManualConfirm => GateKind::ManualConfirm,
89 + }
90 + }
91 + }
92 +
77 93 impl Topology {
78 94 pub fn load(path: &Path) -> Result<Self> {
79 95 let raw = std::fs::read_to_string(path)
@@ -86,7 +102,7 @@ impl Topology {
86 102 fn validate(&self) -> Result<()> {
87 103 anyhow::ensure!(!self.tiers.is_empty(), "topology must declare at least one tier");
88 104 for t in &self.tiers {
89 - if t.provisioned && t.nodes.is_empty() && t.name != "host" {
105 + if t.provisioned && t.nodes.is_empty() && t.name.as_str() != "host" {
90 106 anyhow::bail!("tier {} is provisioned but has no nodes", t.name);
91 107 }
92 108 }
@@ -0,0 +1,213 @@
1 + # Sando observability architecture
2 +
3 + Status: draft, 2026-06-03. Argument shape, not a checklist.
4 +
5 + Goal: an error and visibility surface built on newtypes. Sando's current daemon and TUI agree by string convention — `tier: String`, `version: String`, `gate: String`, `detail: Option<String>` — and every consumer (schema, WS payloads, TUI render) reparses or pattern-matches those strings independently. The result is that gate failure classification only exists as prose in `gate_runs.detail`, and the TUI's only recourse on red is "open `/logs/<version>/<gate>` and read the tail." This document proposes the type graph that replaces the strings, the boundary parses that mint the types, and the persistence + wire shapes that follow.
6 +
7 + This is one cohesive design. The launchplan splits it into "Phase B (typed GateOutcome)" and "Phase C (live tail + remaining newtypes)"; that split is a delivery convenience, not an architectural seam. The migration order below preserves the option to ship Phase B first, but the types it introduces must already be the final shape, not a string stand-in to be newtyped later.
8 +
9 + ## What is currently stringly-typed
10 +
11 + Inventory from a read of `daemon/src/gates.rs`, `daemon/src/events.rs`, `daemon/src/topology.rs`, `daemon/migrations/001_init.sql`, and `tui/src/main.rs:186-266`:
12 +
13 + | Concept | Current shape | Where it appears |
14 + |---|---|---|
15 + | Tier | `tier: String` | `GateCtx`, every `Event` variant, `tiers.name`, `gate_runs.tier`, `tier_state.tier`, `deploys.tier`, TUI `str_v(v.get("tier"))` |
16 + | Node | `node: String` (sometimes `Option`) | `Tier.nodes[].name`, `deploys.node`, `Event::Deploy*` |
17 + | Version | `version: String` (semver `0.8.12`) | `GateCtx`, `versions.version`, every event, TUI display |
18 + | Git sha | `sha: String` (short or full?) | `versions.git_sha`, `Event::RebuildRequested/BuildStart/BuildOk/BuildFailed/BuildAborted`, `/logs/<sha>/<gate>` per launchplan §3 |
19 + | Gate kind | `Gate` enum *inside the daemon*; `gate_kind: &'static str` once it crosses the events/schema boundary; reparsed in TUI | `gates::kind_str`, `gate_runs.gate_kind`, `Event::Gate*.gate`, TUI `match kind` |
20 + | Gate outcome detail | `detail: Option<String>` | `GateOutcome.detail`, `gate_runs.detail`, conflates: config error, missing prerequisite, real failure with stderr tail, burn-in clock progress, "waiting on operator" |
21 + | Deploy outcome | `outcome TEXT DEFAULT 'in_progress'` | `deploys.outcome`; no enum on the Rust side |
22 + | Deploy failure | `error: String` | `Event::DeployFailed.error`; freeform |
23 + | Backup source | `source: String` | `backups.source`, `Event::BackupFetched.source`; an ssh URL today |
24 + | Canary policy | typed (`CanaryPolicy`) at config load, then `as_str()` to a `TEXT` column | `tiers.canary` |
25 +
26 + `gates::kind_str` and the TUI's `match kind` are the clearest tell — we have an enum on each side of the wire and a string in the middle. That string is the only schema, so adding a gate kind is a four-place edit with no compile-time enforcement.
27 +
28 + The same anti-pattern with higher stakes is `detail`. Four concrete examples from `gates.rs`:
29 +
30 + - `migration_dry_run` writes `"scratch_db_url unset in daemon config"` (a config bug, no point retrying).
31 + - `migration_dry_run` writes `"no backup fetched; call /backup/fetch first"` (a missing prerequisite, retry after `/backup/fetch`).
32 + - `boot_smoke` writes `"binary exited early: exit status: 101\n==== stdout ====\n…"` (a real failure carrying a tail).
33 + - `burn_in` writes `"47 hours remaining of 168"` (not a failure at all — the gate is correctly red but the deploy is not blocked by a defect).
34 +
35 + Today, an operator (or the TUI, or a future alerting rule) cannot tell those apart without substring-matching the prose. The point of the redesign is to make that classification a property of the value, not a property of how it's spelled.
36 +
37 + ## The type graph
38 +
39 + Domain types live in a new `daemon/src/domain.rs` (name negotiable). They are the vocabulary every other module speaks; they implement `Serialize`/`Deserialize`, `Display`, `FromStr`, and `sqlx::Type` so they round-trip through events, JSON responses, and SQLite columns without per-site conversion.
40 +
41 + ```text
42 + ┌──────────────┐
43 + │ TierId │ newtype String, validated against topology on construction
44 + └──────────────┘
45 + ┌──────────────┐
46 + │ NodeId │ newtype String, validated against TierId's nodes
47 + └──────────────┘
48 + ┌──────────────┐
49 + │ Version │ parsed semver (semver::Version), Display = "0.8.12"
50 + └──────────────┘
51 + ┌──────────────┐
52 + │ GitSha │ enforced 40-hex or short-7-hex; one canonical form
53 + └──────────────┘
54 + ┌──────────────┐
55 + │ GateKind │ enum, replaces both `Gate` discriminant and `gate_kind` string
56 + │ │ Variants: CargoTest, MigrationDryRun, BootSmoke,
57 + │ │ BurnIn, ManualConfirm. Display = snake_case.
58 + └──────────────┘
59 + ┌──────────────┐
60 + │ GateRunId │ newtype i64, identifies one row of gate_runs
61 + └──────────────┘
62 + ┌──────────────┐
63 + │ DeployId │ newtype i64
64 + └──────────────┘
65 + ```
66 +
67 + `Gate` (the topology struct with `BurnIn { hours }`) and `GateKind` (the discriminant) become distinct: `Gate` is the *config* (kind + parameters), `GateKind` is the *identifier* you use to talk about a class of gate in events and the schema. `gate_runs.gate_kind` stores `GateKind`; gate parameters at the time of the run, if we ever need them in history, become a separate column.
68 +
69 + ### `GateOutcome` redesign
70 +
71 + ```rust
72 + pub struct GateOutcome {
73 + pub status: GateStatus,
74 + pub log_ref: Option<LogRef>, // pointer to persisted stdout/stderr, not the tail itself
75 + }
76 +
77 + pub enum GateStatus {
78 + Passed { note: PassNote },
79 + Failed(GateFailure),
80 + Blocked(GateBlocker), // gate cannot run yet; not a defect
81 + }
82 +
83 + pub enum PassNote {
84 + StayedUp { duration_s: u32 }, // boot_smoke
85 + BurnInElapsed { hours: u32 }, // burn_in
86 + Migrated { backup: BackupId, count: u32 }, // migration_dry_run
87 + TestsPassed { count: u32, duration_s: u32 }, // cargo_test
88 + OperatorConfirmed { at: DateTime<Utc> }, // manual_confirm
89 + }
90 +
91 + pub enum GateBlocker {
92 + BurnInClockNotStarted,
93 + BurnInRemaining { hours: u32, total: u32 },
94 + AwaitingOperatorConfirmation,
95 + NoBackupAvailable,
96 + ScratchDbUrlUnset,
97 + ArtifactMissing { version: Version },
98 + }
99 +
100 + pub enum GateFailure {
101 + CargoTest { failed_count: u32, first_failed: Option<String> },
102 + MigrationDrift { migration: String }, // "previously applied but missing"
103 + MigrationModified { migration: String }, // "previously applied but modified"
104 + MigrationSqlError { migration: String, sqlstate: Option<String> },
105 + RestoreFailed { kind: RestoreFailureKind },
106 + BootPanic { exit_code: Option<i32> },
107 + BootExitedEarly { exit_code: Option<i32> },
108 + SpawnFailed { os_error: i32 },
109 + Timeout { gate: GateKind, after_s: u32 },
110 + Unclassified, // fallthrough; log_ref required
111 + }
112 + ```
113 +
114 + Three things to notice about this shape:
115 +
116 + 1. **`Blocked` is its own variant.** Burn-in not yet elapsed and "scratch_db_url unset" are not failures — they are pre-conditions the operator can address out-of-band. The TUI can render them yellow instead of red. Today they're red, indistinguishably.
117 + 2. **`log_ref`, not `detail`.** The structured variants carry just enough to render a one-line summary (`migration 0047 modified`, `tests failed: 3`). The full tail lives on disk and the variant carries a pointer to it. This is the architectural seam between Phase B (classification) and Phase C (live tail) — once `log_ref` exists, Phase C is "ship chunks of that log over WS as they're written" rather than a redesign.
118 + 3. **`Unclassified` is admitted.** Classifiers are best-effort and the migration plan must work when a new failure mode shows up that no classifier matches yet. Unclassified failures still have a `log_ref`; they degrade gracefully to the current "read the tail" experience without breaking the contract.
119 +
120 + ### Classifier layer
121 +
122 + Each gate's runner produces raw output (`stdout`, `stderr`, exit status, plus any structured signals like a sqlx error). A classifier maps `(GateKind, RawOutput) -> GateStatus`. Classifiers are pure functions and live in `daemon/src/classify/` — one file per gate. The taxonomy in `plans/migration-dryrun-failures.md` is already 80% of `migration_dry_run`'s classifier; it gets ported into code, not just docs.
123 +
124 + Classifiers can be unit-tested with captured fixtures (`tests/fixtures/cargo_test_failed_compile.txt` → `GateStatus::Failed(GateFailure::CargoTest { failed_count: 0, .. })`). This is the first place Sando gets meaningful test coverage for diagnostic behavior, which `todo.md` flags as a gap.
125 +
126 + ### Where each type enters Sando
127 +
128 + Boundary discipline: every newtype is constructed once, at the edge where its string form enters the process. Internally, only the typed form moves.
129 +
130 + - `TierId`, `NodeId` — at `Topology::load`. The validator already walks tiers; it now mints `TierId`s and rejects unknown references at parse time instead of letting them surface as foreign-key failures.
131 + - `Version` — at the build step (`build.rs`), parsed from the server `Cargo.toml`. Stored as text in `versions.version`; the column round-trips through `sqlx::Type for Version`.
132 + - `GitSha` — at the post-receive hook entry point in `routes.rs::rebuild`. The hook today passes a string; the route normalizes to `GitSha` immediately. The `/logs/<sha>/<gate>` route (launchplan §3 wording) becomes `/logs/<GitSha>/<GateKind>`, but on-disk storage continues to key by `Version` until the build step has both available together. The comment at `gates.rs:432-434` already anticipates this transition.
133 + - `GateKind` — at `Topology` deserialization. `Gate` keeps its parameter-carrying variants; `GateKind` is derived from `Gate` and used everywhere else.
134 + - `GateOutcome` — produced by gate runners, persisted, emitted in events, displayed in TUI. Never lowered to a `String` along the way.
135 +
136 + ## Persistence
137 +
138 + `gate_runs` becomes:
139 +
140 + ```sql
141 + CREATE TABLE gate_runs (
142 + id INTEGER PRIMARY KEY AUTOINCREMENT,
143 + version TEXT NOT NULL REFERENCES versions(version),
144 + tier TEXT NOT NULL REFERENCES tiers(name),
145 + gate_kind TEXT NOT NULL, -- GateKind, Display form
146 + started_at TEXT NOT NULL,
147 + finished_at TEXT,
148 + status TEXT, -- 'passed' | 'failed' | 'blocked' | NULL while in-flight
149 + outcome_json TEXT, -- serialized GateOutcome (PassNote / GateFailure / GateBlocker)
150 + log_ref TEXT -- relative path under cfg.logs_root, or NULL
151 + );
152 + ```
153 +
154 + `status` is denormalized for cheap indexing and `WHERE status = 'failed'` queries. `outcome_json` is the source of truth and is what the daemon reads back when serving `/state`. The migration drops `passed INTEGER` and `detail TEXT`, with a backfill that maps:
155 +
156 + - `passed = 1` → `status = 'passed'`, `outcome_json = {"kind":"passed","note":{"kind":"legacy","text":<old detail>}}`
157 + - `passed = 0`, detail matches a known prefix (`"burn-in"`, `"scratch_db_url unset"`, `"no backup fetched"`, `"waiting on operator"`) → `status = 'blocked'`, appropriate `GateBlocker` variant
158 + - `passed = 0` otherwise → `status = 'failed'`, `outcome_json = {"kind":"failed","failure":{"kind":"unclassified","legacy_detail":<old>}}`
159 +
160 + Backfill correctness is testable against the existing prod sandod sqlite — there are not many rows.
161 +
162 + `deploys.outcome` gets the same treatment in a smaller, separate migration: enum-typed (`InProgress | Ok | Failed { kind: DeployFailureKind }`), with the failure kind classifying spawn/transport/health-check distinctly from the freeform `error: String` carried by `Event::DeployFailed` today.
163 +
164 + ## Wire shape (events + `/state`)
165 +
166 + `Event::GateDone` becomes:
167 +
168 + ```rust
169 + GateDone {
170 + tier: TierId,
171 + version: Version,
172 + gate: GateKind,
173 + outcome: GateOutcome,
174 + }
175 + ```
176 +
177 + The old `passed: bool` is gone — `outcome.status` carries strictly more information. The TUI's `format_event` becomes a match on the typed envelope (deserialized via `serde_json::from_str::<EventEnvelope>`, not `Value` reflection), and the per-kind render functions in `tui/src/main.rs:186-266` collapse into `Display` impls on the domain types. The `str_v` / `num_v` helpers stop being needed.
178 +
179 + `/state` (the TUI's polling endpoint) gains a `gates: Vec<GateRunSummary>` per tier, where `GateRunSummary` is the typed outcome plus timestamps and a `log_ref`. The TUI no longer needs `/logs/...` to populate its primary view; `/logs` becomes the drill-down for unclassified failures only.
180 +
181 + Phase C's live tail rides on the same `log_ref`: while a gate is in flight, `log_ref` is present and the WS emits `GateLogChunk { run_id: GateRunId, bytes: Vec<u8>, seq: u32 }` events as the runner flushes to disk. The TUI keeps a per-run ring buffer keyed by `GateRunId`. Because `GateRunId` is a newtype with a clear identity, the wire protocol does not need to invent a separate stream identifier.
182 +
183 + ## Migration order
184 +
185 + The order is constrained by what compiles together, not by the launchplan's B-then-C framing.
186 +
187 + 1. **Domain types module** — introduce `TierId`, `NodeId`, `Version`, `GitSha`, `GateKind`, `GateRunId`, `DeployId` with `Serialize` / `sqlx::Type` impls. No call sites change yet. Pure addition.
188 + 2. **Topology + config use the types** — `Topology::load` mints `TierId` / `NodeId`; `GateCtx` carries `TierId`, `Version`, not strings. `gate_runs.tier` and `gate_runs.version` columns stay `TEXT` (no schema migration); the change is on the Rust side only. `kind_str` deletes.
189 + 3. **Events use the types** — every `Event` variant takes the newtypes. WS frames are unchanged on the wire (snake-case strings) because `Serialize` impls produce identical JSON. The TUI keeps its `Value` reflection during this step; nothing forces it to break.
190 + 4. **`GateOutcome` redesign + classifier layer + `gate_runs` migration** — the biggest single step. Runners produce typed outcomes; classifiers live in `daemon/src/classify/`; schema migration 003 lands with backfill. `/state` exposes typed outcomes. TUI keeps `Value` reflection but is now reading typed fields (`status`, `outcome_json`).
191 + 5. **TUI typed event handling** — deserialize `EventEnvelope` directly, drop `str_v` / `num_v`, render via `Display` on domain types. Yellow for `Blocked`, red for `Failed`, green for `Passed`.
192 + 6. **Live tail (`GateLogChunk` events, `log_ref` plumbing)** — gates write to a per-run log file as they run, not at completion; runners drop the `Vec<u8>` buffers in `boot_smoke` / `cargo_test` in favor of streaming through a `tokio::fs::File` writer that also broadcasts chunks. TUI consumes.
193 + 7. **Deploy outcome typing + `DeployFailed` failure-kind classifier** — same pattern, smaller scope. Closes out the rest of the string surface.
194 +
195 + Steps 1–3 can ship in one commit per step without changing observable behavior. Step 4 is where the operator visibility actually changes; it's the smallest unit that justifies a version bump and a sando deploy to itself.
196 +
197 + ## Non-goals and open questions
198 +
199 + - **Not aiming for a generic gate framework.** The `GateFailure` variants are MNW-specific (cargo, sqlx migrations, `makenotwork` binary boot). If we ever gate non-MNW projects, this enum will need to factor; until then, concrete variants are clearer than `Box<dyn>` traits.
200 + - **Not introducing `Result<GateOutcome, GateError>`.** Runner-internal failures (sqlite I/O, can't spawn) collapse into `GateStatus::Failed(GateFailure::SpawnFailed | …)` or `GateBlocker` as appropriate. The outer `Result` only exists for genuine "the daemon itself is broken" cases that prevent persistence at all.
201 + - **Open: backfill aggressiveness.** Should we run classifiers retroactively against the historical `detail` strings during the schema-3 backfill, or only forward? Forward-only is simpler and the history isn't operator-load-bearing — leaning toward forward-only with the `unclassified + legacy_detail` envelope above, but worth deciding.
202 + - **Open: where does `GateRunId` originate?** Today the `INSERT … RETURNING id` returns a raw i64 inside `gates::run`. Cleanest is for `gates::run` to mint a `GateRunId` and pass it back; the event bus then carries it from `GateStart` through `GateLogChunk` through `GateDone`, making client-side correlation trivial. Worth confirming this doesn't break the `manual_confirm` lookup pattern in `gates.rs:386-403`.
203 + - **Open: `serde(tag = "kind")` vs explicit variant shape on `GateOutcome`.** The events module already uses `#[serde(tag = "kind", rename_all = "snake_case")]`, which would give us `{"kind":"failed","failure":{"kind":"migration_drift","migration":"0047"}}`. Two layers of tagged unions is verbose but consistent. Alternative: flatten via `#[serde(flatten)]` and accept that the JSON shape diverges slightly from the Rust shape. Leaning toward the explicit two-layer form because the TUI's parser then mirrors the Rust enum 1:1.
204 + - **Open: do we keep `detail TEXT` for one release as a shadow column, or drop in the same migration?** Shadow is safer for rollback. Adds clutter. Probably worth one release of shadow.
205 +
206 + ## Acceptance, when this is done
207 +
208 + - `gates.rs` contains no `String` literals describing failure modes. Every failure path constructs a `GateFailure` variant.
209 + - The TUI's `format_event` is gone; rendering goes through `Display` on domain types.
210 + - An operator hitting `curl /api/state` sees structured outcomes (not just `"passed": false, "detail": "..."`).
211 + - A new gate kind is added by extending `GateKind`, `Gate`, and a classifier; the compiler enforces every other site.
212 + - `/logs/<version>/<gate>` is a fallback for `Unclassified` outcomes, not the primary diagnostic path.
213 + - Live tail works for `boot_smoke` and `cargo_test`; the buffered `Vec<u8>` pattern in `gates.rs:317-332` is gone.
@@ -0,0 +1,163 @@
1 + # Session 3 — first sando-driven prod deploy
2 +
3 + Captured 2026-06-03 after the cutover. Resolves §6.5 step 8 of `launchplan_final.md`: first full sando deploy to Hetzner prod, replacing `deploy.sh` as the live deploy path.
4 +
5 + Status: **complete 2026-06-03.** Prod runs `makenotwork` 0.9.5 (sha `f0970b8`) from `/opt/mnw/current/`, deployed via `POST /promote/b {"hotfix":true}` from sandod on pop-os. Outage window 3m25s (02:50:33 → 02:53:58 UTC). All features green. See §F for outcomes and §G for the four hardcoded paths that block the eventual `rm -rf /opt/makenotwork/`.
6 +
7 + ## Background — Session 1 set the layout, Session 2 proved it on testnot, Session 3 cut prod over
8 +
9 + Session 1 redesigned the on-disk layout (`/opt/mnw/releases/<v>/` + `current` symlink; `/etc/mnw/makenotwork.env`; `/var/lib/mnw/` for state) and shipped the sando-side code that produces the full versioned bundle (binaries + static + docs + error-pages + assumptions). Session 2 reprovisioned testnot under that layout; the first remote deploy of the full bundle landed cleanly after three small gotchas (sqlx URL form, pg_ident map, `ASSUMPTIONS_PATH` mismatch — all logged in `launchplan_final.md` §6.9).
10 +
11 + Session 3 is the real-stakes one: prod was on 0.9.1 via `deploy.sh`, `/opt/makenotwork/` had eight months of accreted state (885M of backups, .env, yara-rules, ssh dir, rustdoc, sudoers entries, cron jobs, Caddyfile references). The Session 1 plan enumerated some of the move sequence but understated the surface area; the actual cutover surfaced several things worth documenting so the next major reprovision (or a disaster-recovery rebuild) doesn't re-discover them.
12 +
13 + ## A. Inventory taken before any prod write
14 +
15 + `/opt/makenotwork/` contents (`makenotwork:makenotwork` unless noted):
16 +
17 + - `makenotwork`, `mnw-admin` — 0.9.1 binaries (`root:root`)
18 + - `.env` (110 lines), 5× `.env.bak.*` files (`root:root`)
19 + - `docs/`, `static/`, `error-pages/` — content (will be replaced by release bundle)
20 + - `backups/` — 885M
21 + - `yara-rules/` — 8.5M compiled, `root:root`
22 + - `yara-rules-src/` — upstream YARA sources (compiled to `yara-rules/`), `root:root`
23 + - `rustdoc/` — generated docs, `501:staff` (uploaded from Mac via `deploy.sh`)
24 + - `ssh/` — `known_hosts` for build runner, `root:root`
25 + - `backup-db.sh` — cron'd daily at 03:00 UTC from `makenotwork`'s crontab
26 + - `deploy/` — `deploy.sh` staging area, `root:root`
27 +
28 + Other prod state in play:
29 +
30 + - `/opt/git/` — 99M, `git:git`. Both the git user's home (`/etc/passwd` says `git:x:995:986::/opt/git:/bin/sh`) *and* the GIT_REPOS_PATH target. Conflating these turns out to matter (§D, §G.3).
31 + - `/etc/caddy/Caddyfile` — three `root * /opt/makenotwork/error-pages` lines.
32 + - `/etc/sudoers.d/mnw-git-ssh` — `makenotwork ALL=(git) NOPASSWD: /opt/makenotwork/mnw-admin rebuild-keys`.
33 + - `/etc/sudoers.d/mnw-cli-git` — `mnw-cli ALL=(git) NOPASSWD: /usr/bin/git-*, /usr/bin/tee, /usr/bin/chmod`. No /opt path references; left alone.
34 + - `makenotwork` user crontab: `0 3 * * * /opt/makenotwork/backup-db.sh >> /opt/makenotwork/backups/backup.log 2>&1`.
35 + - Root crontab: `0 3 * * * /opt/backups/pg_backup.sh >> /var/log/pg_backup.log 2>&1` — unrelated, left alone.
36 +
37 + ## B. Pre-flight (no prod impact)
38 +
39 + 1. **`sando.toml` tier B fixed.** Was `deploy@prod-1.makenot.work` (NXDOMAIN, no port). Now `makenotwork@alpha-west-1` with port handling via `~sando/.ssh/config` Host block. Chose to keep service user as `makenotwork` rather than introduce a `deploy` user — avoids chowning 885M of backups and redoing pg peer auth that's been stable for months. The same reasoning applies to a hypothetical tier C: keep the existing user, don't introduce a new one for cosmetic uniformity with testnot.
40 + 2. **Sando pubkey installed** in `/home/makenotwork/.ssh/authorized_keys` (mode 0600, owned makenotwork).
41 + 3. **`chsh -s /bin/bash makenotwork`** — the login shell was `/usr/sbin/nologin`, so SSH sessions were being rejected by the shell; key auth itself was not failing. Worth detecting/fixing in `bootstrap-node.sh` for future provisions where someone has hardened the runtime user.
42 + 4. **`/srv/sando/.ssh/config`** Host block for port 2200; `known_hosts` seeded via `ssh-keyscan -p 2200`.
43 + 5. **Dry-run rsync** from sando → prod's `/opt/mnw/releases/_probe/` succeeded (after `bootstrap-node.sh` created `/opt/mnw/`).
44 +
45 + ## C. Cutover sequence (3m25s outage)
46 +
47 + In order, with the exact reason each step exists:
48 +
49 + 1. **`systemctl stop makenotwork`** — 02:50:33 UTC. Outage window starts.
50 + 2. **Backups taken**: `/etc/systemd/system/makenotwork.service → /root/makenotwork.service.bak-pre-cutover`; `/opt/makenotwork/.env → /root/dotenv.bak-pre-cutover`; Caddyfile, sudoers, crontab also backed up to `/root/*.bak-pre-cutover`. Rollback path for any step failing before service restart.
51 + 3. **`bootstrap-node.sh`** with `SERVICE_USER=makenotwork SANDO_PUBKEY=… INSTALL_POSTGRES=0 INSTALL_CADDY=0 INSTALL_TAILSCALE=0 ENABLE_FIREWALL=0` — postgres/caddy/tailscale/UFW already configured on prod, don't touch. Created `/opt/mnw/`, `/etc/mnw/`, `/var/lib/mnw/`, the new systemd unit, the unused `deploy` user (harmless), the sudoers entry for `deploy`. The new unit references `EnvironmentFile=/etc/mnw/makenotwork.env` and `ReadWritePaths=/var/lib/mnw`, with `RestartPreventExitStatus=2` (MNW server convention: exit 2 = migration failure, don't crashloop).
52 + 4. **`cp /opt/makenotwork/.env /etc/mnw/makenotwork.env`** (copy, not move — original stays for one-week rollback). Ownership and mode set to `root:makenotwork`, 0640 (`chown root:makenotwork`, then `chmod 0640`). Then `sed` rewrites of `DOCS_PATH`, `ASSUMPTIONS_PATH`, `YARA_RULES_DIR`, `GIT_REPOS_PATH` for the new layout. `HOST`, `PORT`, `DATABASE_URL`, `HOST_URL` unchanged.
53 + 5. **`ln -s /opt/makenotwork/yara-rules /opt/mnw/yara-rules`** — yara-rules is operator-managed (independent update cadence), not in the release bundle (Session 1 layout principle: category #3). The symlink lets the new env's `YARA_RULES_DIR=/opt/mnw/yara-rules` continue to resolve. When `/opt/makenotwork/` is eventually removed, the rules dir moves to a permanent path (probably `/var/lib/mnw/yara-rules` or `/etc/mnw/yara-rules`) and the symlink retargets.
54 + 6. **`rsync -aHX /opt/git/ /var/lib/mnw/git/`** — preserves `git:git` ownership, hard links (`-H`), and extended attributes (`-X`). `chmod 0755 /var/lib/mnw` so the git user can traverse (default was 0750 makenotwork:makenotwork, which blocked the git user's `git-receive-pack` from reaching the repos).
55 + 7. **Caddyfile rewrite**: `sed -i 's|/opt/makenotwork/error-pages|/opt/mnw/current/error-pages|g'`. `caddy validate` before reload; `systemctl reload caddy`.
56 + 8. **Sudoers rewrite**: same sed pattern on `/etc/sudoers.d/mnw-git-ssh`; `visudo -c -f` to validate.
57 + 9. **`systemctl daemon-reload`** to pick up the new unit.
58 + 10. **`systemctl restart sandod`** on pop-os — sandod caches `sando.toml` at startup; the new tier B target wouldn't have taken effect without this. **First `POST /promote/b` failed with NXDOMAIN against the stale `prod-1.makenot.work` because sandod hadn't been restarted yet.** Fixed by restarting sandod and re-promoting.
59 + 11. **`POST /promote/b {"hotfix":true}`** — `hotfix: true` bypasses the 48h burn-in on tier A (which had just promoted to 0.9.5 ~15 min prior; burn-in not yet elapsed). Sando rsync'd the 161MB bundle to `/opt/mnw/releases/0.9.5/`, swapped the `current` symlink, called `systemctl reload-or-restart makenotwork.service`.
60 + 12. **Service up 02:53:55 UTC.** Outage window ends 02:53:58 once health serves 200. 733 YARA rules compiled, all integrations (S3, Stripe, MT, WAM, git, scanner, custom domain cache) live.
61 + 13. **External smoke checks**: `/`, `/login`, `/pricing`, `/docs`, `/docs/economics`, `/docs/roadmap`, `/docs/tiers` — all 200.
62 + 14. **`rebuild-keys` to regenerate `/opt/git/.ssh/authorized_keys`** — `dotenvy` doesn't auto-load when running mnw-admin standalone (it loads from `/opt/makenotwork/.env`, mode 0600 `makenotwork:makenotwork`, unreadable by git). Worked around by sourcing the env in root then `sudo -u git -E`. **Regenerated keys still contain `command="/opt/makenotwork/mnw-admin git-auth ..."`** — see §G.
63 + 15. **Git push test** — `git ls-remote git@ssh.makenot.work:max/meta.git` returns refs cleanly. Cutover verified end-to-end.
64 +
65 + ## D. What stayed in place (intentional)
66 +
67 + - `/opt/makenotwork/` — full contents, untouched. Soak rollback path: stop new unit, swap systemd unit back, start old binary. Plan: `rm -rf` after a week, post-0.9.6 deploy (see §G).
68 + - `/opt/git/` — untouched. Git user's `/etc/passwd` home; mnw-admin's regenerated `authorized_keys` writes to `/opt/git/.ssh/authorized_keys` (not `/home/git/`, despite earlier confusion). The rsync to `/var/lib/mnw/git/` populated the new GIT_REPOS_PATH; the server reads from there, but git push lands in `/opt/git/` because that's git user's home. Both paths now hold the repo bytes; that's wasteful, and the two copies diverge as soon as someone pushes, so it is tolerable only for the soak window (see §G.3 for the fix).
69 + - `/opt/makenotwork/backups/` — 885M of pg dumps. Script and cron still write there. Sando's backup-fetch on pop-os still pulls from there (configured pre-cutover). Migration to `/var/lib/mnw/backups/` is its own follow-up (touches script, crontab, pop-os sando config).
70 + - `yara-rules-src/`, `rustdoc/`, `ssh/`, `.env.bak.*` — not in any env var or systemd path. Confirmed by grepping the running 0.9.5 binary's path references. Will be swept in the post-soak cleanup.
71 +
72 + ## E. What broke and how it was caught
73 +
74 + Three small things, all caught by smoke checks:
75 +
76 + 1. **`sandod` cached `sando.toml`.** First promote attempt returned `creating remote release dir` (an in-flight progress string that became the error message). `journalctl -u sandod` showed it was still resolving `prod-1.makenot.work`. `scp sando.toml pop-os:/tmp/`, `sudo cp /tmp/sando.toml /etc/sando/sando.toml`, `sudo systemctl restart sandod`, re-promote. Worth documenting that `sandod` does not watch the file; the alternative is to add an inotify watcher or a SIGHUP reload handler.
77 + 2. **First doc smoke checks were wrong URLs.** `/about/economics`, `/docs/about/economics` returned 404; panicked briefly that the cutover broke doc routing. False alarm: the route is `/docs/{slug}` where slug is the filename stem (e.g., `/docs/economics`). Verified with `grep doc_page MNW/server/src/` after the panic. **Worth fixing in any future smoke script** — use the real URL scheme, not guessed-from-filesystem paths.
78 + 3. **`mnw-admin rebuild-keys` needed env loading from root context.** `sudo -u git /opt/mnw/current/mnw-admin rebuild-keys` fails with `DATABASE_URL must be set: NotPresent` because the binary's `dotenvy::from_path("/opt/makenotwork/.env")` runs as git, which can't read `.env` (mode 0600 makenotwork). Workaround: `set -a; source /etc/mnw/makenotwork.env; set +a; sudo -u git -E /opt/mnw/current/mnw-admin rebuild-keys`. Cleanest long-term fix is in §G.1.
79 +
80 + ## F. Outcomes (verified)
81 +
82 + **Sando state after cutover:**
83 +
84 + ```
85 + host cur=0.9.5 prev=0.9.5 burn_in_started=2026-06-03T02:23:28Z
86 + a cur=0.9.5 prev=0.8.12 burn_in_started=2026-06-03T02:38:57Z
87 + b cur=0.9.5 prev=None burn_in_started=2026-06-03T02:53:56Z
88 + c not provisioned
89 + ```
90 +
91 + **Prod externally:**
92 + - `https://makenot.work/api/health` → `{"status":"operational","version":"0.9.5","checks":{"database":true}}`.
93 + - `/`, `/login`, `/pricing`, `/docs`, `/docs/economics`, `/docs/roadmap`, `/docs/tiers` → 200.
94 + - Git: `git ls-remote git@ssh.makenot.work:max/meta.git` → returns refs.
95 +
96 + **Prod internally:**
97 + - `systemctl status makenotwork` → active, PID 3123111, listening 0.0.0.0:3000.
98 + - 733 YARA rules compiled from `/opt/mnw/yara-rules` (symlink).
99 + - All integrations enabled per startup log: `s3=true, synckit_s3=false, stripe=true, scanner=true, mt=true, wam=true, git=true`.
100 +
101 + **deploy.sh path retained.** Not retired; remains as break-glass per `feedback_prefer_sando_over_deploy_sh` (sando is preferred *default*; deploy.sh stays runnable for outages where sando host is down).
102 +
103 + ## G. Open follow-ups
104 +
105 + ### G.1 The hardcoded `/opt/makenotwork/` paths (blocks the cleanup milestone)
106 +
107 + Session 1 outcomes claimed "`command=` prefixes auto-update on the first post-migration `rebuild-keys` run." That's wrong — confirmed during step 14. The path is a `const` in the binary, not pulled from env. Four sites need lifting before `/opt/makenotwork/` can be removed:
108 +
109 + | File | Line | Current value | Target |
110 + |---|---|---|---|
111 + | `server/src/git_ssh.rs` | 15 | `const MNW_ADMIN_PATH: &str = "/opt/makenotwork/mnw-admin"` | `/opt/mnw/current/mnw-admin` |
112 + | `server/src/bin/mnw-admin.rs` | 122 | `dotenvy::from_path("/opt/makenotwork/.env")` | `/etc/mnw/makenotwork.env` |
113 + | `server/src/build_runner.rs` | 467 | `const BUILD_SSH_KNOWN_HOSTS: &str = "/opt/makenotwork/ssh/known_hosts"` | `/etc/mnw/known_hosts` (or delete if dead — verify usage first) |
114 + | `server/src/routes/api/ssh_keys.rs` | 165 | `args(["-u", "git", "/opt/makenotwork/mnw-admin", "rebuild-keys"])` | `/opt/mnw/current/mnw-admin` |
115 +
116 + Ship as 0.9.6. Cleanup sequence after: deploy 0.9.6 via sando → `rebuild-keys` once (regenerates `authorized_keys` with new path in command=) → soak one week → `rm -rf /opt/makenotwork/`.
117 +
118 + ### G.2 The backups dir migration
119 +
120 + Independent of G.1. Touches:
121 + - `server/deploy/backup-db.sh` — hardcoded `BACKUP_DIR="/opt/makenotwork/backups"` near top.
122 + - `makenotwork` user crontab on prod.
123 + - Sando's `backup.source` URL on pop-os (currently pulls from `/opt/makenotwork/backups/latest.sql.gz` via rrsync).
124 +
125 + Easiest order: copy the existing 885M dir to `/var/lib/mnw/backups/`, edit script + crontab + sando config in one window, retire `/opt/makenotwork/backups/` after one successful daily backup lands in the new location and sando confirms it pulled cleanly.
126 +
127 + ### G.3 The `/opt/git` vs `/var/lib/mnw/git` duality
128 +
129 + Both directories currently hold the same repos. Git pushes land in `/opt/git/` (git user's home from `/etc/passwd`). Server reads from `/var/lib/mnw/git/` (GIT_REPOS_PATH). They drift the moment someone pushes.
130 +
131 + Two ways out:
132 + - (a) `usermod -d /var/lib/mnw/git git` to make git's home match GIT_REPOS_PATH. Single source of truth. Risk: any cron / script that reads git's home (none I found, but worth grepping) breaks.
133 + - (b) Revert GIT_REPOS_PATH to `/opt/git/`. Avoids the move but locks the path forever and reverts a piece of Session 1's FHS migration.
134 +
135 + (a) is the right answer. Do it during the post-0.9.6 soak window.
136 +
137 + ### G.4 `bootstrap-node.sh` polish
138 +
139 + From this cutover and Session 2:
140 +
141 + - **Detect `nologin` shell** on `SERVICE_USER` and refuse with a clear error (or auto-`chsh`). Costs ~1 min of cutover time if you don't know to check.
142 + - **Sibling `bootstrap-node-postgres.sh`** for the common pg_ident map case (when SERVICE_USER ≠ pg role name). Or document the manual steps in the script's "next steps" output.
143 + - **README-postgres.md note** on the sqlx URL form: `postgres:///db?host=/var/run/postgresql&user=name`, not `postgres://user@/db?host=...`.
144 +
145 + ### G.5 `ASSUMPTIONS_PATH` mismatch
146 +
147 + `sando-daemon.toml` puts the file at `<release>/docs/assumptions.toml`; prod's pre-existing env expected `<release>/docs/business/assumptions.toml` (matching the source layout `server/docs/business/assumptions.toml`). Worked around with an env edit during cutover but both prod and testnot now have non-canonical `ASSUMPTIONS_PATH=/opt/mnw/current/docs/assumptions.toml`. Fix: change `release_contents[3].dst` in `sando-daemon.toml` to `docs/business/assumptions.toml` and revert the env path on both nodes. Small, do it during the 0.9.6 sprint.
148 +
149 + ## H. Key paths (for orientation)
150 +
151 + - `MNW/sando/sando.toml` — tier B definition (`makenotwork@alpha-west-1`).
152 + - `MNW/sando/deploy/bootstrap-node.sh` — node-bootstrap; ran on prod with `SERVICE_USER=makenotwork`.
153 + - `MNW/sando/daemon/sando-daemon.toml` — release_contents (note §G.5 ASSUMPTIONS_PATH mismatch).
154 + - `MNW/server/src/{git_ssh.rs, build_runner.rs, bin/mnw-admin.rs, routes/api/ssh_keys.rs}` — the four hardcoded path sites.
155 + - `MNW/server/deploy/backup-db.sh` — hardcoded backup dir.
156 + - `/etc/systemd/system/makenotwork.service` (prod) — new FHS unit.
157 + - `/etc/mnw/makenotwork.env` (prod) — new env file location.
158 + - `/etc/sudoers.d/mnw-git-ssh` (prod) — updated to `/opt/mnw/current/mnw-admin`.
159 + - `/etc/caddy/Caddyfile` (prod) — three error-pages refs updated.
160 + - `/opt/makenotwork/` (prod) — full pre-cutover state, kept for soak rollback.
161 + - `launchplan_final.md` §6.5 step 8 — original plan this session closes.
162 + - `launchplan_final.md` §6.9 — Session 2/3 gotchas summary.
163 + - `launchplan_final.md` §7 — 0.9.6 path-decoupling spec.
@@ -61,8 +61,12 @@ gates = [
61 61 ]
62 62 [[tier.node]]
63 63 name = "prod-1"
64 - ssh_target = "deploy@prod-1.makenot.work"
64 + # Tailnet name; port 2200 supplied via /srv/sando/.ssh/config Host block.
65 + # Service user is "makenotwork" (pre-existing on prod), not "deploy" — chose
66 + # not to chown 885M of backups + redo postgres peer auth for a cosmetic rename.
67 + ssh_target = "makenotwork@alpha-west-1"
65 68 release_root = "/opt/mnw"
69 + service_name = "makenotwork.service"
66 70
67 71 # ---- C: prod-2 (declared, not yet provisioned) ----
68 72 [[tier]]
M sando/tui/Cargo.lock +1294 -35
M sando/tui/src/main.rs +287 -130