max / makenotwork
23 files changed,
+3852 insertions,
-375 deletions
| @@ -1623,6 +1623,7 @@ dependencies = [ | |||
| 1623 | 1623 | "metrics", | |
| 1624 | 1624 | "metrics-exporter-prometheus", | |
| 1625 | 1625 | "reqwest", | |
| 1626 | + | "semver", | |
| 1626 | 1627 | "serde", | |
| 1627 | 1628 | "serde_json", | |
| 1628 | 1629 | "sqlx", | |
| @@ -1648,6 +1649,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" | |||
| 1648 | 1649 | checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" | |
| 1649 | 1650 | ||
| 1650 | 1651 | [[package]] | |
| 1652 | + | name = "semver" | |
| 1653 | + | version = "1.0.28" | |
| 1654 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 1655 | + | checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" | |
| 1656 | + | dependencies = [ | |
| 1657 | + | "serde", | |
| 1658 | + | "serde_core", | |
| 1659 | + | ] | |
| 1660 | + | ||
| 1661 | + | [[package]] | |
| 1651 | 1662 | name = "serde" | |
| 1652 | 1663 | version = "1.0.228" | |
| 1653 | 1664 | source = "registry+https://github.com/rust-lang/crates.io-index" |
| @@ -1,6 +1,6 @@ | |||
| 1 | 1 | [package] | |
| 2 | 2 | name = "sando-daemon" | |
| 3 | - | version = "0.1.0" | |
| 3 | + | version = "0.2.0" | |
| 4 | 4 | edition = "2024" | |
| 5 | 5 | license = "MIT" | |
| 6 | 6 | ||
| @@ -22,6 +22,7 @@ metrics-exporter-prometheus = { version = "0.18.1", default-features = false } | |||
| 22 | 22 | anyhow = "1.0.102" | |
| 23 | 23 | thiserror = "2.0.18" | |
| 24 | 24 | chrono = { version = "0.4", features = ["serde"] } | |
| 25 | + | semver = { version = "1.0", features = ["serde"] } | |
| 25 | 26 | ||
| 26 | 27 | [dev-dependencies] | |
| 27 | 28 | tempfile = "3.20" |
| @@ -0,0 +1,116 @@ | |||
| 1 | + | -- Typed gate outcomes. See `plans/observability.md`. | |
| 2 | + | -- | |
| 3 | + | -- Adds three columns to `gate_runs`: | |
| 4 | + | -- status — 'passed' | 'failed' | 'blocked' | NULL while in-flight | |
| 5 | + | -- outcome_json — serialized GateOutcome (the source of truth post-migration) | |
| 6 | + | -- log_ref — relative path under cfg.logs_root to the stdout/stderr capture | |
| 7 | + | -- | |
| 8 | + | -- `passed` and `detail` are retained as SHADOW columns for one release. | |
| 9 | + | -- The runner double-writes them so any out-of-process consumer that still | |
| 10 | + | -- reads the old schema (or any rollback to a pre-003 binary) keeps working. | |
| 11 | + | -- A later migration will drop them. | |
| 12 | + | -- | |
| 13 | + | -- Backfill rules for legacy rows: | |
| 14 | + | -- passed = 1 → status='passed', outcome_json=passed/legacy{text=detail} | |
| 15 | + | -- passed IS NULL → status=NULL (in-flight; runner will write) | |
| 16 | + | -- detail matches a known blocker form → status='blocked', outcome_json=blocked/<variant> | |
| 17 | + | -- anything else → status='failed', outcome_json=failed/unclassified{legacy_detail=detail} | |
| 18 | + | -- | |
| 19 | + | -- The blocker patterns reproduce the exact prose the pre-typed runner wrote, | |
| 20 | + | -- which is stable across the history these rows came from. See | |
| 21 | + | -- `outcome::PassNote::summary` / `GateBlocker::summary` for the spelling. | |
| 22 | + | ||
| 23 | + | ALTER TABLE gate_runs ADD COLUMN status TEXT; | |
| 24 | + | ALTER TABLE gate_runs ADD COLUMN outcome_json TEXT; | |
| 25 | + | ALTER TABLE gate_runs ADD COLUMN log_ref TEXT; | |
| 26 | + | ||
| 27 | + | -- 1. In-flight rows (passed IS NULL): leave status/outcome_json NULL. | |
| 28 | + | -- 2. Passed rows: wrap legacy detail in PassNote::Legacy. | |
| 29 | + | UPDATE gate_runs | |
| 30 | + | SET status = 'passed', | |
| 31 | + | outcome_json = json_object( | |
| 32 | + | 'status', json_object( | |
| 33 | + | 'kind', 'passed', | |
| 34 | + | 'note', json_object('kind', 'legacy', 'text', COALESCE(detail, '')) | |
| 35 | + | ) | |
| 36 | + | ) | |
| 37 | + | WHERE passed = 1; | |
| 38 | + | ||
| 39 | + | -- 3. Failed-with-blocker prose. | |
| 40 | + | UPDATE gate_runs | |
| 41 | + | SET status = 'blocked', | |
| 42 | + | outcome_json = json_object( | |
| 43 | + | 'status', json_object( | |
| 44 | + | 'kind', 'blocked', | |
| 45 | + | 'blocker', json_object('kind', 'burn_in_clock_not_started') | |
| 46 | + | ) | |
| 47 | + | ) | |
| 48 | + | WHERE passed = 0 AND detail LIKE 'burn-in clock not started%'; | |
| 49 | + | ||
| 50 | + | UPDATE gate_runs | |
| 51 | + | SET status = 'blocked', | |
| 52 | + | outcome_json = json_object( | |
| 53 | + | 'status', json_object( | |
| 54 | + | 'kind', 'blocked', | |
| 55 | + | 'blocker', json_object('kind', 'scratch_db_url_unset') | |
| 56 | + | ) | |
| 57 | + | ) | |
| 58 | + | WHERE passed = 0 AND detail LIKE 'scratch_db_url unset%'; | |
| 59 | + | ||
| 60 | + | UPDATE gate_runs | |
| 61 | + | SET status = 'blocked', | |
| 62 | + | outcome_json = json_object( | |
| 63 | + | 'status', json_object( | |
| 64 | + | 'kind', 'blocked', | |
| 65 | + | 'blocker', json_object('kind', 'no_backup_available') | |
| 66 | + | ) | |
| 67 | + | ) | |
| 68 | + | WHERE passed = 0 AND detail LIKE 'no backup fetched%'; | |
| 69 | + | ||
| 70 | + | UPDATE gate_runs | |
| 71 | + | SET status = 'blocked', | |
| 72 | + | outcome_json = json_object( | |
| 73 | + | 'status', json_object( | |
| 74 | + | 'kind', 'blocked', | |
| 75 | + | 'blocker', json_object('kind', 'awaiting_operator_confirmation') | |
| 76 | + | ) | |
| 77 | + | ) | |
| 78 | + | WHERE passed = 0 AND detail LIKE 'waiting on operator%'; | |
| 79 | + | ||
| 80 | + | UPDATE gate_runs | |
| 81 | + | SET status = 'blocked', | |
| 82 | + | outcome_json = json_object( | |
| 83 | + | 'status', json_object( | |
| 84 | + | 'kind', 'blocked', | |
| 85 | + | 'blocker', json_object('kind', 'artifact_missing', 'version', '?') | |
| 86 | + | ) | |
| 87 | + | ) | |
| 88 | + | WHERE passed = 0 AND detail LIKE 'no artifact for version%'; | |
| 89 | + | ||
| 90 | + | -- "N hours remaining of M" — extracting N and M would need a regexp, but | |
| 91 | + | -- SQLite's REGEXP support is optional. Settle for placeholder zeros; the typed | |
| 92 | + | -- runner will overwrite them with real values on next run. status='blocked' is still load-bearing. | |
| 93 | + | UPDATE gate_runs | |
| 94 | + | SET status = 'blocked', | |
| 95 | + | outcome_json = json_object( | |
| 96 | + | 'status', json_object( | |
| 97 | + | 'kind', 'blocked', | |
| 98 | + | 'blocker', json_object('kind', 'burn_in_remaining', | |
| 99 | + | 'hours_remaining', 0, | |
| 100 | + | 'hours_total', 0) | |
| 101 | + | ) | |
| 102 | + | ) | |
| 103 | + | WHERE passed = 0 AND detail LIKE '% hours remaining of %'; | |
| 104 | + | ||
| 105 | + | -- 4. Everything else with passed=0 is genuine failure, unclassified. | |
| 106 | + | UPDATE gate_runs | |
| 107 | + | SET status = 'failed', | |
| 108 | + | outcome_json = json_object( | |
| 109 | + | 'status', json_object( | |
| 110 | + | 'kind', 'failed', | |
| 111 | + | 'failure', json_object('kind', 'unclassified', 'legacy_detail', COALESCE(detail, '')) | |
| 112 | + | ) | |
| 113 | + | ) | |
| 114 | + | WHERE passed = 0 AND status IS NULL; | |
| 115 | + | ||
| 116 | + | CREATE INDEX gate_runs_status ON gate_runs(status); |
| @@ -0,0 +1,16 @@ | |||
| 1 | + | -- Typed deploy outcomes. See `plans/observability.md` step 7. | |
| 2 | + | -- | |
| 3 | + | -- Mirrors migration 003 for the `deploys` table: add `outcome_json` | |
| 4 | + | -- carrying the full `DeployOutcome` (and `DeployFailureKind` when the | |
| 5 | + | -- status is `failed`). The legacy `outcome TEXT` column stays as the | |
| 6 | + | -- high-level status word ('in_progress' | 'ok' | 'failed'), since | |
| 7 | + | -- nothing in the daemon's read path depends on the failure-kind | |
| 8 | + | -- structure today — operators see it through the WS event. | |
| 9 | + | -- | |
| 10 | + | -- No backfill: historical deploys with `outcome='failed'` keep a NULL | |
| 11 | + | -- outcome_json, because their freeform `error` string only ever rode the | |
| 12 | + | -- WS event — nothing persisted it. For future failed deploys, the runner | |
| 13 | + | -- populates outcome_json alongside outcome. | |
| 14 | + | ||
| 15 | + | ALTER TABLE deploys ADD COLUMN outcome_json TEXT; | |
| 16 | + | CREATE INDEX deploys_by_outcome ON deploys(outcome); |
| @@ -6,6 +6,7 @@ | |||
| 6 | 6 | ||
| 7 | 7 | use crate::config::Config; | |
| 8 | 8 | use crate::deploy; | |
| 9 | + | use crate::domain::{GitSha, TierId, Version}; | |
| 9 | 10 | use crate::gates::{self, GateCtx}; | |
| 10 | 11 | use crate::git; | |
| 11 | 12 | use crate::topology::Topology; | |
| @@ -18,8 +19,8 @@ use tokio::process::Command; | |||
| 18 | 19 | ||
| 19 | 20 | #[derive(Debug, Clone)] | |
| 20 | 21 | pub struct BuildArtifact { | |
| 21 | - | pub version: String, | |
| 22 | - | pub git_sha: String, | |
| 22 | + | pub version: Version, | |
| 23 | + | pub git_sha: GitSha, | |
| 23 | 24 | pub worktree: PathBuf, | |
| 24 | 25 | /// One entry per `cfg.bin_names` in declared order. First is the primary | |
| 25 | 26 | /// (referenced by the systemd unit's ExecStart). Paths are inside the | |
| @@ -31,12 +32,12 @@ pub async fn run( | |||
| 31 | 32 | pool: SqlitePool, | |
| 32 | 33 | cfg: Arc<Config>, | |
| 33 | 34 | topo: Arc<Topology>, | |
| 34 | - | sha: String, | |
| 35 | + | sha: GitSha, | |
| 35 | 36 | events: crate::events::EventTx, | |
| 36 | 37 | ) -> Result<BuildArtifact> { | |
| 37 | - | let worktree = cfg.workdir.join(&sha); | |
| 38 | + | let worktree = cfg.workdir.join(sha.as_str()); | |
| 38 | 39 | let bare = PathBuf::from(&topo.repo.bare_path); | |
| 39 | - | git::checkout_worktree(&bare, &sha, &worktree).await?; | |
| 40 | + | git::checkout_worktree(&bare, sha.as_str(), &worktree).await?; | |
| 40 | 41 | ||
| 41 | 42 | let server_dir = worktree.join("server"); | |
| 42 | 43 | let version = read_pkg_version(&server_dir.join("Cargo.toml")).await | |
| @@ -53,7 +54,7 @@ pub async fn run( | |||
| 53 | 54 | .current_dir(&server_dir) | |
| 54 | 55 | .kill_on_drop(true); | |
| 55 | 56 | if let Some(scratch_url) = cfg.scratch_db_url.as_deref() { | |
| 56 | - | tracing::info!(sha = %sha, "preparing scratch DB schema for sqlx compile-time checks"); | |
| 57 | + | tracing::info!(sha = %sha.as_str(), "preparing scratch DB schema for sqlx compile-time checks"); | |
| 57 | 58 | crate::gates::reset_scratch(scratch_url).await | |
| 58 | 59 | .context("scratch DB reset before build")?; | |
| 59 | 60 | crate::gates::run_migrator(scratch_url, &server_dir.join("migrations")).await | |
| @@ -124,7 +125,7 @@ pub async fn build_and_run_host( | |||
| 124 | 125 | pool: SqlitePool, | |
| 125 | 126 | cfg: Arc<Config>, | |
| 126 | 127 | topo: Arc<Topology>, | |
| 127 | - | sha: String, | |
| 128 | + | sha: GitSha, | |
| 128 | 129 | events: crate::events::EventTx, | |
| 129 | 130 | ) -> Result<()> { | |
| 130 | 131 | let art = run(pool.clone(), cfg.clone(), topo.clone(), sha, events.clone()).await?; | |
| @@ -149,13 +150,13 @@ pub async fn build_and_run_host( | |||
| 149 | 150 | .execute(&pool) | |
| 150 | 151 | .await?; | |
| 151 | 152 | ||
| 152 | - | let host = topo.tiers.iter().find(|t| t.name == "host") | |
| 153 | + | let host = topo.tiers.iter().find(|t| t.name.as_str() == "host") | |
| 153 | 154 | .context("topology has no `host` tier")?; | |
| 154 | 155 | ||
| 155 | 156 | let ctx = GateCtx { | |
| 156 | 157 | pool: pool.clone(), | |
| 157 | 158 | cfg: cfg.clone(), | |
| 158 | - | tier: "host".to_string(), | |
| 159 | + | tier: TierId::new("host"), | |
| 159 | 160 | version: art.version.clone(), | |
| 160 | 161 | worktree: art.worktree.clone(), | |
| 161 | 162 | events: events.clone(), | |
| @@ -183,7 +184,7 @@ pub async fn build_and_run_host( | |||
| 183 | 184 | Ok(()) | |
| 184 | 185 | } | |
| 185 | 186 | ||
| 186 | - | async fn read_pkg_version(cargo_toml: &Path) -> Result<String> { | |
| 187 | + | async fn read_pkg_version(cargo_toml: &Path) -> Result<Version> { | |
| 187 | 188 | let raw = tokio::fs::read_to_string(cargo_toml).await?; | |
| 188 | 189 | let parsed: toml::Value = toml::from_str(&raw)?; | |
| 189 | 190 | let v = parsed | |
| @@ -191,7 +192,7 @@ async fn read_pkg_version(cargo_toml: &Path) -> Result<String> { | |||
| 191 | 192 | .and_then(|p| p.get("version")) | |
| 192 | 193 | .and_then(|v| v.as_str()) | |
| 193 | 194 | .context("package.version not found")?; | |
| 194 | - | Ok(v.to_string()) | |
| 195 | + | Version::parse(v).with_context(|| format!("parsing package.version `{v}`")) | |
| 195 | 196 | } | |
| 196 | 197 | ||
| 197 | 198 | fn tail(buf: &[u8], max: usize) -> String { |
| @@ -0,0 +1,359 @@ | |||
| 1 | + | //! Gate-output classifiers. | |
| 2 | + | //! | |
| 3 | + | //! Each `classify_*` function takes the raw signals produced by a gate | |
| 4 | + | //! runner or deploy step (exit status, stdout/stderr tails, error strings) | |
| 5 | + | //! and maps them to a typed `GateFailure` or `DeployFailureKind`. Anything | |
| 6 | + | //! that doesn't match a known pattern returns the `Unclassified` variant with | |
| 7 | + | //! the original detail attached — the on-disk gate log is the ultimate fallback. | |
| 8 | + | //! | |
| 9 | + | //! Classifiers are pure functions: no IO, no async. That makes them | |
| 10 | + | //! fixture-testable, and it keeps the `gates.rs` runner code in charge | |
| 11 | + | //! of side effects (process spawning, log persistence). | |
| 12 | + | ||
| 13 | + | use crate::outcome::GateFailure; | |
| 14 | + | ||
| 15 | + | /// `cargo_test`: derive a `CargoTest` failure with whatever counts can | |
| 16 | + | /// be lifted out of the test runner's output. | |
| 17 | + | /// | |
| 18 | + | /// libtest emits a `test result: FAILED. P passed; F failed; ...` line | |
| 19 | + | /// near the end of stdout. We grab `F` from that. If the output never | |
| 20 | + | /// reached that line (compile error, runtime panic in the harness), we | |
| 21 | + | /// fall through to `Unclassified`. | |
| 22 | + | pub fn classify_cargo_test(stdout: &[u8], stderr: &[u8]) -> GateFailure { | |
| 23 | + | let stdout_s = String::from_utf8_lossy(stdout); | |
| 24 | + | ||
| 25 | + | let mut failed_count: u32 = 0; | |
| 26 | + | let mut first_failed: Option<String> = None; | |
| 27 | + | ||
| 28 | + | // `test result: FAILED. P passed; F failed; ...` lives near the | |
| 29 | + | // end. Walk backwards to find it cheaply on very large outputs. | |
| 30 | + | for line in stdout_s.lines().rev().take(50) { | |
| 31 | + | if let Some(rest) = line.strip_prefix("test result: FAILED.") { | |
| 32 | + | // Expect "P passed; F failed; ..." | |
| 33 | + | for piece in rest.split(';') { | |
| 34 | + | let p = piece.trim(); | |
| 35 | + | if let Some(num_str) = p.strip_suffix(" failed") { | |
| 36 | + | if let Ok(n) = num_str.parse::<u32>() { | |
| 37 | + | failed_count = n; | |
| 38 | + | } | |
| 39 | + | } | |
| 40 | + | } | |
| 41 | + | break; | |
| 42 | + | } | |
| 43 | + | } | |
| 44 | + | ||
| 45 | + | // libtest prints "failures:\n foo::bar" near the end too. Grab | |
| 46 | + | // the first one for the summary line. | |
| 47 | + | if let Some(idx) = stdout_s.find("\nfailures:\n") { | |
| 48 | + | for line in stdout_s[idx + 11..].lines() { | |
| 49 | + | let trimmed = line.trim(); | |
| 50 | + | if trimmed.is_empty() { break; } | |
| 51 | + | // The "failures:" block repeats — once with stdout per | |
| 52 | + | // failure, once as a plain name list. Either way the first | |
| 53 | + | // non-empty line is a candidate. | |
| 54 | + | first_failed = Some(trimmed.to_string()); | |
| 55 | + | break; | |
| 56 | + | } | |
| 57 | + | } | |
| 58 | + | ||
| 59 | + | if failed_count == 0 && first_failed.is_none() { | |
| 60 | + | // Compile error or harness panic — no usable signal in stdout. | |
| 61 | + | return GateFailure::Unclassified { | |
| 62 | + | legacy_detail: Some(combined_tail_for_classifier(stdout, stderr)), | |
| 63 | + | }; | |
| 64 | + | } | |
| 65 | + | ||
| 66 | + | GateFailure::CargoTest { failed_count, first_failed } | |
| 67 | + | } | |
| 68 | + | ||
| 69 | + | /// `migration_dry_run` is staged: scratch reset → restore dump → run | |
| 70 | + | /// migrator. Each stage has its own failure mode. The caller (the gate | |
| 71 | + | /// runner) knows which stage tripped; classifiers here turn the stage's | |
| 72 | + | /// error string into a typed variant. | |
| 73 | + | /// | |
| 74 | + | /// Inputs are the migration name (when known) and the error string sqlx | |
| 75 | + | /// returned. `migration` defaults to "?" when sqlx couldn't tell us | |
| 76 | + | /// which file blew up. | |
| 77 | + | pub fn classify_migration_error(err: &str, migration_hint: Option<&str>) -> GateFailure { | |
| 78 | + | // sqlx::migrate::MigrateError variants are stringified consistently. | |
| 79 | + | // Examples from `plans/migration-dryrun-failures.md`: | |
| 80 | + | // "migration 47 was previously applied but is missing in the resolved migrations" | |
| 81 | + | // "migration 47 was previously applied but has been modified" | |
| 82 | + | // sqlx::Error::Database with sqlstate (e.g. "42P01" relation does not exist) | |
| 83 | + | ||
| 84 | + | if let Some(m) = extract_drift(err) { | |
| 85 | + | return GateFailure::MigrationDrift { migration: m }; | |
| 86 | + | } | |
| 87 | + | if let Some(m) = extract_modified(err) { | |
| 88 | + | return GateFailure::MigrationModified { migration: m }; | |
| 89 | + | } | |
| 90 | + | let sqlstate = extract_sqlstate(err); | |
| 91 | + | let migration = migration_hint.map(str::to_owned).unwrap_or_else(|| "?".to_owned()); | |
| 92 | + | if sqlstate.is_some() { | |
| 93 | + | return GateFailure::MigrationSqlError { migration, sqlstate }; | |
| 94 | + | } | |
| 95 | + | GateFailure::Unclassified { | |
| 96 | + | legacy_detail: Some(err.chars().take(4_000).collect()), | |
| 97 | + | } | |
| 98 | + | } | |
| 99 | + | ||
/// Migration named in a
/// "migration N was previously applied but is missing in the resolved migrations"
/// error, or `None` when `err` does not contain that phrase.
fn extract_drift(err: &str) -> Option<String> {
    word_before(err, " was previously applied but is missing")
}

/// Migration named in a
/// "migration N was previously applied but has been modified" error, or
/// `None` when `err` does not contain that phrase.
fn extract_modified(err: &str) -> Option<String> {
    word_before(err, " was previously applied but has been modified")
}

/// Returns the last space-separated word that precedes the first occurrence
/// of `marker` in `err` (the whole prefix when it contains no space).
fn word_before(err: &str, marker: &str) -> Option<String> {
    let head = &err[..err.find(marker)?];
    let word = match head.rsplit_once(' ') {
        Some((_, last)) => last,
        None => head,
    };
    Some(word.to_string())
}
| 114 | + | ||
/// Pulls the SQLSTATE out of a `... code: "42P01" ...` fragment.
///
/// Returns the text between the first `code: "` and the next `"`, or
/// `None` when either delimiter is missing.
fn extract_sqlstate(err: &str) -> Option<String> {
    let (_, after_open) = err.split_once("code: \"")?;
    let (sqlstate, _) = after_open.split_once('"')?;
    Some(sqlstate.to_string())
}
| 123 | + | ||
| 124 | + | /// `boot_smoke`: process exit info is the dominant signal. If the | |
| 125 | + | /// binary exited with a status during the smoke window, we map exit | |
| 126 | + | /// code 101 (Rust default for panic) to `BootPanic`, everything else | |
| 127 | + | /// to `BootExitedEarly`. If it never exited (stayed up), the caller | |
| 128 | + | /// constructs `PassNote::StayedUp` directly without consulting this. | |
| 129 | + | pub fn classify_boot_smoke(exit_code: Option<i32>) -> GateFailure { | |
| 130 | + | match exit_code { | |
| 131 | + | Some(101) => GateFailure::BootPanic { exit_code: Some(101) }, | |
| 132 | + | Some(c) if c < 0 => GateFailure::BootPanic { exit_code: Some(c) }, // killed by signal | |
| 133 | + | Some(c) => GateFailure::BootExitedEarly { exit_code: Some(c) }, | |
| 134 | + | None => GateFailure::BootExitedEarly { exit_code: None }, | |
| 135 | + | } | |
| 136 | + | } | |
| 137 | + | ||
| 138 | + | /// `Event::DeployFailed`: classify an anyhow chain produced by | |
| 139 | + | /// `deploy::deploy_node` into a typed `DeployFailureKind`. | |
| 140 | + | /// | |
| 141 | + | /// The anyhow chain is the `format!("{e:#}")` string the caller built, | |
| 142 | + | /// which joins each `.context(...)` layer with ": ". We probe for the | |
| 143 | + | /// contexts attached by `deploy_remote` (and well-known stderr patterns | |
| 144 | + | /// from ssh/rsync) in order of specificity. | |
| 145 | + | pub fn classify_deploy_error(err: &str) -> crate::outcome::DeployFailureKind { | |
| 146 | + | use crate::outcome::DeployFailureKind as K; | |
| 147 | + | ||
| 148 | + | // SSH-level transport failures bubble up under whatever context | |
| 149 | + | // their caller attached. Probe for the canonical OpenSSH stderr | |
| 150 | + | // patterns first so a "creating remote release dir: ... Connection | |
| 151 | + | // refused" doesn't get filed under NodeUnreachable's prose label. | |
| 152 | + | let unreachable_signals = [ | |
| 153 | + | "Connection refused", | |
| 154 | + | "Connection timed out", | |
| 155 | + | "Network is unreachable", | |
| 156 | + | "No route to host", | |
| 157 | + | "Could not resolve hostname", | |
| 158 | + | "Host key verification failed", | |
| 159 | + | "Permission denied (publickey", | |
| 160 | + | ]; | |
| 161 | + | if unreachable_signals.iter().any(|p| err.contains(p)) { | |
| 162 | + | return K::NodeUnreachable { detail: err.chars().take(400).collect() }; | |
| 163 | + | } | |
| 164 | + | ||
| 165 | + | // The contexts attached by `deploy_remote` (deploy.rs) are stable | |
| 166 | + | // strings; treat them as anchors. Order matters — "symlink swap + | |
| 167 | + | // systemctl" appears after a successful rsync, so probe rsync first | |
| 168 | + | // to avoid catching it under the swap heading. | |
| 169 | + | if err.contains("rsync failed") || err.contains("spawning rsync") { | |
| 170 | + | return K::RsyncFailed { detail: err.chars().take(400).collect() }; | |
| 171 | + | } | |
| 172 | + | if err.contains("creating remote release dir") { | |
| 173 | + | return K::NodeUnreachable { detail: err.chars().take(400).collect() }; | |
| 174 | + | } | |
| 175 | + | if err.contains("symlink swap + systemctl") { | |
| 176 | + | // Heuristic split inside the combined step: stderr containing | |
| 177 | + | // "systemctl" suggests the swap succeeded and the restart failed. | |
| 178 | + | if err.contains("systemctl") && !err.contains("ln:") { | |
| 179 | + | return K::ServiceRestartFailed { detail: err.chars().take(400).collect() }; | |
| 180 | + | } | |
| 181 | + | return K::SymlinkSwapFailed { detail: err.chars().take(400).collect() }; | |
| 182 | + | } | |
| 183 | + | if err.contains("symlink swap failed") { | |
| 184 | + | return K::SymlinkSwapFailed { detail: err.chars().take(400).collect() }; | |
| 185 | + | } | |
| 186 | + | ||
| 187 | + | K::Unclassified { detail: err.chars().take(400).collect() } | |
| 188 | + | } | |
| 189 | + | ||
/// Concatenate stdout + stderr tails the way the legacy runner did, so
/// `Unclassified.legacy_detail` looks like what operators are used to
/// seeing in `gate_runs.detail` today.
///
/// The layout is `==== stdout ====\n<stdout>\n==== stderr ====\n<stderr>`,
/// with a newline inserted after stdout when it does not already end in
/// one. The bytes are decoded lossily as UTF-8. If the result exceeds
/// 4,000 bytes, only the tail is kept (at most 4,000 bytes, prefixed with
/// `...`).
fn combined_tail_for_classifier(stdout: &[u8], stderr: &[u8]) -> String {
    /// Maximum number of bytes of decoded text kept after the `...` marker.
    const MAX_TAIL_BYTES: usize = 4_000;

    // 35 = two 17-byte headers plus the optional separator newline.
    let mut joined = Vec::with_capacity(stdout.len() + stderr.len() + 35);
    joined.extend_from_slice(b"==== stdout ====\n");
    joined.extend_from_slice(stdout);
    if !stdout.last().is_some_and(|b| *b == b'\n') {
        joined.push(b'\n');
    }
    joined.extend_from_slice(b"==== stderr ====\n");
    joined.extend_from_slice(stderr);

    let s = String::from_utf8_lossy(&joined);
    if s.len() <= MAX_TAIL_BYTES {
        return s.into_owned();
    }

    // A raw byte offset can land inside a multi-byte character (including
    // the U+FFFD replacements lossy decoding inserts), and slicing there
    // panics. Advance to the next character boundary; `s.len()` is always
    // a boundary, so this terminates.
    let mut start = s.len() - MAX_TAIL_BYTES;
    while !s.is_char_boundary(start) {
        start += 1;
    }
    format!("...{}", &s[start..])
}
| 203 | + | ||
| 204 | + | #[cfg(test)] | |
| 205 | + | mod tests { | |
| 206 | + | use super::*; | |
| 207 | + | ||
| 208 | + | #[test] | |
| 209 | + | fn cargo_test_extracts_failed_count() { | |
| 210 | + | let stdout = b"running 12 tests\n\ | |
| 211 | + | test foo ... ok\n\ | |
| 212 | + | test bar ... FAILED\n\ | |
| 213 | + | test baz ... FAILED\n\ | |
| 214 | + | \n\ | |
| 215 | + | failures:\n\ | |
| 216 | + | foo::bar\n\ | |
| 217 | + | foo::baz\n\ | |
| 218 | + | \n\ | |
| 219 | + | test result: FAILED. 10 passed; 2 failed; 0 ignored\n"; | |
| 220 | + | let GateFailure::CargoTest { failed_count, first_failed } = | |
| 221 | + | classify_cargo_test(stdout, b"") | |
| 222 | + | else { panic!("expected CargoTest variant"); }; | |
| 223 | + | assert_eq!(failed_count, 2); | |
| 224 | + | assert_eq!(first_failed.as_deref(), Some("foo::bar")); | |
| 225 | + | } | |
| 226 | + | ||
| 227 | + | #[test] | |
| 228 | + | fn cargo_test_compile_error_is_unclassified() { | |
| 229 | + | // No "test result:" line because cargo never got to running. | |
| 230 | + | let stderr = b"error[E0382]: borrow of moved value: `x`\n"; | |
| 231 | + | let f = classify_cargo_test(b"", stderr); | |
| 232 | + | match f { | |
| 233 | + | GateFailure::Unclassified { legacy_detail: Some(d) } => { | |
| 234 | + | assert!(d.contains("borrow of moved value")); | |
| 235 | + | } | |
| 236 | + | other => panic!("expected Unclassified, got {other:?}"), | |
| 237 | + | } | |
| 238 | + | } | |
| 239 | + | ||
| 240 | + | #[test] | |
| 241 | + | fn migration_drift_extracts_name() { | |
| 242 | + | let err = "migration 0047_widgets was previously applied but is missing in the resolved migrations"; | |
| 243 | + | let f = classify_migration_error(err, None); | |
| 244 | + | match f { | |
| 245 | + | GateFailure::MigrationDrift { migration } => assert_eq!(migration, "0047_widgets"), | |
| 246 | + | other => panic!("expected MigrationDrift, got {other:?}"), | |
| 247 | + | } | |
| 248 | + | } | |
| 249 | + | ||
| 250 | + | #[test] | |
| 251 | + | fn migration_modified_extracts_name() { | |
| 252 | + | let err = "migration 0042_seed was previously applied but has been modified"; | |
| 253 | + | let f = classify_migration_error(err, None); | |
| 254 | + | match f { | |
| 255 | + | GateFailure::MigrationModified { migration } => assert_eq!(migration, "0042_seed"), | |
| 256 | + | other => panic!("expected MigrationModified, got {other:?}"), | |
| 257 | + | } | |
| 258 | + | } | |
| 259 | + | ||
| 260 | + | #[test] | |
| 261 | + | fn migration_sql_error_extracts_sqlstate() { | |
| 262 | + | let err = r#"while executing migrations: error returned from database: code: "42P01" message: "relation \"widgets\" does not exist""#; | |
| 263 | + | let f = classify_migration_error(err, Some("0050_drop_widgets")); | |
| 264 | + | match f { | |
| 265 | + | GateFailure::MigrationSqlError { migration, sqlstate } => { | |
| 266 | + | assert_eq!(migration, "0050_drop_widgets"); | |
| 267 | + | assert_eq!(sqlstate.as_deref(), Some("42P01")); | |
| 268 | + | } | |
| 269 | + | other => panic!("expected MigrationSqlError, got {other:?}"), | |
| 270 | + | } | |
| 271 | + | } | |
| 272 | + | ||
| 273 | + | #[test] | |
| 274 | + | fn migration_unknown_error_is_unclassified() { | |
| 275 | + | let err = "something went wrong with the universe"; | |
| 276 | + | let f = classify_migration_error(err, None); | |
| 277 | + | match f { | |
| 278 | + | GateFailure::Unclassified { legacy_detail: Some(d) } => { | |
| 279 | + | assert!(d.contains("universe")); | |
| 280 | + | } | |
| 281 | + | other => panic!("expected Unclassified, got {other:?}"), | |
| 282 | + | } | |
| 283 | + | } | |
| 284 | + | ||
| 285 | + | #[test] | |
| 286 | + | fn boot_smoke_101_is_panic() { | |
| 287 | + | match classify_boot_smoke(Some(101)) { | |
| 288 | + | GateFailure::BootPanic { exit_code: Some(101) } => {} | |
| 289 | + | other => panic!("expected BootPanic(101), got {other:?}"), | |
| 290 | + | } | |
| 291 | + | } | |
| 292 | + | ||
| 293 | + | #[test] | |
| 294 | + | fn boot_smoke_signal_is_panic() { | |
| 295 | + | match classify_boot_smoke(Some(-9)) { | |
| 296 | + | GateFailure::BootPanic { exit_code: Some(-9) } => {} | |
| 297 | + | other => panic!("expected BootPanic(-9), got {other:?}"), | |
| 298 | + | } | |
| 299 | + | } | |
| 300 | + | ||
| 301 | + | #[test] | |
| 302 | + | fn boot_smoke_other_exit_is_exited_early() { | |
| 303 | + | match classify_boot_smoke(Some(2)) { | |
| 304 | + | GateFailure::BootExitedEarly { exit_code: Some(2) } => {} | |
| 305 | + | other => panic!("expected BootExitedEarly(2), got {other:?}"), | |
| 306 | + | } | |
| 307 | + | } | |
| 308 | + | ||
| 309 | + | #[test] | |
| 310 | + | fn deploy_connection_refused_is_node_unreachable() { | |
| 311 | + | use crate::outcome::DeployFailureKind as K; | |
| 312 | + | let err = "creating remote release dir: ssh testnot-1 failed: ssh: connect to host testnot-1 port 22: Connection refused"; | |
| 313 | + | match classify_deploy_error(err) { | |
| 314 | + | K::NodeUnreachable { .. } => {} | |
| 315 | + | other => panic!("expected NodeUnreachable, got {other:?}"), | |
| 316 | + | } | |
| 317 | + | } | |
| 318 | + | ||
| 319 | + | #[test] | |
| 320 | + | fn deploy_rsync_failure_is_rsync_failed() { | |
| 321 | + | use crate::outcome::DeployFailureKind as K; | |
| 322 | + | let err = "rsync failed (current symlink left intact): rsync: write failed on \"/srv/.../makenotwork\": No space left on device (28)"; | |
| 323 | + | match classify_deploy_error(err) { | |
| 324 | + | K::RsyncFailed { detail } => assert!(detail.contains("No space left")), | |
| 325 | + | other => panic!("expected RsyncFailed, got {other:?}"), | |
| 326 | + | } | |
| 327 | + | } | |
| 328 | + | ||
| 329 | + | #[test] | |
| 330 | + | fn deploy_systemctl_failure_is_service_restart_failed() { | |
| 331 | + | use crate::outcome::DeployFailureKind as K; | |
| 332 | + | // The combined "swap + restart" step where stderr mentions systemctl. | |
| 333 | + | let err = "symlink swap + systemctl reload-or-restart: ssh testnot-1 failed: Failed to restart makenotwork.service: Unit makenotwork.service failed to start"; | |
| 334 | + | match classify_deploy_error(err) { | |
| 335 | + | K::ServiceRestartFailed { .. } => {} | |
| 336 | + | other => panic!("expected ServiceRestartFailed, got {other:?}"), | |
| 337 | + | } | |
| 338 | + | } | |
| 339 | + | ||
| 340 | + | #[test] | |
| 341 | + | fn deploy_ln_failure_is_symlink_swap_failed() { | |
| 342 | + | use crate::outcome::DeployFailureKind as K; | |
| 343 | + | let err = "symlink swap + systemctl reload-or-restart: ssh testnot-1 failed: ln: failed to create symbolic link: Permission denied"; | |
| 344 | + | match classify_deploy_error(err) { | |
| 345 | + | K::SymlinkSwapFailed { .. } => {} | |
| 346 | + | other => panic!("expected SymlinkSwapFailed, got {other:?}"), | |
| 347 | + | } | |
| 348 | + | } | |
| 349 | + | ||
| 350 | + | #[test] | |
| 351 | + | fn deploy_unknown_is_unclassified() { | |
| 352 | + | use crate::outcome::DeployFailureKind as K; | |
| 353 | + | let err = "something went wrong in a way we did not anticipate"; | |
| 354 | + | match classify_deploy_error(err) { | |
| 355 | + | K::Unclassified { detail } => assert!(detail.contains("anticipate")), | |
| 356 | + | other => panic!("expected Unclassified, got {other:?}"), | |
| 357 | + | } | |
| 358 | + | } | |
| 359 | + | } |
| @@ -66,4 +66,19 @@ impl Config { | |||
| 66 | 66 | .with_context(|| format!("reading daemon config at {path}"))?; | |
| 67 | 67 | Ok(toml::from_str(&raw)?) | |
| 68 | 68 | } | |
| 69 | + | ||
/// Fixed `Config` for unit tests: loopback listener on an ephemeral
/// port, `:memory:` database path, no scratch DB, a single `server`
/// binary, and hard-coded `/tmp/sando-test-*` filesystem locations.
/// Nothing is created on disk here.
#[cfg(test)]
pub fn for_tests() -> Self {
    Self {
        // Network / storage endpoints.
        listen: "127.0.0.1:0".into(),
        db_path: PathBuf::from(":memory:"),
        scratch_db_url: None,

        // Filesystem locations.
        topology_path: PathBuf::from("/tmp/sando-test-topology.toml"),
        workdir: PathBuf::from("/tmp/sando-test-workdir"),
        release_root: PathBuf::from("/tmp/sando-test-release-root"),
        logs_root: PathBuf::from("/tmp/sando-test-logs"),

        // Build / release contents.
        bin_names: vec!["server".into()],
        release_contents: Vec::new(),
    }
}
| 69 | 84 | } |
| @@ -38,10 +38,10 @@ const RELEASES_TO_KEEP: usize = 5; | |||
| 38 | 38 | ||
| 39 | 39 | pub async fn deploy_local( | |
| 40 | 40 | release_root: &Path, | |
| 41 | - | version: &str, | |
| 41 | + | version: &crate::domain::Version, | |
| 42 | 42 | binaries: &[PathBuf], | |
| 43 | 43 | ) -> Result<PathBuf> { | |
| 44 | - | let release_dir = release_root.join("releases").join(version); | |
| 44 | + | let release_dir = release_root.join("releases").join(version.to_string()); | |
| 45 | 45 | tokio::fs::create_dir_all(&release_dir).await?; | |
| 46 | 46 | for binary in binaries { | |
| 47 | 47 | let name = binary.file_name() | |
| @@ -272,7 +272,7 @@ mod tests { | |||
| 272 | 272 | ||
| 273 | 273 | let staged = deploy_local( | |
| 274 | 274 | &release_root, | |
| 275 | - | "0.8.12", | |
| 275 | + | &"0.8.12".parse().unwrap(), | |
| 276 | 276 | &[primary.clone(), admin.clone()], | |
| 277 | 277 | ) | |
| 278 | 278 | .await | |
| @@ -303,10 +303,10 @@ mod tests { | |||
| 303 | 303 | let release_root = root.join("rr"); | |
| 304 | 304 | tokio::fs::create_dir_all(&release_root).await.unwrap(); | |
| 305 | 305 | ||
| 306 | - | deploy_local(&release_root, "0.1.0", &[bin.clone()]).await.unwrap(); | |
| 306 | + | deploy_local(&release_root, &"0.1.0".parse().unwrap(), &[bin.clone()]).await.unwrap(); | |
| 307 | 307 | // Rewrite source then deploy 0.2.0. | |
| 308 | 308 | tokio::fs::write(&bin, b"V2").await.unwrap(); | |
| 309 | - | deploy_local(&release_root, "0.2.0", &[bin.clone()]).await.unwrap(); | |
| 309 | + | deploy_local(&release_root, &"0.2.0".parse().unwrap(), &[bin.clone()]).await.unwrap(); | |
| 310 | 310 | ||
| 311 | 311 | // Both versions present on disk. | |
| 312 | 312 | assert!(release_root.join("releases/0.1.0/server").exists()); |
| @@ -0,0 +1,423 @@ | |||
| 1 | + | //! Domain types — the vocabulary every other module speaks. | |
| 2 | + | //! | |
| 3 | + | //! These newtypes replace string-typed fields across the daemon, schema, | |
| 4 | + | //! WS payloads, and TUI. Construction is the boundary parse: a `Version` | |
| 5 | + | //! exists because some byte sequence at the edge of the process passed | |
| 6 | + | //! semver validation; downstream code is freed from re-validating it. | |
| 7 | + | //! | |
| 8 | + | //! All types implement `Display`, `Serialize`, `Deserialize`, and | |
| 9 | + | //! `sqlx::Type<Sqlite>` (plus `FromStr`, except the row ids) so they round-trip | |
| 10 | + | //! through events, JSON responses, and SQLite columns without per-site conversion. | |
| 11 | + | //! | |
| 12 | + | //! See `plans/observability.md` for the architecture this is the first | |
| 13 | + | //! step of. | |
| 14 | + | ||
| 15 | + | // Step 1 introduced these as a pure addition; steps 2-7 thread them | |
| 16 | + | // through call sites. Remove the allow once every item here is used. | |
| 17 | + | #![allow(dead_code)] | |
| 18 | + | ||
| 19 | + | use serde::{Deserialize, Serialize}; | |
| 20 | + | use sqlx::Sqlite; | |
| 21 | + | use std::fmt; | |
| 22 | + | use std::str::FromStr; | |
| 23 | + | ||
| 24 | + | // --------------------------------------------------------------------- | |
| 25 | + | // String-backed identifiers | |
| 26 | + | // --------------------------------------------------------------------- | |
| 27 | + | ||
| 28 | + | /// A tier in the deploy topology (e.g. "host", "a", "b"). | |
| 29 | + | /// | |
| 30 | + | /// Construction does no cross-validation against the loaded `Topology` — | |
| 31 | + | /// that is the responsibility of `Topology::load`, which mints the | |
| 32 | + | /// canonical `TierId` set. Use `TierId::new` only at boundaries (config | |
| 33 | + | /// load, deserialization of inbound requests). | |
| 34 | + | #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, sqlx::Type)] | |
| 35 | + | #[sqlx(transparent)] | |
| 36 | + | #[serde(transparent)] | |
| 37 | + | pub struct TierId(String); | |
| 38 | + | ||
| 39 | + | impl TierId { | |
| 40 | + | pub fn new(s: impl Into<String>) -> Self { Self(s.into()) } | |
| 41 | + | pub fn as_str(&self) -> &str { &self.0 } | |
| 42 | + | } | |
| 43 | + | ||
| 44 | + | impl fmt::Display for TierId { | |
| 45 | + | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } | |
| 46 | + | } | |
| 47 | + | ||
| 48 | + | impl FromStr for TierId { | |
| 49 | + | type Err = std::convert::Infallible; | |
| 50 | + | fn from_str(s: &str) -> Result<Self, Self::Err> { Ok(Self(s.to_owned())) } | |
| 51 | + | } | |
| 52 | + | ||
| 53 | + | impl From<&str> for TierId { | |
| 54 | + | fn from(s: &str) -> Self { Self(s.to_owned()) } | |
| 55 | + | } | |
| 56 | + | ||
| 57 | + | impl From<String> for TierId { | |
| 58 | + | fn from(s: String) -> Self { Self(s) } | |
| 59 | + | } | |
| 60 | + | ||
| 61 | + | /// A node name within a tier (e.g. "alpha-west-1"). | |
| 62 | + | #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, sqlx::Type)] | |
| 63 | + | #[sqlx(transparent)] | |
| 64 | + | #[serde(transparent)] | |
| 65 | + | pub struct NodeId(String); | |
| 66 | + | ||
| 67 | + | impl NodeId { | |
| 68 | + | pub fn new(s: impl Into<String>) -> Self { Self(s.into()) } | |
| 69 | + | pub fn as_str(&self) -> &str { &self.0 } | |
| 70 | + | } | |
| 71 | + | ||
| 72 | + | impl fmt::Display for NodeId { | |
| 73 | + | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } | |
| 74 | + | } | |
| 75 | + | ||
| 76 | + | impl FromStr for NodeId { | |
| 77 | + | type Err = std::convert::Infallible; | |
| 78 | + | fn from_str(s: &str) -> Result<Self, Self::Err> { Ok(Self(s.to_owned())) } | |
| 79 | + | } | |
| 80 | + | ||
| 81 | + | impl From<&str> for NodeId { | |
| 82 | + | fn from(s: &str) -> Self { Self(s.to_owned()) } | |
| 83 | + | } | |
| 84 | + | ||
| 85 | + | impl From<String> for NodeId { | |
| 86 | + | fn from(s: String) -> Self { Self(s) } | |
| 87 | + | } | |
| 88 | + | ||
| 89 | + | // --------------------------------------------------------------------- | |
| 90 | + | // Version (semver) | |
| 91 | + | // --------------------------------------------------------------------- | |
| 92 | + | ||
| 93 | + | /// Server semver (e.g. `0.9.6`). Parsed once at the build step; stored | |
| 94 | + | /// as TEXT in the schema. | |
| 95 | + | #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] | |
| 96 | + | #[serde(try_from = "String", into = "String")] | |
| 97 | + | pub struct Version(semver::Version); | |
| 98 | + | ||
| 99 | + | #[derive(Debug, thiserror::Error)] | |
| 100 | + | #[error("invalid semver `{input}`: {source}")] | |
| 101 | + | pub struct VersionParseError { | |
| 102 | + | pub input: String, | |
| 103 | + | #[source] | |
| 104 | + | pub source: semver::Error, | |
| 105 | + | } | |
| 106 | + | ||
| 107 | + | impl Version { | |
| 108 | + | pub fn parse(s: &str) -> Result<Self, VersionParseError> { | |
| 109 | + | semver::Version::parse(s) | |
| 110 | + | .map(Self) | |
| 111 | + | .map_err(|e| VersionParseError { input: s.to_owned(), source: e }) | |
| 112 | + | } | |
| 113 | + | pub fn as_inner(&self) -> &semver::Version { &self.0 } | |
| 114 | + | } | |
| 115 | + | ||
| 116 | + | impl fmt::Display for Version { | |
| 117 | + | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } | |
| 118 | + | } | |
| 119 | + | ||
| 120 | + | impl FromStr for Version { | |
| 121 | + | type Err = VersionParseError; | |
| 122 | + | fn from_str(s: &str) -> Result<Self, Self::Err> { Self::parse(s) } | |
| 123 | + | } | |
| 124 | + | ||
| 125 | + | impl TryFrom<String> for Version { | |
| 126 | + | type Error = VersionParseError; | |
| 127 | + | fn try_from(s: String) -> Result<Self, Self::Error> { Self::parse(&s) } | |
| 128 | + | } | |
| 129 | + | ||
| 130 | + | impl From<Version> for String { | |
| 131 | + | fn from(v: Version) -> Self { v.0.to_string() } | |
| 132 | + | } | |
| 133 | + | ||
| 134 | + | impl sqlx::Type<Sqlite> for Version { | |
| 135 | + | fn type_info() -> <Sqlite as sqlx::Database>::TypeInfo { <String as sqlx::Type<Sqlite>>::type_info() } | |
| 136 | + | fn compatible(ty: &<Sqlite as sqlx::Database>::TypeInfo) -> bool { <String as sqlx::Type<Sqlite>>::compatible(ty) } | |
| 137 | + | } | |
| 138 | + | ||
| 139 | + | impl<'q> sqlx::Encode<'q, Sqlite> for Version { | |
| 140 | + | fn encode_by_ref( | |
| 141 | + | &self, | |
| 142 | + | buf: &mut <Sqlite as sqlx::Database>::ArgumentBuffer<'q>, | |
| 143 | + | ) -> Result<sqlx::encode::IsNull, sqlx::error::BoxDynError> { | |
| 144 | + | <String as sqlx::Encode<Sqlite>>::encode(self.0.to_string(), buf) | |
| 145 | + | } | |
| 146 | + | } | |
| 147 | + | ||
| 148 | + | impl<'r> sqlx::Decode<'r, Sqlite> for Version { | |
| 149 | + | fn decode( | |
| 150 | + | value: <Sqlite as sqlx::Database>::ValueRef<'r>, | |
| 151 | + | ) -> Result<Self, sqlx::error::BoxDynError> { | |
| 152 | + | let s = <String as sqlx::Decode<Sqlite>>::decode(value)?; | |
| 153 | + | Ok(Version::parse(&s)?) | |
| 154 | + | } | |
| 155 | + | } | |
| 156 | + | ||
| 157 | + | // --------------------------------------------------------------------- | |
| 158 | + | // Git sha | |
| 159 | + | // --------------------------------------------------------------------- | |
| 160 | + | ||
| 161 | + | /// A git commit sha, normalized to lowercase. Intended to be stored in | |
| 162 | + | /// its full 40-hex-character form, but this type only enforces shape | |
| 163 | + | /// (7-40 hex chars): short forms entering at the edge must be resolved | |
| 164 | + | /// unambiguously at the call site, not in this type. | |
| 165 | + | #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] | |
| 166 | + | #[serde(try_from = "String", into = "String")] | |
| 167 | + | pub struct GitSha(String); | |
| 168 | + | ||
| 169 | + | #[derive(Debug, thiserror::Error)] | |
| 170 | + | pub enum GitShaParseError { | |
| 171 | + | #[error("git sha `{0}` is not 7-40 hex chars")] | |
| 172 | + | BadShape(String), | |
| 173 | + | } | |
| 174 | + | ||
| 175 | + | impl GitSha { | |
| 176 | + | pub fn parse(s: &str) -> Result<Self, GitShaParseError> { | |
| 177 | + | let len = s.len(); | |
| 178 | + | let ok = (7..=40).contains(&len) && s.bytes().all(|b| b.is_ascii_hexdigit()); | |
| 179 | + | if ok { Ok(Self(s.to_ascii_lowercase())) } else { Err(GitShaParseError::BadShape(s.to_owned())) } | |
| 180 | + | } | |
| 181 | + | pub fn as_str(&self) -> &str { &self.0 } | |
| 182 | + | /// Best-effort 7-char prefix for display. | |
| 183 | + | pub fn short(&self) -> &str { &self.0[..self.0.len().min(7)] } | |
| 184 | + | } | |
| 185 | + | ||
| 186 | + | impl fmt::Display for GitSha { | |
| 187 | + | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } | |
| 188 | + | } | |
| 189 | + | ||
| 190 | + | impl FromStr for GitSha { | |
| 191 | + | type Err = GitShaParseError; | |
| 192 | + | fn from_str(s: &str) -> Result<Self, Self::Err> { Self::parse(s) } | |
| 193 | + | } | |
| 194 | + | ||
| 195 | + | impl TryFrom<String> for GitSha { | |
| 196 | + | type Error = GitShaParseError; | |
| 197 | + | fn try_from(s: String) -> Result<Self, Self::Error> { Self::parse(&s) } | |
| 198 | + | } | |
| 199 | + | ||
| 200 | + | impl From<GitSha> for String { | |
| 201 | + | fn from(g: GitSha) -> Self { g.0 } | |
| 202 | + | } | |
| 203 | + | ||
| 204 | + | impl sqlx::Type<Sqlite> for GitSha { | |
| 205 | + | fn type_info() -> <Sqlite as sqlx::Database>::TypeInfo { <String as sqlx::Type<Sqlite>>::type_info() } | |
| 206 | + | fn compatible(ty: &<Sqlite as sqlx::Database>::TypeInfo) -> bool { <String as sqlx::Type<Sqlite>>::compatible(ty) } | |
| 207 | + | } | |
| 208 | + | ||
| 209 | + | impl<'q> sqlx::Encode<'q, Sqlite> for GitSha { | |
| 210 | + | fn encode_by_ref( | |
| 211 | + | &self, | |
| 212 | + | buf: &mut <Sqlite as sqlx::Database>::ArgumentBuffer<'q>, | |
| 213 | + | ) -> Result<sqlx::encode::IsNull, sqlx::error::BoxDynError> { | |
| 214 | + | <String as sqlx::Encode<Sqlite>>::encode(self.0.clone(), buf) | |
| 215 | + | } | |
| 216 | + | } | |
| 217 | + | ||
| 218 | + | impl<'r> sqlx::Decode<'r, Sqlite> for GitSha { | |
| 219 | + | fn decode( | |
| 220 | + | value: <Sqlite as sqlx::Database>::ValueRef<'r>, | |
| 221 | + | ) -> Result<Self, sqlx::error::BoxDynError> { | |
| 222 | + | let s = <String as sqlx::Decode<Sqlite>>::decode(value)?; | |
| 223 | + | Ok(GitSha::parse(&s)?) | |
| 224 | + | } | |
| 225 | + | } | |
| 226 | + | ||
| 227 | + | // --------------------------------------------------------------------- | |
| 228 | + | // Gate kind | |
| 229 | + | // --------------------------------------------------------------------- | |
| 230 | + | ||
| 231 | + | /// The discriminant of `topology::Gate`. `Gate` carries gate parameters | |
| 232 | + | /// (e.g. `BurnIn { hours }`); `GateKind` is the identifier we use in | |
| 233 | + | /// events, schema columns, and the TUI. They were the same type before; | |
| 234 | + | /// splitting them is what lets a gate's parameters evolve without | |
| 235 | + | /// touching the wire/schema vocabulary. | |
| 236 | + | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] | |
| 237 | + | #[serde(rename_all = "snake_case")] | |
| 238 | + | pub enum GateKind { | |
| 239 | + | CargoTest, | |
| 240 | + | MigrationDryRun, | |
| 241 | + | BootSmoke, | |
| 242 | + | BurnIn, | |
| 243 | + | ManualConfirm, | |
| 244 | + | } | |
| 245 | + | ||
| 246 | + | impl GateKind { | |
| 247 | + | pub fn as_str(self) -> &'static str { | |
| 248 | + | match self { | |
| 249 | + | GateKind::CargoTest => "cargo_test", | |
| 250 | + | GateKind::MigrationDryRun => "migration_dry_run", | |
| 251 | + | GateKind::BootSmoke => "boot_smoke", | |
| 252 | + | GateKind::BurnIn => "burn_in", | |
| 253 | + | GateKind::ManualConfirm => "manual_confirm", | |
| 254 | + | } | |
| 255 | + | } | |
| 256 | + | } | |
| 257 | + | ||
| 258 | + | #[derive(Debug, thiserror::Error)] | |
| 259 | + | #[error("unknown gate kind `{0}`")] | |
| 260 | + | pub struct GateKindParseError(pub String); | |
| 261 | + | ||
| 262 | + | impl FromStr for GateKind { | |
| 263 | + | type Err = GateKindParseError; | |
| 264 | + | fn from_str(s: &str) -> Result<Self, Self::Err> { | |
| 265 | + | match s { | |
| 266 | + | "cargo_test" => Ok(GateKind::CargoTest), | |
| 267 | + | "migration_dry_run" => Ok(GateKind::MigrationDryRun), | |
| 268 | + | "boot_smoke" => Ok(GateKind::BootSmoke), | |
| 269 | + | "burn_in" => Ok(GateKind::BurnIn), | |
| 270 | + | "manual_confirm" => Ok(GateKind::ManualConfirm), | |
| 271 | + | other => Err(GateKindParseError(other.to_owned())), | |
| 272 | + | } | |
| 273 | + | } | |
| 274 | + | } | |
| 275 | + | ||
| 276 | + | impl fmt::Display for GateKind { | |
| 277 | + | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.write_str(self.as_str()) } | |
| 278 | + | } | |
| 279 | + | ||
| 280 | + | impl sqlx::Type<Sqlite> for GateKind { | |
| 281 | + | fn type_info() -> <Sqlite as sqlx::Database>::TypeInfo { <String as sqlx::Type<Sqlite>>::type_info() } | |
| 282 | + | fn compatible(ty: &<Sqlite as sqlx::Database>::TypeInfo) -> bool { <String as sqlx::Type<Sqlite>>::compatible(ty) } | |
| 283 | + | } | |
| 284 | + | ||
| 285 | + | impl<'q> sqlx::Encode<'q, Sqlite> for GateKind { | |
| 286 | + | fn encode_by_ref( | |
| 287 | + | &self, | |
| 288 | + | buf: &mut <Sqlite as sqlx::Database>::ArgumentBuffer<'q>, | |
| 289 | + | ) -> Result<sqlx::encode::IsNull, sqlx::error::BoxDynError> { | |
| 290 | + | <String as sqlx::Encode<Sqlite>>::encode(self.as_str().to_owned(), buf) | |
| 291 | + | } | |
| 292 | + | } | |
| 293 | + | ||
| 294 | + | impl<'r> sqlx::Decode<'r, Sqlite> for GateKind { | |
| 295 | + | fn decode( | |
| 296 | + | value: <Sqlite as sqlx::Database>::ValueRef<'r>, | |
| 297 | + | ) -> Result<Self, sqlx::error::BoxDynError> { | |
| 298 | + | let s = <String as sqlx::Decode<Sqlite>>::decode(value)?; | |
| 299 | + | Ok(GateKind::from_str(&s)?) | |
| 300 | + | } | |
| 301 | + | } | |
| 302 | + | ||
| 303 | + | // --------------------------------------------------------------------- | |
| 304 | + | // Row ids | |
| 305 | + | // --------------------------------------------------------------------- | |
| 306 | + | ||
| 307 | + | /// Primary key of `gate_runs`. Carried through `GateStart` → `GateLogChunk` | |
| 308 | + | /// → `GateDone` so client-side correlation is trivial. | |
| 309 | + | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, sqlx::Type)] | |
| 310 | + | #[sqlx(transparent)] | |
| 311 | + | #[serde(transparent)] | |
| 312 | + | pub struct GateRunId(pub i64); | |
| 313 | + | ||
| 314 | + | impl fmt::Display for GateRunId { | |
| 315 | + | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } | |
| 316 | + | } | |
| 317 | + | ||
| 318 | + | /// Primary key of `deploys`. | |
| 319 | + | #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, sqlx::Type)] | |
| 320 | + | #[sqlx(transparent)] | |
| 321 | + | #[serde(transparent)] | |
| 322 | + | pub struct DeployId(pub i64); | |
| 323 | + | ||
| 324 | + | impl fmt::Display for DeployId { | |
| 325 | + | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } | |
| 326 | + | } | |
| 327 | + | ||
| 328 | + | #[cfg(test)] | |
| 329 | + | mod tests { | |
| 330 | + | use super::*; | |
| 331 | + | ||
| 332 | + | #[test] | |
| 333 | + | fn tier_id_round_trips_through_json() { | |
| 334 | + | let t = TierId::new("host"); | |
| 335 | + | let s = serde_json::to_string(&t).unwrap(); | |
| 336 | + | assert_eq!(s, "\"host\""); | |
| 337 | + | let back: TierId = serde_json::from_str(&s).unwrap(); | |
| 338 | + | assert_eq!(t, back); | |
| 339 | + | } | |
| 340 | + | ||
| 341 | + | #[test] | |
| 342 | + | fn version_parses_and_displays() { | |
| 343 | + | let v: Version = "0.9.6".parse().unwrap(); | |
| 344 | + | assert_eq!(v.to_string(), "0.9.6"); | |
| 345 | + | assert!("not-a-version".parse::<Version>().is_err()); | |
| 346 | + | } | |
| 347 | + | ||
| 348 | + | #[test] | |
| 349 | + | fn version_json_is_string_form() { | |
| 350 | + | let v: Version = "1.2.3-rc.1".parse().unwrap(); | |
| 351 | + | let s = serde_json::to_string(&v).unwrap(); | |
| 352 | + | assert_eq!(s, "\"1.2.3-rc.1\""); | |
| 353 | + | let back: Version = serde_json::from_str(&s).unwrap(); | |
| 354 | + | assert_eq!(v, back); | |
| 355 | + | } | |
| 356 | + | ||
| 357 | + | #[test] | |
| 358 | + | fn git_sha_accepts_short_and_full() { | |
| 359 | + | assert!(GitSha::parse("abc1234").is_ok()); | |
| 360 | + | assert!(GitSha::parse("0123456789abcdef0123456789abcdef01234567").is_ok()); | |
| 361 | + | // length out of range | |
| 362 | + | assert!(GitSha::parse("abc").is_err()); | |
| 363 | + | assert!(GitSha::parse(&"a".repeat(41)).is_err()); | |
| 364 | + | // non-hex | |
| 365 | + | assert!(GitSha::parse("zzzzzzz").is_err()); | |
| 366 | + | } | |
| 367 | + | ||
| 368 | + | #[test] | |
| 369 | + | fn git_sha_short_truncates_safely() { | |
| 370 | + | let s = GitSha::parse("abc1234").unwrap(); | |
| 371 | + | assert_eq!(s.short(), "abc1234"); | |
| 372 | + | let long = GitSha::parse("0123456789abcdef0123456789abcdef01234567").unwrap(); | |
| 373 | + | assert_eq!(long.short(), "0123456"); | |
| 374 | + | } | |
| 375 | + | ||
| 376 | + | #[test] | |
| 377 | + | fn git_sha_normalizes_to_lowercase() { | |
| 378 | + | let s = GitSha::parse("ABCdef1").unwrap(); | |
| 379 | + | assert_eq!(s.as_str(), "abcdef1"); | |
| 380 | + | } | |
| 381 | + | ||
| 382 | + | #[test] | |
| 383 | + | fn gate_kind_round_trips_through_json() { | |
| 384 | + | // `GateKind` derives serde with `rename_all = "snake_case"` — verify the | |
| 385 | + | // shape the TUI's `format_event` already consumes is preserved. | |
| 386 | + | let k = GateKind::MigrationDryRun; | |
| 387 | + | let s = serde_json::to_string(&k).unwrap(); | |
| 388 | + | assert_eq!(s, "\"migration_dry_run\""); | |
| 389 | + | let back: GateKind = serde_json::from_str(&s).unwrap(); | |
| 390 | + | assert_eq!(k, back); | |
| 391 | + | } | |
| 392 | + | ||
| 393 | + | #[test] | |
| 394 | + | fn gate_kind_as_str_matches_serde_form() { | |
| 395 | + | // The legacy `gates::kind_str` helper produced strings the TUI | |
| 396 | + | // matched on. Locking in that our serde form matches those exactly | |
| 397 | + | // so step 3 (events use the types) doesn't change the wire shape. | |
| 398 | + | for k in [ | |
| 399 | + | GateKind::CargoTest, | |
| 400 | + | GateKind::MigrationDryRun, | |
| 401 | + | GateKind::BootSmoke, | |
| 402 | + | GateKind::BurnIn, | |
| 403 | + | GateKind::ManualConfirm, | |
| 404 | + | ] { | |
| 405 | + | let via_serde: String = serde_json::from_str::<String>( | |
| 406 | + | &serde_json::to_string(&k).unwrap(), | |
| 407 | + | ) | |
| 408 | + | .unwrap(); | |
| 409 | + | assert_eq!(via_serde, k.as_str()); | |
| 410 | + | } | |
| 411 | + | } | |
| 412 | + | ||
| 413 | + | #[test] | |
| 414 | + | fn gate_kind_from_str_rejects_unknown() { | |
| 415 | + | assert!("not_a_gate".parse::<GateKind>().is_err()); | |
| 416 | + | } | |
| 417 | + | ||
| 418 | + | #[test] | |
| 419 | + | fn gate_run_id_serializes_as_number() { | |
| 420 | + | let id = GateRunId(42); | |
| 421 | + | assert_eq!(serde_json::to_string(&id).unwrap(), "42"); | |
| 422 | + | } | |
| 423 | + | } |
| @@ -5,8 +5,10 @@ | |||
| 5 | 5 | //! subscribes to the bus and forwards each envelope to the connected TUI as | |
| 6 | 6 | //! a JSON text frame. | |
| 7 | 7 | ||
| 8 | + | use crate::domain::{GateKind, GateRunId, GitSha, NodeId, TierId, Version}; | |
| 9 | + | use crate::outcome::{DeployFailureKind, GateOutcome}; | |
| 8 | 10 | use chrono::{DateTime, Utc}; | |
| 9 | - | use serde::Serialize; | |
| 11 | + | use serde::{Deserialize, Serialize}; | |
| 10 | 12 | use tokio::sync::broadcast; | |
| 11 | 13 | ||
| 12 | 14 | /// Capacity of the broadcast channel. Slow subscribers that fall behind by | |
| @@ -16,32 +18,65 @@ pub const CAPACITY: usize = 256; | |||
| 16 | 18 | ||
| 17 | 19 | pub type EventTx = broadcast::Sender<EventEnvelope>; | |
| 18 | 20 | ||
| 19 | - | #[derive(Clone, Debug, Serialize)] | |
| 21 | + | #[derive(Clone, Debug, Serialize, Deserialize)] | |
| 20 | 22 | pub struct EventEnvelope { | |
| 21 | 23 | pub at: DateTime<Utc>, | |
| 22 | 24 | #[serde(flatten)] | |
| 23 | 25 | pub event: Event, | |
| 24 | 26 | } | |
| 25 | 27 | ||
| 26 | - | #[derive(Clone, Debug, Serialize)] | |
| 28 | + | #[derive(Clone, Debug, Serialize, Deserialize)] | |
| 27 | 29 | #[serde(tag = "kind", rename_all = "snake_case")] | |
| 28 | 30 | pub enum Event { | |
| 29 | 31 | /// A /rebuild was accepted (post-receive hook or operator). | |
| 30 | - | RebuildRequested { sha: String }, | |
| 32 | + | RebuildRequested { sha: GitSha }, | |
| 31 | 33 | /// A previous in-flight build was aborted because a newer /rebuild arrived. | |
| 32 | - | BuildAborted { sha_aborted: String }, | |
| 33 | - | BuildStart { sha: String, version: String }, | |
| 34 | - | BuildOk { sha: String, version: String, elapsed_s: u64 }, | |
| 35 | - | BuildFailed { sha: String, version: String, elapsed_s: u64 }, | |
| 36 | - | GateStart { tier: String, version: String, gate: String }, | |
| 37 | - | GateDone { tier: String, version: String, gate: String, passed: bool }, | |
| 38 | - | DeployStart { tier: String, node: String, version: String }, | |
| 39 | - | DeployOk { tier: String, node: String, version: String }, | |
| 40 | - | DeployFailed { tier: String, node: String, version: String, error: String }, | |
| 41 | - | PromoteComplete { tier: String, version: String }, | |
| 42 | - | Rollback { tier: String, from: String, to: String }, | |
| 34 | + | BuildAborted { sha_aborted: GitSha }, | |
| 35 | + | BuildStart { sha: GitSha, version: Version }, | |
| 36 | + | BuildOk { sha: GitSha, version: Version, elapsed_s: u64 }, | |
| 37 | + | BuildFailed { sha: GitSha, version: Version, elapsed_s: u64 }, | |
| 38 | + | GateStart { | |
| 39 | + | run_id: GateRunId, | |
| 40 | + | tier: TierId, | |
| 41 | + | version: Version, | |
| 42 | + | gate: GateKind, | |
| 43 | + | }, | |
| 44 | + | /// Chunk of combined stdout+stderr from a gate that's currently running. | |
| 45 | + | /// `run_id` correlates back to the `GateStart` for the same gate run; the | |
| 46 | + | /// TUI uses it to group chunks if it wants a per-run buffer. `seq` is a | |
| 47 | + | /// monotonic counter scoped to one run (resets across runs). `text` is a | |
| 48 | + | /// UTF-8-lossy slice of bytes — chunks reflect tokio read boundaries, not | |
| 49 | + | /// line boundaries; the on-disk log at `outcome.log_ref` is the full, | |
| 50 | + | /// byte-exact stream. | |
| 51 | + | GateLogChunk { | |
| 52 | + | run_id: GateRunId, | |
| 53 | + | seq: u32, | |
| 54 | + | text: String, | |
| 55 | + | }, | |
| 56 | + | /// `passed` is a shadow field. `outcome` is the source of truth and | |
| 57 | + | /// carries classification, blocker variants, and the log_ref. | |
| 58 | + | GateDone { | |
| 59 | + | run_id: GateRunId, | |
| 60 | + | tier: TierId, | |
| 61 | + | version: Version, | |
| 62 | + | gate: GateKind, | |
| 63 | + | passed: bool, | |
| 64 | + | outcome: GateOutcome, | |
| 65 | + | }, | |
| 66 | + | DeployStart { tier: TierId, node: NodeId, version: Version }, | |
| 67 | + | DeployOk { tier: TierId, node: NodeId, version: Version }, | |
| 68 | + | DeployFailed { | |
| 69 | + | tier: TierId, | |
| 70 | + | node: NodeId, | |
| 71 | + | version: Version, | |
| 72 | + | failure: DeployFailureKind, | |
| 73 | + | }, | |
| 74 | + | PromoteComplete { tier: TierId, version: Version }, | |
| 75 | + | Rollback { tier: TierId, from: Version, to: Version }, | |
| 76 | + | /// `source` is an ssh URL, kept freeform on purpose — it's a transport | |
| 77 | + | /// detail, not a domain identifier. | |
| 43 | 78 | BackupFetched { source: String, byte_size: i64 }, | |
| 44 | - | ManualConfirm { tier: String, version: String }, | |
| 79 | + | ManualConfirm { tier: TierId, version: Version }, | |
| 45 | 80 | } | |
| 46 | 81 | ||
| 47 | 82 | pub fn channel() -> EventTx { | |
| @@ -67,7 +102,7 @@ mod tests { | |||
| 67 | 102 | // to `.unwrap()` someday, every build/deploy site will start | |
| 68 | 103 | // crashing. | |
| 69 | 104 | let tx = channel(); | |
| 70 | - | emit(&tx, Event::RebuildRequested { sha: "abc".into() }); | |
| 105 | + | emit(&tx, Event::RebuildRequested { sha: GitSha::parse("abc1234").unwrap() }); | |
| 71 | 106 | emit(&tx, Event::BackupFetched { source: "x".into(), byte_size: 1 }); | |
| 72 | 107 | } | |
| 73 | 108 | ||
| @@ -75,12 +110,15 @@ mod tests { | |||
| 75 | 110 | async fn emit_reaches_a_subscriber() { | |
| 76 | 111 | let tx = channel(); | |
| 77 | 112 | let mut rx = tx.subscribe(); | |
| 78 | - | emit(&tx, Event::PromoteComplete { tier: "a".into(), version: "0.8.12".into() }); | |
| 113 | + | emit(&tx, Event::PromoteComplete { | |
| 114 | + | tier: TierId::new("a"), | |
| 115 | + | version: "0.8.12".parse().unwrap(), | |
| 116 | + | }); | |
| 79 | 117 | let env = rx.recv().await.expect("envelope"); | |
| 80 | 118 | match env.event { | |
| 81 | 119 | Event::PromoteComplete { tier, version } => { | |
| 82 | - | assert_eq!(tier, "a"); | |
| 83 | - | assert_eq!(version, "0.8.12"); | |
| 120 | + | assert_eq!(tier.as_str(), "a"); | |
| 121 | + | assert_eq!(version.to_string(), "0.8.12"); | |
| 84 | 122 | } | |
| 85 | 123 | _ => panic!("wrong event kind"), | |
| 86 | 124 | } | |
| @@ -93,9 +131,10 @@ mod tests { | |||
| 93 | 131 | let env = EventEnvelope { | |
| 94 | 132 | at: Utc::now(), | |
| 95 | 133 | event: Event::GateStart { | |
| 96 | - | tier: "host".into(), | |
| 97 | - | version: "0.8.12".into(), | |
| 98 | - | gate: "cargo_test".into(), | |
| 134 | + | run_id: GateRunId(42), | |
| 135 | + | tier: TierId::new("host"), | |
| 136 | + | version: "0.8.12".parse().unwrap(), | |
| 137 | + | gate: GateKind::CargoTest, | |
| 99 | 138 | }, | |
| 100 | 139 | }; | |
| 101 | 140 | let s = serde_json::to_string(&env).unwrap(); | |
| @@ -115,7 +154,9 @@ mod tests { | |||
| 115 | 154 | let tx = channel(); | |
| 116 | 155 | let mut rx = tx.subscribe(); | |
| 117 | 156 | for i in 0..(CAPACITY + 10) { | |
| 118 | - | emit(&tx, Event::RebuildRequested { sha: format!("{i}") }); | |
| 157 | + | // 7+ hex chars satisfy GitSha::parse; pad i into that shape. | |
| 158 | + | let sha = GitSha::parse(&format!("{i:0>7x}")).unwrap(); | |
| 159 | + | emit(&tx, Event::RebuildRequested { sha }); | |
| 119 | 160 | } | |
| 120 | 161 | let err = rx.recv().await.expect_err("expected Lagged"); | |
| 121 | 162 | match err { |
| @@ -3,8 +3,12 @@ | |||
| 3 | 3 | //! human-readable reason). Outcomes are persisted to `gate_runs` so /state | |
| 4 | 4 | //! and the TUI can show them. | |
| 5 | 5 | ||
| 6 | + | use crate::classify; | |
| 6 | 7 | use crate::config::Config; | |
| 8 | + | use crate::domain::{GateKind, GateRunId, TierId, Version}; | |
| 7 | 9 | use crate::events::{self, Event, EventTx}; | |
| 10 | + | use crate::live_log::LiveLog; | |
| 11 | + | use crate::outcome::{GateBlocker, GateFailure, GateOutcome, LogRef, PassNote}; | |
| 8 | 12 | use crate::topology::Gate; | |
| 9 | 13 | use anyhow::Result; | |
| 10 | 14 | use chrono::Utc; | |
| @@ -17,22 +21,16 @@ use tokio::process::Command; | |||
| 17 | 21 | pub struct GateCtx { | |
| 18 | 22 | pub pool: SqlitePool, | |
| 19 | 23 | pub cfg: Arc<Config>, | |
| 20 | - | pub tier: String, | |
| 21 | - | pub version: String, | |
| 24 | + | pub tier: TierId, | |
| 25 | + | pub version: Version, | |
| 22 | 26 | pub worktree: PathBuf, | |
| 23 | 27 | pub events: EventTx, | |
| 24 | 28 | } | |
| 25 | 29 | ||
| 26 | - | #[derive(Debug, Clone)] | |
| 27 | - | pub struct GateOutcome { | |
| 28 | - | pub passed: bool, | |
| 29 | - | pub detail: Option<String>, | |
| 30 | - | } | |
| 31 | - | ||
| 32 | 30 | /// Run a single gate end-to-end: insert the in-flight row, execute the gate, | |
| 33 | 31 | /// update the row with the outcome. Returns the outcome for the caller. | |
| 34 | 32 | pub async fn run(ctx: &GateCtx, gate: &Gate) -> Result<GateOutcome> { | |
| 35 | - | let kind = kind_str(gate); | |
| 33 | + | let kind = gate.kind(); | |
| 36 | 34 | let started_at = Utc::now().to_rfc3339(); | |
| 37 | 35 | ||
| 38 | 36 | let id: i64 = sqlx::query_scalar( | |
| @@ -45,46 +43,63 @@ pub async fn run(ctx: &GateCtx, gate: &Gate) -> Result<GateOutcome> { | |||
| 45 | 43 | .bind(&started_at) | |
| 46 | 44 | .fetch_one(&ctx.pool) | |
| 47 | 45 | .await?; | |
| 46 | + | let run_id = GateRunId(id); | |
| 48 | 47 | ||
| 49 | - | tracing::info!(tier = %ctx.tier, version = %ctx.version, gate = kind, "gate start"); | |
| 48 | + | tracing::info!( | |
| 49 | + | run_id = %run_id, tier = %ctx.tier, version = %ctx.version, gate = %kind, | |
| 50 | + | "gate start", | |
| 51 | + | ); | |
| 50 | 52 | events::emit(&ctx.events, Event::GateStart { | |
| 53 | + | run_id, | |
| 51 | 54 | tier: ctx.tier.clone(), | |
| 52 | 55 | version: ctx.version.clone(), | |
| 53 | - | gate: kind.into(), | |
| 56 | + | gate: kind, | |
| 54 | 57 | }); | |
| 55 | 58 | ||
| 56 | 59 | let outcome = match gate { | |
| 57 | - | Gate::CargoTest => cargo_test(ctx).await, | |
| 60 | + | Gate::CargoTest => cargo_test(ctx, run_id).await, | |
| 58 | 61 | Gate::MigrationDryRun => migration_dry_run(ctx).await, | |
| 59 | - | Gate::BootSmoke => boot_smoke(ctx).await, | |
| 62 | + | Gate::BootSmoke => boot_smoke(ctx, run_id).await, | |
| 60 | 63 | Gate::BurnIn { hours } => burn_in(ctx, *hours).await, | |
| 61 | 64 | Gate::ManualConfirm => manual_confirm(ctx).await, | |
| 62 | 65 | }; | |
| 63 | 66 | ||
| 64 | - | let outcome = outcome.unwrap_or_else(|e| GateOutcome { | |
| 65 | - | passed: false, | |
| 66 | - | detail: Some(format!("gate runner errored: {e}")), | |
| 67 | - | }); | |
| 67 | + | let outcome = outcome.unwrap_or_else(|e| GateOutcome::failed(GateFailure::Unclassified { | |
| 68 | + | legacy_detail: Some(format!("gate runner errored: {e}")), | |
| 69 | + | })); | |
| 68 | 70 | ||
| 71 | + | // Migration 003 added status/outcome_json/log_ref. Until migration 004 | |
| 72 | + | // drops the legacy `passed` and `detail` columns, we keep double-writing | |
| 73 | + | // them so any consumer still reading the old schema keeps working. | |
| 74 | + | let outcome_json = serde_json::to_string(&outcome) | |
| 75 | + | .unwrap_or_else(|e| format!("{{\"_serialize_error\":{e:?}}}")); | |
| 69 | 76 | sqlx::query( | |
| 70 | - | "UPDATE gate_runs SET finished_at = ?, passed = ?, detail = ? WHERE id = ?", | |
| 77 | + | "UPDATE gate_runs | |
| 78 | + | SET finished_at = ?, passed = ?, detail = ?, | |
| 79 | + | status = ?, outcome_json = ?, log_ref = ? | |
| 80 | + | WHERE id = ?", | |
| 71 | 81 | ) | |
| 72 | 82 | .bind(Utc::now().to_rfc3339()) | |
| 73 | - | .bind(outcome.passed as i64) | |
| 74 | - | .bind(outcome.detail.as_deref()) | |
| 83 | + | .bind(outcome.legacy_passed() as i64) | |
| 84 | + | .bind(outcome.legacy_detail()) | |
| 85 | + | .bind(outcome.status_str()) | |
| 86 | + | .bind(&outcome_json) | |
| 87 | + | .bind(outcome.log_ref.as_ref().map(|l| l.as_str())) | |
| 75 | 88 | .bind(id) | |
| 76 | 89 | .execute(&ctx.pool) | |
| 77 | 90 | .await?; | |
| 78 | 91 | ||
| 79 | 92 | tracing::info!( | |
| 80 | - | tier = %ctx.tier, version = %ctx.version, gate = kind, | |
| 81 | - | passed = outcome.passed, "gate done", | |
| 93 | + | tier = %ctx.tier, version = %ctx.version, gate = %kind, | |
| 94 | + | status = outcome.status_str(), "gate done", | |
| 82 | 95 | ); | |
| 83 | 96 | events::emit(&ctx.events, Event::GateDone { | |
| 97 | + | run_id, | |
| 84 | 98 | tier: ctx.tier.clone(), | |
| 85 | 99 | version: ctx.version.clone(), | |
| 86 | - | gate: kind.into(), | |
| 87 | - | passed: outcome.passed, | |
| 100 | + | gate: kind, | |
| 101 | + | passed: outcome.legacy_passed(), | |
| 102 | + | outcome: outcome.clone(), | |
| 88 | 103 | }); | |
| 89 | 104 | ||
| 90 | 105 | Ok(outcome) | |
| @@ -99,26 +114,16 @@ pub async fn run_all(ctx: &GateCtx, gates: &[Gate]) -> Result<bool> { | |||
| 99 | 114 | let mut all_ok = true; | |
| 100 | 115 | for g in gates { | |
| 101 | 116 | let o = run(ctx, g).await?; | |
| 102 | - | if !o.passed { | |
| 117 | + | if !o.legacy_passed() { | |
| 103 | 118 | all_ok = false; | |
| 104 | 119 | } | |
| 105 | 120 | } | |
| 106 | 121 | Ok(all_ok) | |
| 107 | 122 | } | |
| 108 | 123 | ||
| 109 | - | fn kind_str(g: &Gate) -> &'static str { | |
| 110 | - | match g { | |
| 111 | - | Gate::CargoTest => "cargo_test", | |
| 112 | - | Gate::MigrationDryRun => "migration_dry_run", | |
| 113 | - | Gate::BootSmoke => "boot_smoke", | |
| 114 | - | Gate::BurnIn { .. } => "burn_in", | |
| 115 | - | Gate::ManualConfirm => "manual_confirm", | |
| 116 | - | } | |
| 117 | - | } | |
| 118 | - | ||
| 119 | 124 | // ---- individual gate runners ---- | |
| 120 | 125 | ||
| 121 | - | async fn cargo_test(ctx: &GateCtx) -> Result<GateOutcome> { | |
| 126 | + | async fn cargo_test(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> { | |
| 122 | 127 | let server_dir = ctx.worktree.join("server"); | |
| 123 | 128 | let mut cmd = Command::new("cargo"); | |
| 124 | 129 | // Match CI (`server/deploy/run-ci.sh`): `--features fast-tests` relaxes | |
| @@ -128,6 +133,8 @@ async fn cargo_test(ctx: &GateCtx) -> Result<GateOutcome> { | |||
| 128 | 133 | // this in `server/src/constants.rs:87`. | |
| 129 | 134 | cmd.args(["test", "--release", "--features", "fast-tests"]) | |
| 130 | 135 | .current_dir(&server_dir) | |
| 136 | + | .stdout(std::process::Stdio::piped()) | |
| 137 | + | .stderr(std::process::Stdio::piped()) | |
| 131 | 138 | .kill_on_drop(true); | |
| 132 | 139 | // Same online-mode rationale as the build step: sqlx query macros need a | |
| 133 | 140 | // live DB to type-check against. The scratch DB is left in migrated state | |
| @@ -144,27 +151,42 @@ async fn cargo_test(ctx: &GateCtx) -> Result<GateOutcome> { | |||
| 144 | 151 | .unwrap_or(scratch_url); | |
| 145 | 152 | cmd.env("TEST_DATABASE_URL", test_url); | |
| 146 | 153 | } | |
| 147 | - | let out = cmd.output().await?; | |
| 148 | - | persist_gate_log(ctx, "cargo_test", &out.stdout, &out.stderr).await; | |
| 149 | - | Ok(GateOutcome { | |
| 150 | - | passed: out.status.success(), | |
| 151 | - | detail: Some(combined_tail(&out.stdout, &out.stderr, 4_000)), | |
| 152 | - | }) | |
| 154 | + | let started = std::time::Instant::now(); | |
| 155 | + | let log_path = gate_log_path(ctx, GateKind::CargoTest); | |
| 156 | + | let log_ref = LogRef::new(&ctx.version, GateKind::CargoTest); | |
| 157 | + | let mut child = match cmd.spawn() { | |
| 158 | + | Ok(c) => c, | |
| 159 | + | Err(e) => { | |
| 160 | + | return Ok(GateOutcome::failed(GateFailure::SpawnFailed { | |
| 161 | + | message: e.to_string(), | |
| 162 | + | }).with_log_ref(log_ref)); | |
| 163 | + | } | |
| 164 | + | }; | |
| 165 | + | let (stdout_buf, stderr_buf, status) = | |
| 166 | + | stream_child_to_live_log(&mut child, ctx.events.clone(), run_id, log_path).await?; | |
| 167 | + | let duration_s = started.elapsed().as_secs() as u32; | |
| 168 | + | if status.success() { | |
| 169 | + | Ok(GateOutcome::passed(PassNote::TestsPassed { duration_s }).with_log_ref(log_ref)) | |
| 170 | + | } else { | |
| 171 | + | let failure = classify::classify_cargo_test(&stdout_buf, &stderr_buf); | |
| 172 | + | Ok(GateOutcome::failed(failure).with_log_ref(log_ref)) | |
| 173 | + | } | |
| 153 | 174 | } | |
| 154 | 175 | ||
| 155 | 176 | async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> { | |
| 156 | 177 | let mut log_buf: Vec<u8> = Vec::new(); | |
| 178 | + | let log_ref = LogRef::new(&ctx.version, GateKind::MigrationDryRun); | |
| 157 | 179 | let finish = |outcome: GateOutcome, buf: Vec<u8>| async move { | |
| 158 | - | persist_gate_log(ctx, "migration_dry_run", &buf, &[]).await; | |
| 180 | + | persist_gate_log(ctx, GateKind::MigrationDryRun, &buf, &[]).await; | |
| 159 | 181 | outcome | |
| 160 | 182 | }; | |
| 161 | 183 | ||
| 162 | 184 | let Some(db_url) = ctx.cfg.scratch_db_url.as_deref() else { | |
| 163 | 185 | log_buf.extend_from_slice(b"scratch_db_url unset in daemon config\n"); | |
| 164 | - | return Ok(finish(GateOutcome { | |
| 165 | - | passed: false, | |
| 166 | - | detail: Some("scratch_db_url unset in daemon config".into()), | |
| 167 | - | }, log_buf).await); | |
| 186 | + | return Ok(finish( | |
| 187 | + | GateOutcome::blocked(GateBlocker::ScratchDbUrlUnset).with_log_ref(log_ref), | |
| 188 | + | log_buf, | |
| 189 | + | ).await); | |
| 168 | 190 | }; | |
| 169 | 191 | ||
| 170 | 192 | let backup: Option<(String,)> = sqlx::query_as( | |
| @@ -174,23 +196,29 @@ async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> { | |||
| 174 | 196 | .await?; | |
| 175 | 197 | let Some((backup_path,)) = backup else { | |
| 176 | 198 | log_buf.extend_from_slice(b"no backup fetched; call /backup/fetch first\n"); | |
| 177 | - | return Ok(finish(GateOutcome { | |
| 178 | - | passed: false, | |
| 179 | - | detail: Some("no backup fetched; call /backup/fetch first".into()), | |
| 180 | - | }, log_buf).await); | |
| 199 | + | return Ok(finish( | |
| 200 | + | GateOutcome::blocked(GateBlocker::NoBackupAvailable).with_log_ref(log_ref), | |
| 201 | + | log_buf, | |
| 202 | + | ).await); | |
| 181 | 203 | }; | |
| 182 | 204 | ||
| 183 | - | log_buf.extend_from_slice(format!("---- reset_scratch ----\n").as_bytes()); | |
| 205 | + | log_buf.extend_from_slice(b"---- reset_scratch ----\n"); | |
| 184 | 206 | if let Err(e) = reset_scratch(db_url).await { | |
| 185 | 207 | let msg = format!("scratch reset: {e}"); | |
| 186 | 208 | log_buf.extend_from_slice(msg.as_bytes()); | |
| 187 | - | return Ok(finish(GateOutcome { passed: false, detail: Some(msg) }, log_buf).await); | |
| 209 | + | return Ok(finish( | |
| 210 | + | GateOutcome::failed(GateFailure::RestoreFailed { reason: msg }).with_log_ref(log_ref), | |
| 211 | + | log_buf, | |
| 212 | + | ).await); | |
| 188 | 213 | } | |
| 189 | 214 | log_buf.extend_from_slice(format!("---- restore_dump ({backup_path}) ----\n").as_bytes()); | |
| 190 | 215 | if let Err(e) = restore_dump(db_url, &backup_path, &mut log_buf).await { | |
| 191 | 216 | let msg = format!("restore: {e}"); | |
| 192 | 217 | log_buf.extend_from_slice(msg.as_bytes()); | |
| 193 | - | return Ok(finish(GateOutcome { passed: false, detail: Some(msg) }, log_buf).await); | |
| 218 | + | return Ok(finish( | |
| 219 | + | GateOutcome::failed(GateFailure::RestoreFailed { reason: msg }).with_log_ref(log_ref), | |
| 220 | + | log_buf, | |
| 221 | + | ).await); | |
| 194 | 222 | } | |
| 195 | 223 | ||
| 196 | 224 | let migrations_dir = ctx.worktree.join("server").join("migrations"); | |
| @@ -199,12 +227,20 @@ async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> { | |||
| 199 | 227 | Ok(()) => { | |
| 200 | 228 | let detail = format!("restored {backup_path} + migrated"); | |
| 201 | 229 | log_buf.extend_from_slice(detail.as_bytes()); | |
| 202 | - | Ok(finish(GateOutcome { passed: true, detail: Some(detail) }, log_buf).await) | |
| 230 | + | Ok(finish( | |
| 231 | + | GateOutcome::passed(PassNote::Migrated { backup_path: backup_path.clone() }) | |
| 232 | + | .with_log_ref(log_ref), | |
| 233 | + | log_buf, | |
| 234 | + | ).await) | |
| 203 | 235 | } | |
| 204 | 236 | Err(e) => { | |
| 205 | 237 | let err_s = e.to_string(); | |
| 206 | 238 | log_buf.extend_from_slice(err_s.as_bytes()); | |
| 207 | - | Ok(finish(GateOutcome { passed: false, detail: Some(tail(err_s.as_bytes(), 4_000)) }, log_buf).await) | |
| 239 | + | let failure = classify::classify_migration_error(&err_s, None); | |
| 240 | + | Ok(finish( | |
| 241 | + | GateOutcome::failed(failure).with_log_ref(log_ref), | |
| 242 | + | log_buf, | |
| 243 | + | ).await) | |
| 208 | 244 | } | |
| 209 | 245 | } | |
| 210 | 246 | } | |
| @@ -271,7 +307,7 @@ fn shell_escape(s: &str) -> String { | |||
| 271 | 307 | format!("'{}'", s.replace('\'', "'\\''")) | |
| 272 | 308 | } | |
| 273 | 309 | ||
| 274 | - | async fn boot_smoke(ctx: &GateCtx) -> Result<GateOutcome> { | |
| 310 | + | async fn boot_smoke(ctx: &GateCtx, run_id: GateRunId) -> Result<GateOutcome> { | |
| 275 | 311 | let bin: Option<(String,)> = sqlx::query_as( | |
| 276 | 312 | "SELECT artifact_path FROM versions WHERE version = ?", | |
| 277 | 313 | ) | |
| @@ -279,7 +315,9 @@ async fn boot_smoke(ctx: &GateCtx) -> Result<GateOutcome> { | |||
| 279 | 315 | .fetch_optional(&ctx.pool) | |
| 280 | 316 | .await?; | |
| 281 | 317 | let Some((bin,)) = bin else { | |
| 282 | - | return Ok(GateOutcome { passed: false, detail: Some("no artifact for version".into()) }); | |
| 318 | + | return Ok(GateOutcome::blocked(GateBlocker::ArtifactMissing { | |
| 319 | + | version: ctx.version.clone(), | |
| 320 | + | })); | |
| 283 | 321 | }; | |
| 284 | 322 | ||
| 285 | 323 | // Lowest-bar smoke: start the binary and verify it stays up for a few | |
| @@ -302,34 +340,32 @@ async fn boot_smoke(ctx: &GateCtx) -> Result<GateOutcome> { | |||
| 302 | 340 | if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() { | |
| 303 | 341 | cmd.env("DATABASE_URL", scratch_url); | |
| 304 | 342 | } | |
| 343 | + | let log_path = gate_log_path(ctx, GateKind::BootSmoke); | |
| 344 | + | let log_ref = LogRef::new(&ctx.version, GateKind::BootSmoke); | |
| 305 | 345 | let mut child = match cmd.spawn() { | |
| 306 | 346 | Ok(c) => c, | |
| 307 | 347 | Err(e) => { | |
| 308 | - | persist_gate_log(ctx, "boot_smoke", format!("spawn: {e}").as_bytes(), &[]).await; | |
| 309 | - | return Ok(GateOutcome { passed: false, detail: Some(format!("spawn: {e}")) }); | |
| 348 | + | // Spawn failures get a one-off log line via LiveLog so the | |
| 349 | + | // on-disk file still exists for `GET /logs/...`. | |
| 350 | + | let mut log = LiveLog::open(ctx.events.clone(), run_id, log_path).await; | |
| 351 | + | log.write_chunk(format!("spawn: {e}\n").as_bytes()).await; | |
| 352 | + | log.close().await; | |
| 353 | + | return Ok(GateOutcome::failed(GateFailure::SpawnFailed { | |
| 354 | + | message: e.to_string(), | |
| 355 | + | }).with_log_ref(log_ref)); | |
| 310 | 356 | } | |
| 311 | 357 | }; | |
| 312 | 358 | ||
| 313 | - | // Drain stdout/stderr concurrently into in-memory buffers so the log | |
| 314 | - | // captures whatever the binary printed before exiting (or being killed | |
| 315 | - | // after the 3s smoke window). Without piping + draining, output goes to | |
| 316 | - | // sandod's own stdio and is lost. | |
| 317 | - | let mut stdout_h = child.stdout.take(); | |
| 318 | - | let mut stderr_h = child.stderr.take(); | |
| 319 | - | let stdout_task = tokio::spawn(async move { | |
| 320 | - | let mut buf = Vec::new(); | |
| 321 | - | if let Some(s) = stdout_h.as_mut() { | |
| 322 | - | let _ = s.read_to_end(&mut buf).await; | |
| 323 | - | } | |
| 324 | - | buf | |
| 325 | - | }); | |
| 326 | - | let stderr_task = tokio::spawn(async move { | |
| 327 | - | let mut buf = Vec::new(); | |
| 328 | - | if let Some(s) = stderr_h.as_mut() { | |
| 329 | - | let _ = s.read_to_end(&mut buf).await; | |
| 330 | - | } | |
| 331 | - | buf | |
| 332 | - | }); | |
| 359 | + | // The boot smoke window is 3s. Drain stdout/stderr concurrently through | |
| 360 | + | // a shared LiveLog sink so the operator sees panics/log lines stream in | |
| 361 | + | // real time before the kill, AND the on-disk log gets the full byte | |
| 362 | + | // stream for post-mortem reads. The drainers exit when their pipe | |
| 363 | + | // closes — which happens when the child exits naturally or after kill. | |
| 364 | + | let log = std::sync::Arc::new(tokio::sync::Mutex::new( | |
| 365 | + | LiveLog::open(ctx.events.clone(), run_id, log_path).await, | |
| 366 | + | )); | |
| 367 | + | let stdout_task = tokio::spawn(stream_into_log(child.stdout.take(), log.clone())); | |
| 368 | + | let stderr_task = tokio::spawn(stream_into_log(child.stderr.take(), log.clone())); | |
| 333 | 369 | ||
| 334 | 370 | tokio::time::sleep(std::time::Duration::from_secs(3)).await; | |
| 335 | 371 | ||
| @@ -337,22 +373,79 @@ async fn boot_smoke(ctx: &GateCtx) -> Result<GateOutcome> { | |||
| 337 | 373 | if exit.is_none() { | |
| 338 | 374 | let _ = child.kill().await; | |
| 339 | 375 | } | |
| 340 | - | // Both read tasks complete once the child's stdio is closed (kill closes | |
| 341 | - | // the pipes via the dropped Child on exit-after-kill). | |
| 342 | - | let stdout_buf = stdout_task.await.unwrap_or_default(); | |
| 343 | - | let stderr_buf = stderr_task.await.unwrap_or_default(); | |
| 344 | - | persist_gate_log(ctx, "boot_smoke", &stdout_buf, &stderr_buf).await; | |
| 376 | + | // The boot_smoke classifier looks at exit code only — the streamed | |
| 377 | + | // bytes already landed in the live log and the on-disk file for the | |
| 378 | + | // post-mortem reader. Await both drainers so every chunk is written before close. | |
| 379 | + | let _ = stdout_task.await; | |
| 380 | + | let _ = stderr_task.await; | |
| 381 | + | // Unique owner of the Arc at this point (both tasks dropped their clones). | |
| 382 | + | if let Ok(mutex) = std::sync::Arc::try_unwrap(log) { | |
| 383 | + | mutex.into_inner().close().await; | |
| 384 | + | } | |
| 345 | 385 | ||
| 346 | 386 | match exit { | |
| 347 | - | Some(status) => Ok(GateOutcome { | |
| 348 | - | passed: false, | |
| 349 | - | detail: Some(format!( | |
| 350 | - | "binary exited early: {status}\n{}", | |
| 351 | - | combined_tail(&stdout_buf, &stderr_buf, 4_000), | |
| 352 | - | )), | |
| 353 | - | }), | |
| 354 | - | None => Ok(GateOutcome { passed: true, detail: Some("stayed up for 3s".into()) }), | |
| 387 | + | Some(status) => { | |
| 388 | + | let failure = classify::classify_boot_smoke(status.code()); | |
| 389 | + | Ok(GateOutcome::failed(failure).with_log_ref(log_ref)) | |
| 390 | + | } | |
| 391 | + | None => Ok(GateOutcome::passed(PassNote::StayedUp { duration_s: 3 }) | |
| 392 | + | .with_log_ref(log_ref)), | |
| 393 | + | } | |
| 394 | + | } | |
| 395 | + | ||
| 396 | + | /// Drain `stream` into the shared `LiveLog` (which forwards each chunk to | |
| 397 | + | /// the on-disk log file AND broadcasts a `GateLogChunk` event), and return | |
| 398 | + | /// the concatenated bytes so the classifier can still operate on the full | |
| 399 | + | /// output post-hoc. | |
| 400 | + | async fn stream_into_log<R>( | |
| 401 | + | stream: Option<R>, | |
| 402 | + | log: std::sync::Arc<tokio::sync::Mutex<LiveLog>>, | |
| 403 | + | ) -> Vec<u8> | |
| 404 | + | where | |
| 405 | + | R: tokio::io::AsyncRead + Unpin + Send + 'static, | |
| 406 | + | { | |
| 407 | + | let mut total = Vec::new(); | |
| 408 | + | let Some(mut s) = stream else { return total }; | |
| 409 | + | let mut buf = [0u8; 4096]; | |
| 410 | + | loop { | |
| 411 | + | match s.read(&mut buf).await { | |
| 412 | + | Ok(0) => break, | |
| 413 | + | Err(_) => break, | |
| 414 | + | Ok(n) => { | |
| 415 | + | total.extend_from_slice(&buf[..n]); | |
| 416 | + | log.lock().await.write_chunk(&buf[..n]).await; | |
| 417 | + | } | |
| 418 | + | } | |
| 355 | 419 | } | |
| 420 | + | total | |
| 421 | + | } | |
| 422 | + | ||
| 423 | + | /// Drain an already-spawned child's stdout/stderr through a `LiveLog`, then | |
| 424 | + | /// return the captured stdout buffer, stderr buffer, and exit status. Shared | |
| 425 | + | /// by `cargo_test` (no deadline) and ad-hoc callers — `boot_smoke` rolls its | |
| 426 | + | /// own variant because of its 3s kill window. | |
| 427 | + | async fn stream_child_to_live_log( | |
| 428 | + | child: &mut tokio::process::Child, | |
| 429 | + | events: EventTx, | |
| 430 | + | run_id: GateRunId, | |
| 431 | + | log_path: PathBuf, | |
| 432 | + | ) -> Result<(Vec<u8>, Vec<u8>, std::process::ExitStatus)> { | |
| 433 | + | let log = std::sync::Arc::new(tokio::sync::Mutex::new( | |
| 434 | + | LiveLog::open(events, run_id, log_path).await, | |
| 435 | + | )); | |
| 436 | + | let stdout_task = tokio::spawn(stream_into_log(child.stdout.take(), log.clone())); | |
| 437 | + | let stderr_task = tokio::spawn(stream_into_log(child.stderr.take(), log.clone())); | |
| 438 | + | let status = child.wait().await?; | |
| 439 | + | let stdout_buf = stdout_task.await.unwrap_or_default(); | |
| 440 | + | let stderr_buf = stderr_task.await.unwrap_or_default(); | |
| 441 | + | if let Ok(mutex) = std::sync::Arc::try_unwrap(log) { | |
| 442 | + | mutex.into_inner().close().await; | |
| 443 | + | } | |
| 444 | + | Ok((stdout_buf, stderr_buf, status)) | |
| 445 | + | } | |
| 446 | + | ||
| 447 | + | fn gate_log_path(ctx: &GateCtx, gate: GateKind) -> PathBuf { | |
| 448 | + | ctx.cfg.logs_root.join(ctx.version.to_string()).join(format!("{}.log", gate.as_str())) | |
| 356 | 449 | } | |
| 357 | 450 | ||
| 358 | 451 | async fn burn_in(ctx: &GateCtx, hours: u32) -> Result<GateOutcome> { | |
| @@ -367,19 +460,19 @@ async fn burn_in(ctx: &GateCtx, hours: u32) -> Result<GateOutcome> { | |||
| 367 | 460 | .await? | |
| 368 | 461 | .flatten(); | |
| 369 | 462 | let Some(started) = started else { | |
| 370 | - | return Ok(GateOutcome { passed: false, detail: Some("burn-in clock not started".into()) }); | |
| 463 | + | return Ok(GateOutcome::blocked(GateBlocker::BurnInClockNotStarted)); | |
| 371 | 464 | }; | |
| 372 | 465 | let started = chrono::DateTime::parse_from_rfc3339(&started)?.with_timezone(&Utc); | |
| 373 | 466 | let elapsed = Utc::now() - started; | |
| 374 | 467 | let needed = chrono::Duration::hours(hours as i64); | |
| 375 | 468 | if elapsed >= needed { | |
| 376 | - | Ok(GateOutcome { passed: true, detail: Some(format!("{} hours elapsed", elapsed.num_hours())) }) | |
| 469 | + | Ok(GateOutcome::passed(PassNote::BurnInElapsed { hours: elapsed.num_hours() as u32 })) | |
| 377 | 470 | } else { | |
| 378 | - | let remaining = needed - elapsed; | |
| 379 | - | Ok(GateOutcome { | |
| 380 | - | passed: false, | |
| 381 | - | detail: Some(format!("{} hours remaining of {hours}", remaining.num_hours())), | |
| 382 | - | }) | |
| 471 | + | let remaining = (needed - elapsed).num_hours().max(0) as u32; | |
| 472 | + | Ok(GateOutcome::blocked(GateBlocker::BurnInRemaining { | |
| 473 | + | hours_remaining: remaining, | |
| 474 | + | hours_total: hours, | |
| 475 | + | })) | |
| 383 | 476 | } | |
| 384 | 477 | } | |
| 385 | 478 | ||
| @@ -387,40 +480,24 @@ async fn manual_confirm(ctx: &GateCtx) -> Result<GateOutcome> { | |||
| 387 | 480 | // Pass iff a row in gate_runs exists with passed=1 for this (tier, version, manual_confirm) | |
| 388 | 481 | // that was inserted out-of-band by an operator action. Since the harness inserts the | |
| 389 | 482 | // in-flight row itself, look for a prior confirmation row. | |
| 390 | - | let prior: Option<i64> = sqlx::query_scalar( | |
| 391 | - | "SELECT COUNT(*) FROM gate_runs | |
| 392 | - | WHERE tier = ? AND version = ? AND gate_kind = 'manual_confirm' AND passed = 1", | |
| 483 | + | let prior_at: Option<String> = sqlx::query_scalar( | |
| 484 | + | "SELECT finished_at FROM gate_runs | |
| 485 | + | WHERE tier = ? AND version = ? AND gate_kind = 'manual_confirm' AND passed = 1 | |
| 486 | + | ORDER BY id DESC LIMIT 1", | |
| 393 | 487 | ) | |
| 394 | 488 | .bind(&ctx.tier) | |
| 395 | 489 | .bind(&ctx.version) | |
| 396 | 490 | .fetch_optional(&ctx.pool) | |
| 397 | 491 | .await?; | |
| 398 | - | let passed = prior.unwrap_or(0) > 0; | |
| 399 | - | Ok(GateOutcome { | |
| 400 | - | passed, | |
| 401 | - | detail: if passed { None } else { Some("waiting on operator confirmation".into()) }, | |
| 402 | - | }) | |
| 403 | - | } | |
| 404 | - | ||
| 405 | - | fn tail(buf: &[u8], max: usize) -> String { | |
| 406 | - | let s = String::from_utf8_lossy(buf); | |
| 407 | - | if s.len() <= max { s.into_owned() } else { format!("...{}", &s[s.len() - max..]) } | |
| 408 | - | } | |
| 409 | - | ||
| 410 | - | /// Tail combining stdout + stderr. Phase A doesn't preserve true interleaving | |
| 411 | - | /// (the streams were captured separately); we concatenate stderr after stdout | |
| 412 | - | /// so the failing test output (cargo writes it to stdout) is no longer hidden | |
| 413 | - | /// behind the 4KB stderr tail. Full unsliced output lives in the on-disk log. | |
| 414 | - | fn combined_tail(stdout: &[u8], stderr: &[u8], max: usize) -> String { | |
| 415 | - | let mut joined = Vec::with_capacity(stdout.len() + stderr.len() + 32); | |
| 416 | - | joined.extend_from_slice(b"==== stdout ====\n"); | |
| 417 | - | joined.extend_from_slice(stdout); | |
| 418 | - | if !stdout.last().is_some_and(|b| *b == b'\n') { | |
| 419 | - | joined.push(b'\n'); | |
| 492 | + | match prior_at { | |
| 493 | + | Some(at_str) => { | |
| 494 | + | let at = chrono::DateTime::parse_from_rfc3339(&at_str) | |
| 495 | + | .map(|d| d.with_timezone(&Utc)) | |
| 496 | + | .unwrap_or_else(|_| Utc::now()); | |
| 497 | + | Ok(GateOutcome::passed(PassNote::OperatorConfirmed { at })) | |
| 498 | + | } | |
| 499 | + | None => Ok(GateOutcome::blocked(GateBlocker::AwaitingOperatorConfirmation)), | |
| 420 | 500 | } | |
| 421 | - | joined.extend_from_slice(b"==== stderr ====\n"); | |
| 422 | - | joined.extend_from_slice(stderr); |
Lines truncated
| @@ -0,0 +1,27 @@ | |||
| 1 | + | //! sando-daemon as a library. | |
| 2 | + | //! | |
| 3 | + | //! Exposes every module so the `sandod` binary (in `src/main.rs`) and | |
| 4 | + | //! the `sando` TUI (in `../tui`) can share wire-facing types — events, | |
| 5 | + | //! outcomes, domain newtypes — by import rather than duplication. | |
| 6 | + | //! | |
| 7 | + | //! External consumers should only need `domain`, `outcome`, and `events`; | |
| 8 | + | //! the rest are exposed because the bin re-uses them via this crate. | |
| 9 | + | ||
| 10 | + | pub mod backup; | |
| 11 | + | pub mod build; | |
| 12 | + | pub mod classify; | |
| 13 | + | pub mod config; | |
| 14 | + | pub mod db; | |
| 15 | + | pub mod deploy; | |
| 16 | + | pub mod domain; | |
| 17 | + | pub mod error; | |
| 18 | + | pub mod events; | |
| 19 | + | pub mod gates; | |
| 20 | + | pub mod git; | |
| 21 | + | pub mod live_log; | |
| 22 | + | pub mod metrics; | |
| 23 | + | pub mod outcome; | |
| 24 | + | pub mod routes; | |
| 25 | + | pub mod state; | |
| 26 | + | pub mod sync; | |
| 27 | + | pub mod topology; |
| @@ -0,0 +1,160 @@ | |||
| 1 | + | //! Live gate-log sink. | |
| 2 | + | //! | |
| 3 | + | //! Wraps the on-disk per-run log file with chunk broadcasting. Gate | |
| 4 | + | //! runners (`cargo_test`, `boot_smoke`) push their stdout/stderr through | |
| 5 | + | //! `LiveLog::write_chunk` as it arrives; the sink: | |
| 6 | + | //! 1. appends to the on-disk file at `<logs_root>/<version>/<gate>.log` | |
| 7 | + | //! so the post-mortem `GET /logs/...` route still has the full byte | |
| 8 | + | //! stream; | |
| 9 | + | //! 2. broadcasts a `GateLogChunk` event with a UTF-8-lossy slice of the | |
| 10 | + | //! same bytes, so the TUI sees the tail in real time. | |
| 11 | + | //! | |
| 12 | + | //! Chunks reflect tokio read boundaries — they are NOT line-aligned. | |
| 13 | + | //! Consumers that want lines must reassemble; the on-disk log preserves | |
| 14 | + | //! the exact byte stream for that purpose. | |
| 15 | + | ||
| 16 | + | use crate::domain::GateRunId; | |
| 17 | + | use crate::events::{self, Event, EventTx}; | |
| 18 | + | use std::path::{Path, PathBuf}; | |
| 19 | + | use tokio::fs::File; | |
| 20 | + | use tokio::io::AsyncWriteExt; | |
| 21 | + | ||
| 22 | + | pub struct LiveLog { | |
| 23 | + | file: Option<File>, | |
| 24 | + | /// Kept for diagnostic logging when file IO is unavailable. | |
| 25 | + | path: PathBuf, | |
| 26 | + | events: EventTx, | |
| 27 | + | run_id: GateRunId, | |
| 28 | + | seq: u32, | |
| 29 | + | } | |
| 30 | + | ||
| 31 | + | impl LiveLog { | |
| 32 | + | /// Open the log file for append-streaming. Creates parent directories | |
| 33 | + | /// as needed. If the file can't be opened, the sink degrades to | |
| 34 | + | /// "broadcast only" — chunks still go out as `GateLogChunk` events; | |
| 35 | + | /// the missing on-disk log is logged as a warning. This matches the | |
| 36 | + | /// pre-step-6 invariant: a broken log dir doesn't turn a passing gate | |
| 37 | + | /// red. | |
| 38 | + | pub async fn open(events: EventTx, run_id: GateRunId, path: PathBuf) -> Self { | |
| 39 | + | let file = open_for_append(&path).await; | |
| 40 | + | Self { file, path, events, run_id, seq: 0 } | |
| 41 | + | } | |
| 42 | + | ||
| 43 | + | /// Append `bytes` to the on-disk log and broadcast a `GateLogChunk`. | |
| 44 | + | /// The broadcast goes out even if the disk write fails — operators | |
| 45 | + | /// watching live still see the chunk. | |
| 46 | + | pub async fn write_chunk(&mut self, bytes: &[u8]) { | |
| 47 | + | if bytes.is_empty() { return; } | |
| 48 | + | if let Some(f) = self.file.as_mut() { | |
| 49 | + | if let Err(e) = f.write_all(bytes).await { | |
| 50 | + | tracing::warn!(error = %e, path = %self.path.display(), "live log write failed"); | |
| 51 | + | self.file = None; | |
| 52 | + | } | |
| 53 | + | } | |
| 54 | + | let text = String::from_utf8_lossy(bytes).into_owned(); | |
| 55 | + | events::emit(&self.events, Event::GateLogChunk { | |
| 56 | + | run_id: self.run_id, | |
| 57 | + | seq: self.seq, | |
| 58 | + | text, | |
| 59 | + | }); | |
| 60 | + | self.seq = self.seq.saturating_add(1); | |
| 61 | + | } | |
| 62 | + | ||
| 63 | + | /// Flush and close the file. Best-effort: errors are logged. | |
| 64 | + | pub async fn close(mut self) { | |
| 65 | + | if let Some(mut f) = self.file.take() { | |
| 66 | + | if let Err(e) = f.flush().await { | |
| 67 | + | tracing::warn!(error = %e, path = %self.path.display(), "live log flush failed"); | |
| 68 | + | } | |
| 69 | + | } | |
| 70 | + | } | |
| 71 | + | ||
| 72 | + | pub fn run_id(&self) -> GateRunId { self.run_id } | |
| 73 | + | pub fn chunks_emitted(&self) -> u32 { self.seq } | |
| 74 | + | } | |
| 75 | + | ||
| 76 | + | async fn open_for_append(path: &Path) -> Option<File> { | |
| 77 | + | if let Some(parent) = path.parent() { | |
| 78 | + | if let Err(e) = tokio::fs::create_dir_all(parent).await { | |
| 79 | + | tracing::warn!(error = %e, dir = %parent.display(), "could not create gate log dir"); | |
| 80 | + | return None; | |
| 81 | + | } | |
| 82 | + | } | |
| 83 | + | match tokio::fs::OpenOptions::new() | |
| 84 | + | .create(true) | |
| 85 | + | .append(true) | |
| 86 | + | .open(path) | |
| 87 | + | .await | |
| 88 | + | { | |
| 89 | + | Ok(f) => Some(f), | |
| 90 | + | Err(e) => { | |
| 91 | + | tracing::warn!(error = %e, path = %path.display(), "could not open gate log file"); | |
| 92 | + | None | |
| 93 | + | } | |
| 94 | + | } | |
| 95 | + | } | |
| 96 | + | ||
| 97 | + | #[cfg(test)] | |
| 98 | + | mod tests { | |
| 99 | + | use super::*; | |
| 100 | + | use crate::events::EventEnvelope; | |
| 101 | + | ||
| 102 | + | #[tokio::test] | |
| 103 | + | async fn write_chunk_emits_event_and_appends_to_file() { | |
| 104 | + | let dir = tempfile::tempdir().unwrap(); | |
| 105 | + | let path = dir.path().join("nested/test.log"); | |
| 106 | + | let events = events::channel(); | |
| 107 | + | let mut rx = events.subscribe(); | |
| 108 | + | let mut log = LiveLog::open(events.clone(), GateRunId(7), path.clone()).await; | |
| 109 | + | log.write_chunk(b"hello ").await; | |
| 110 | + | log.write_chunk(b"world\n").await; | |
| 111 | + | log.close().await; | |
| 112 | + | ||
| 113 | + | let on_disk = tokio::fs::read_to_string(&path).await.unwrap(); | |
| 114 | + | assert_eq!(on_disk, "hello world\n"); | |
| 115 | + | ||
| 116 | + | let mut chunks: Vec<String> = Vec::new(); | |
| 117 | + | while let Ok(env) = rx.try_recv() { | |
| 118 | + | if let Event::GateLogChunk { run_id, seq, text } = env.event { | |
| 119 | + | assert_eq!(run_id, GateRunId(7)); | |
| 120 | + | assert_eq!(seq as usize, chunks.len()); | |
| 121 | + | chunks.push(text); | |
| 122 | + | } | |
| 123 | + | } | |
| 124 | + | assert_eq!(chunks, vec!["hello ".to_string(), "world\n".to_string()]); | |
| 125 | + | } | |
| 126 | + | ||
| 127 | + | #[tokio::test] | |
| 128 | + | async fn write_chunk_emits_even_when_file_cannot_be_opened() { | |
| 129 | + | // Point at a path whose parent is a regular file — create_dir_all | |
| 130 | + | // will fail. The broadcast must still fire. | |
| 131 | + | let dir = tempfile::tempdir().unwrap(); | |
| 132 | + | let blocker = dir.path().join("blocker"); | |
| 133 | + | tokio::fs::write(&blocker, b"i am a file, not a dir").await.unwrap(); | |
| 134 | + | let path = blocker.join("inside.log"); // parent is a file | |
| 135 | + | let events = events::channel(); | |
| 136 | + | let mut rx = events.subscribe(); | |
| 137 | + | let mut log = LiveLog::open(events.clone(), GateRunId(1), path).await; | |
| 138 | + | log.write_chunk(b"streamed despite no file\n").await; | |
| 139 | + | log.close().await; | |
| 140 | + | ||
| 141 | + | let env: EventEnvelope = rx.try_recv().unwrap(); | |
| 142 | + | match env.event { | |
| 143 | + | Event::GateLogChunk { text, .. } => assert_eq!(text, "streamed despite no file\n"), | |
| 144 | + | _ => panic!("expected GateLogChunk"), | |
| 145 | + | } | |
| 146 | + | } | |
| 147 | + | ||
| 148 | + | #[tokio::test] | |
| 149 | + | async fn empty_chunk_is_noop() { | |
| 150 | + | let dir = tempfile::tempdir().unwrap(); | |
| 151 | + | let path = dir.path().join("empty.log"); | |
| 152 | + | let events = events::channel(); | |
| 153 | + | let mut rx = events.subscribe(); | |
| 154 | + | let mut log = LiveLog::open(events.clone(), GateRunId(9), path).await; | |
| 155 | + | log.write_chunk(b"").await; | |
| 156 | + | assert_eq!(log.chunks_emitted(), 0); | |
| 157 | + | assert!(rx.try_recv().is_err()); | |
| 158 | + | log.close().await; | |
| 159 | + | } | |
| 160 | + | } |
| @@ -1,23 +1,9 @@ | |||
| 1 | 1 | use anyhow::Result; | |
| 2 | + | use sando_daemon::{config, db, events, git, metrics, routes, state, sync, topology}; | |
| 2 | 3 | use std::net::SocketAddr; | |
| 3 | 4 | use std::path::Path; | |
| 4 | 5 | use std::sync::Arc; | |
| 5 | 6 | ||
| 6 | - | mod backup; | |
| 7 | - | mod build; | |
| 8 | - | mod config; | |
| 9 | - | mod db; | |
| 10 | - | mod deploy; | |
| 11 | - | mod error; | |
| 12 | - | mod events; | |
| 13 | - | mod gates; | |
| 14 | - | mod git; | |
| 15 | - | mod metrics; | |
| 16 | - | mod routes; | |
| 17 | - | mod state; | |
| 18 | - | mod sync; | |
| 19 | - | mod topology; | |
| 20 | - | ||
| 21 | 7 | #[tokio::main] | |
| 22 | 8 | async fn main() -> Result<()> { | |
| 23 | 9 | tracing_subscriber::fmt() | |
| @@ -27,9 +13,10 @@ async fn main() -> Result<()> { | |||
| 27 | 13 | .with_writer(std::io::stderr) | |
| 28 | 14 | .with_env_filter( | |
| 29 | 15 | tracing_subscriber::EnvFilter::try_from_default_env() | |
| 30 | - | // bin target name is `sandod`, NOT the package name `sando-daemon` — | |
| 31 | - | // `module_path!()` uses the binary's crate name, so events come from `sandod::*`. | |
| 32 | - | .unwrap_or_else(|_| "sandod=info,tower_http=info".into()), | |
| 16 | + | // Modules live under the library crate `sando_daemon` (since | |
| 17 | + | // the step-5 lib/bin split). `sandod` is kept for any | |
| 18 | + | // top-level events that originate in main.rs itself. | |
| 19 | + | .unwrap_or_else(|_| "sando_daemon=info,sandod=info,tower_http=info".into()), | |
| 33 | 20 | ) | |
| 34 | 21 | .init(); | |
| 35 | 22 |
| @@ -0,0 +1,386 @@ | |||
| 1 | + | //! Typed gate outcomes. | |
| 2 | + | //! | |
| 3 | + | //! Replaces the `(passed: bool, detail: Option<String>)` pair on | |
| 4 | + | //! `GateOutcome`. The point is to push failure classification into the | |
| 5 | + | //! type itself: a `GateFailure::MigrationDrift { migration }` is what it | |
| 6 | + | //! says, not a string the operator has to parse. See | |
| 7 | + | //! `plans/observability.md` for the full argument. | |
| 8 | + | //! | |
| 9 | + | //! The variants here describe what the gate runner actually observed. | |
| 10 | + | //! Mapping raw process output (stderr tails, exit codes) to these | |
| 11 | + | //! variants is the classifier's job — `classify.rs`. | |
| 12 | + | ||
| 13 | + | use crate::domain::{GateKind, Version}; | |
| 14 | + | use chrono::{DateTime, Utc}; | |
| 15 | + | use serde::{Deserialize, Serialize}; | |
| 16 | + | ||
| 17 | + | /// A gate's result, persisted to `gate_runs.outcome_json` and emitted | |
| 18 | + | /// over WS in `GateDone`. | |
| 19 | + | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 20 | + | pub struct GateOutcome { | |
| 21 | + | pub status: GateStatus, | |
| 22 | + | /// Relative path under `cfg.logs_root` to the persisted stdout/stderr | |
| 23 | + | /// for this run. `None` for gates that don't produce process output | |
| 24 | + | /// (burn_in, manual_confirm). | |
| 25 | + | #[serde(skip_serializing_if = "Option::is_none", default)] | |
| 26 | + | pub log_ref: Option<LogRef>, | |
| 27 | + | } | |
| 28 | + | ||
| 29 | + | impl GateOutcome { | |
| 30 | + | pub fn passed(note: PassNote) -> Self { | |
| 31 | + | Self { status: GateStatus::Passed { note }, log_ref: None } | |
| 32 | + | } | |
| 33 | + | pub fn failed(failure: GateFailure) -> Self { | |
| 34 | + | Self { status: GateStatus::Failed { failure }, log_ref: None } | |
| 35 | + | } | |
| 36 | + | pub fn blocked(blocker: GateBlocker) -> Self { | |
| 37 | + | Self { status: GateStatus::Blocked { blocker }, log_ref: None } | |
| 38 | + | } | |
| 39 | + | pub fn with_log_ref(mut self, log_ref: LogRef) -> Self { | |
| 40 | + | self.log_ref = Some(log_ref); | |
| 41 | + | self | |
| 42 | + | } | |
| 43 | + | ||
| 44 | + | /// Shadow column: until migration 004 drops `passed`, every write | |
| 45 | + | /// also populates the legacy boolean. `Blocked` reads as failing | |
| 46 | + | /// because gates that are blocked have not satisfied the pipeline. | |
| 47 | + | pub fn legacy_passed(&self) -> bool { | |
| 48 | + | matches!(self.status, GateStatus::Passed { .. }) | |
| 49 | + | } | |
| 50 | + | ||
| 51 | + | /// Shadow column: human-readable single-line summary for the legacy | |
| 52 | + | /// `gate_runs.detail` column. Goes away when migration 004 drops it. | |
| 53 | + | pub fn legacy_detail(&self) -> String { | |
| 54 | + | match &self.status { | |
| 55 | + | GateStatus::Passed { note } => note.summary(), | |
| 56 | + | GateStatus::Failed { failure } => failure.summary(), | |
| 57 | + | GateStatus::Blocked { blocker } => blocker.summary(), | |
| 58 | + | } | |
| 59 | + | } | |
| 60 | + | ||
| 61 | + | /// The high-level status word for the `gate_runs.status` column. | |
| 62 | + | pub fn status_str(&self) -> &'static str { | |
| 63 | + | match self.status { | |
| 64 | + | GateStatus::Passed { .. } => "passed", | |
| 65 | + | GateStatus::Failed { .. } => "failed", | |
| 66 | + | GateStatus::Blocked { .. } => "blocked", | |
| 67 | + | } | |
| 68 | + | } | |
| 69 | + | } | |
| 70 | + | ||
| 71 | + | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 72 | + | #[serde(tag = "kind", rename_all = "snake_case")] | |
| 73 | + | pub enum GateStatus { | |
| 74 | + | /// Gate ran and succeeded. The note carries gate-specific evidence | |
| 75 | + | /// (e.g. `TestsPassed { duration_s }`). | |
| 76 | + | Passed { note: PassNote }, | |
| 77 | + | /// Gate ran and failed. Two-layer tag: outer `kind = "failed"`, inner | |
| 78 | + | /// `failure.kind` names the classified variant. If no classifier | |
| 79 | + | /// matched, that's `unclassified`. | |
| 80 | + | Failed { failure: GateFailure }, | |
| 81 | + | /// Gate cannot run yet. Burn-in clock not started, scratch DB not | |
| 82 | + | /// configured, backup missing — pre-conditions the operator can fix | |
| 83 | + | /// out of band. Distinguished from `Failed` so the TUI can render | |
| 84 | + | /// these yellow rather than red. | |
| 85 | + | Blocked { blocker: GateBlocker }, | |
| 86 | + | } | |
| 87 | + | ||
| 88 | + | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 89 | + | #[serde(tag = "kind", rename_all = "snake_case")] | |
| 90 | + | pub enum PassNote { | |
| 91 | + | /// `boot_smoke` — the binary stayed up for the smoke window. | |
| 92 | + | StayedUp { duration_s: u32 }, | |
| 93 | + | /// `burn_in` — the configured number of hours have elapsed since | |
| 94 | + | /// the gate's clock started. | |
| 95 | + | BurnInElapsed { hours: u32 }, | |
| 96 | + | /// `migration_dry_run` — scratch DB restored from `backup_path` and | |
| 97 | + | /// every migration ran without error. | |
| 98 | + | Migrated { backup_path: String }, | |
| 99 | + | /// `cargo_test` — `cargo test --release` exited 0. | |
| 100 | + | TestsPassed { duration_s: u32 }, | |
| 101 | + | /// `manual_confirm` — an operator inserted a passing row out-of-band. | |
| 102 | + | OperatorConfirmed { at: DateTime<Utc> }, | |
| 103 | + | /// Legacy rows backfilled from the pre-typed schema. Carries the | |
| 104 | + | /// original `detail` string so nothing is lost. | |
| 105 | + | Legacy { text: String }, | |
| 106 | + | } | |
| 107 | + | ||
| 108 | + | impl PassNote { | |
| 109 | + | pub fn summary(&self) -> String { | |
| 110 | + | match self { | |
| 111 | + | PassNote::StayedUp { duration_s } => format!("stayed up for {duration_s}s"), | |
| 112 | + | PassNote::BurnInElapsed { hours } => format!("{hours} hours elapsed"), | |
| 113 | + | PassNote::Migrated { backup_path } => format!("restored {backup_path} + migrated"), | |
| 114 | + | PassNote::TestsPassed { duration_s } => format!("tests passed in {duration_s}s"), | |
| 115 | + | PassNote::OperatorConfirmed { at } => format!("operator confirmed at {at}"), | |
| 116 | + | PassNote::Legacy { text } => text.clone(), | |
| 117 | + | } | |
| 118 | + | } | |
| 119 | + | } | |
| 120 | + | ||
| 121 | + | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 122 | + | #[serde(tag = "kind", rename_all = "snake_case")] | |
| 123 | + | pub enum GateBlocker { | |
| 124 | + | /// `burn_in`: the tier's `tier_state.burn_in_started_at` is NULL. | |
| 125 | + | BurnInClockNotStarted, | |
| 126 | + | /// `burn_in`: clock running but not enough time elapsed yet. | |
| 127 | + | BurnInRemaining { hours_remaining: u32, hours_total: u32 }, | |
| 128 | + | /// `manual_confirm`: no out-of-band passing row exists for this | |
| 129 | + | /// (tier, version). | |
| 130 | + | AwaitingOperatorConfirmation, | |
| 131 | + | /// `migration_dry_run`: no row in `backups` to restore from. | |
| 132 | + | NoBackupAvailable, | |
| 133 | + | /// `migration_dry_run` / `boot_smoke` / `cargo_test`: daemon config | |
| 134 | + | /// has no `scratch_db_url`. | |
| 135 | + | ScratchDbUrlUnset, | |
| 136 | + | /// `boot_smoke`: no `artifact_path` in `versions` for this version. | |
| 137 | + | ArtifactMissing { version: Version }, | |
| 138 | + | } | |
| 139 | + | ||
| 140 | + | impl GateBlocker { | |
| 141 | + | pub fn summary(&self) -> String { | |
| 142 | + | match self { | |
| 143 | + | GateBlocker::BurnInClockNotStarted => "burn-in clock not started".into(), | |
| 144 | + | GateBlocker::BurnInRemaining { hours_remaining, hours_total } => | |
| 145 | + | format!("{hours_remaining} hours remaining of {hours_total}"), | |
| 146 | + | GateBlocker::AwaitingOperatorConfirmation => "waiting on operator confirmation".into(), | |
| 147 | + | GateBlocker::NoBackupAvailable => "no backup fetched; call /backup/fetch first".into(), | |
| 148 | + | GateBlocker::ScratchDbUrlUnset => "scratch_db_url unset in daemon config".into(), | |
| 149 | + | GateBlocker::ArtifactMissing { version } => format!("no artifact for version {version}"), | |
| 150 | + | } | |
| 151 | + | } | |
| 152 | + | } | |
| 153 | + | ||
| 154 | + | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 155 | + | #[serde(tag = "kind", rename_all = "snake_case")] | |
| 156 | + | pub enum GateFailure { | |
| 157 | + | /// `cargo_test` exited non-zero. `failed_count` may be 0 if the | |
| 158 | + | /// classifier couldn't parse the count (e.g. compile error). | |
| 159 | + | CargoTest { failed_count: u32, first_failed: Option<String> }, | |
| 160 | + | /// `migration_dry_run`: a migration that was previously applied is | |
| 161 | + | /// no longer present in the resolved migrations directory. | |
| 162 | + | MigrationDrift { migration: String }, | |
| 163 | + | /// `migration_dry_run`: a migration that was previously applied has | |
| 164 | + | /// been modified (checksum mismatch). | |
| 165 | + | MigrationModified { migration: String }, | |
| 166 | + | /// `migration_dry_run`: postgres rejected a migration's SQL. | |
| 167 | + | MigrationSqlError { migration: String, sqlstate: Option<String> }, | |
| 168 | + | /// `migration_dry_run`: scratch DB reset or dump restore failed. | |
| 169 | + | RestoreFailed { reason: String }, | |
| 170 | + | /// `boot_smoke`: binary exited with a non-zero status during the | |
| 171 | + | /// smoke window. Most likely a panic; `exit_code` carries the OS | |
| 172 | + | /// status when one is available. | |
| 173 | + | BootPanic { exit_code: Option<i32> }, | |
| 174 | + | /// `boot_smoke`: binary exited 0 before the smoke window elapsed. | |
| 175 | + | BootExitedEarly { exit_code: Option<i32> }, | |
| 176 | + | /// `cargo_test` / `boot_smoke`: tokio could not spawn the child. | |
| 177 | + | SpawnFailed { message: String }, | |
| 178 | + | /// Gate took longer than the configured ceiling. | |
| 179 | + | Timeout { gate: GateKind, after_s: u32 }, | |
| 180 | + | /// Classifier could not match the output to any known variant. The | |
| 181 | + | /// `log_ref` on the enclosing `GateOutcome` is the diagnostic path. | |
| 182 | + | Unclassified { legacy_detail: Option<String> }, | |
| 183 | + | } | |
| 184 | + | ||
| 185 | + | impl GateFailure { | |
| 186 | + | pub fn summary(&self) -> String { | |
| 187 | + | match self { | |
| 188 | + | GateFailure::CargoTest { failed_count, first_failed: Some(name) } => | |
| 189 | + | format!("{failed_count} test(s) failed; first: {name}"), | |
| 190 | + | GateFailure::CargoTest { failed_count, first_failed: None } => | |
| 191 | + | format!("{failed_count} test(s) failed"), | |
| 192 | + | GateFailure::MigrationDrift { migration } => | |
| 193 | + | format!("migration {migration} previously applied but missing"), | |
| 194 | + | GateFailure::MigrationModified { migration } => | |
| 195 | + | format!("migration {migration} previously applied but modified"), | |
| 196 | + | GateFailure::MigrationSqlError { migration, sqlstate: Some(s) } => | |
| 197 | + | format!("migration {migration} sql error ({s})"), | |
| 198 | + | GateFailure::MigrationSqlError { migration, sqlstate: None } => | |
| 199 | + | format!("migration {migration} sql error"), | |
| 200 | + | GateFailure::RestoreFailed { reason } => format!("restore: {reason}"), | |
| 201 | + | GateFailure::BootPanic { exit_code: Some(c) } => format!("binary panicked: exit {c}"), | |
| 202 | + | GateFailure::BootPanic { exit_code: None } => "binary panicked".into(), | |
| 203 | + | GateFailure::BootExitedEarly { exit_code: Some(c) } => format!("binary exited early: exit {c}"), | |
| 204 | + | GateFailure::BootExitedEarly { exit_code: None } => "binary exited early".into(), | |
| 205 | + | GateFailure::SpawnFailed { message } => format!("spawn: {message}"), | |
| 206 | + | GateFailure::Timeout { gate, after_s } => format!("{gate} timed out after {after_s}s"), | |
| 207 | + | GateFailure::Unclassified { legacy_detail: Some(d) } => d.clone(), | |
| 208 | + | GateFailure::Unclassified { legacy_detail: None } => "unclassified failure".into(), | |
| 209 | + | } | |
| 210 | + | } | |
| 211 | + | } | |
| 212 | + | ||
| 213 | + | // --------------------------------------------------------------------- | |
| 214 | + | // Deploy outcomes (step 7) | |
| 215 | + | // --------------------------------------------------------------------- | |
| 216 | + | ||
| 217 | + | /// Typed outcome of one node-deploy attempt. Stored as `outcome_json` in | |
| 218 | + | /// the `deploys` table and emitted in `Event::DeployFailed` so consumers | |
| 219 | + | /// can distinguish a node-unreachable error (operator: check the box) | |
| 220 | + | /// from rsync mid-transfer corruption (operator: check disk/network). | |
| 221 | + | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 222 | + | pub struct DeployOutcome { | |
| 223 | + | pub status: DeployStatus, | |
| 224 | + | } | |
| 225 | + | ||
| 226 | + | impl DeployOutcome { | |
| 227 | + | pub fn ok() -> Self { Self { status: DeployStatus::Ok } } | |
| 228 | + | pub fn failed(failure: DeployFailureKind) -> Self { | |
| 229 | + | Self { status: DeployStatus::Failed { failure } } | |
| 230 | + | } | |
| 231 | + | pub fn in_progress() -> Self { Self { status: DeployStatus::InProgress } } | |
| 232 | + | ||
| 233 | + | /// `'in_progress' | 'ok' | 'failed'` — the value of the legacy | |
| 234 | + | /// `deploys.outcome` column. | |
| 235 | + | pub fn status_str(&self) -> &'static str { | |
| 236 | + | match self.status { | |
| 237 | + | DeployStatus::InProgress => "in_progress", | |
| 238 | + | DeployStatus::Ok => "ok", | |
| 239 | + | DeployStatus::Failed { .. } => "failed", | |
| 240 | + | } | |
| 241 | + | } | |
| 242 | + | } | |
| 243 | + | ||
| 244 | + | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 245 | + | #[serde(tag = "kind", rename_all = "snake_case")] | |
| 246 | + | pub enum DeployStatus { | |
| 247 | + | InProgress, | |
| 248 | + | Ok, | |
| 249 | + | Failed { failure: DeployFailureKind }, | |
| 250 | + | } | |
| 251 | + | ||
| 252 | + | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 253 | + | #[serde(tag = "kind", rename_all = "snake_case")] | |
| 254 | + | pub enum DeployFailureKind { | |
| 255 | + | /// SSH to the node failed before any state changed. Typically a dead | |
| 256 | + | /// host, network partition, or stale known_hosts. | |
| 257 | + | NodeUnreachable { detail: String }, | |
| 258 | + | /// rsync exited non-zero mid-transfer. The on-target release dir may | |
| 259 | + | /// be partially populated, but the `current` symlink is untouched. | |
| 260 | + | RsyncFailed { detail: String }, | |
| 261 | + | /// Files copied successfully but the atomic symlink swap step | |
| 262 | + | /// failed. The new release is on disk; the service is still running | |
| 263 | + | /// the old one. | |
| 264 | + | SymlinkSwapFailed { detail: String }, | |
| 265 | + | /// Symlink swapped but `systemctl reload-or-restart` returned | |
| 266 | + | /// non-zero. The new code is current but the service may have | |
| 267 | + | /// crashed on startup. | |
| 268 | + | ServiceRestartFailed { detail: String }, | |
| 269 | + | /// Classifier couldn't match the error to a known variant. The full | |
| 270 | + | /// anyhow chain is in `detail`. | |
| 271 | + | Unclassified { detail: String }, | |
| 272 | + | } | |
| 273 | + | ||
| 274 | + | impl DeployFailureKind { | |
| 275 | + | pub fn summary(&self) -> String { | |
| 276 | + | match self { | |
| 277 | + | DeployFailureKind::NodeUnreachable { detail } => format!("node unreachable: {detail}"), | |
| 278 | + | DeployFailureKind::RsyncFailed { detail } => format!("rsync: {detail}"), | |
| 279 | + | DeployFailureKind::SymlinkSwapFailed { detail } => format!("symlink swap: {detail}"), | |
| 280 | + | DeployFailureKind::ServiceRestartFailed { detail } => format!("service restart: {detail}"), | |
| 281 | + | DeployFailureKind::Unclassified { detail } => detail.chars().take(200).collect(), | |
| 282 | + | } | |
| 283 | + | } | |
| 284 | + | } | |
| 285 | + | ||
| 286 | + | /// Pointer to the on-disk gate log: a path relative to `cfg.logs_root` | |
| 287 | + | /// of the form `<version>/<gate_kind>.log`. Stored in `gate_runs.log_ref` | |
| 288 | + | /// and surfaced in `/state` so the TUI/operator can request the full | |
| 289 | + | /// tail via `GET /logs/<version>/<gate>` only when needed. | |
| 290 | + | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 291 | + | #[serde(transparent)] | |
| 292 | + | pub struct LogRef(pub String); | |
| 293 | + | ||
| 294 | + | impl LogRef { | |
| 295 | + | pub fn new(version: &Version, gate: GateKind) -> Self { | |
| 296 | + | Self(format!("{}/{}.log", version, gate.as_str())) | |
| 297 | + | } | |
| 298 | + | pub fn as_str(&self) -> &str { &self.0 } | |
| 299 | + | } | |
| 300 | + | ||
| 301 | + | impl std::fmt::Display for LogRef { | |
| 302 | + | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) } | |
| 303 | + | } | |
| 304 | + | ||
#[cfg(test)]
mod tests {
    use super::*;

    /// The outer `status.kind` and the inner `failure.kind` are both
    /// present, with variant fields flattened next to the inner tag.
    #[test]
    fn outcome_serialization_is_two_layer_tagged() {
        let outcome = GateOutcome::failed(GateFailure::MigrationDrift {
            migration: "0047_widgets".into(),
        });
        let json = serde_json::to_value(&outcome).unwrap();

        let status = &json["status"];
        assert_eq!(status["kind"], "failed");

        let failure = &status["failure"];
        assert_eq!(failure["kind"], "migration_drift");
        assert_eq!(failure["migration"], "0047_widgets");
    }

    /// Serializing and then deserializing keeps the status intact.
    #[test]
    fn outcome_round_trips_through_json() {
        let original = GateOutcome::passed(PassNote::TestsPassed { duration_s: 42 });
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: GateOutcome = serde_json::from_str(&encoded).unwrap();

        assert!(decoded.legacy_passed());
        assert_eq!(decoded.status_str(), "passed");
    }

    /// A blocked gate must read as not-passed in the legacy column.
    #[test]
    fn blocked_legacy_passed_is_false() {
        let outcome = GateOutcome::blocked(GateBlocker::BurnInClockNotStarted);

        assert_eq!(outcome.status_str(), "blocked");
        assert!(!outcome.legacy_passed());
    }

    /// The pre-typed gate runner wrote specific prose for each
    /// pass/blocker case. `summary` is what fills the shadow `detail`
    /// column during the migration-003 shadow period, so old consumers
    /// (TUI, /state) keep working unchanged.
    #[test]
    fn legacy_detail_summaries_match_pre_typed_strings() {
        let cases = [
            (
                GateOutcome::blocked(GateBlocker::BurnInClockNotStarted),
                "burn-in clock not started",
            ),
            (
                GateOutcome::blocked(GateBlocker::ScratchDbUrlUnset),
                "scratch_db_url unset in daemon config",
            ),
            (
                GateOutcome::blocked(GateBlocker::NoBackupAvailable),
                "no backup fetched; call /backup/fetch first",
            ),
            (
                GateOutcome::blocked(GateBlocker::AwaitingOperatorConfirmation),
                "waiting on operator confirmation",
            ),
            (
                GateOutcome::blocked(GateBlocker::BurnInRemaining {
                    hours_remaining: 47,
                    hours_total: 168,
                }),
                "47 hours remaining of 168",
            ),
            (
                GateOutcome::passed(PassNote::StayedUp { duration_s: 3 }),
                "stayed up for 3s",
            ),
        ];

        for (outcome, expected) in cases {
            assert_eq!(outcome.legacy_detail(), expected);
        }
    }

    /// `LogRef::new` yields `<version>/<gate_kind>.log`.
    #[test]
    fn log_ref_construction_matches_disk_layout() {
        let version = "0.9.6".parse::<Version>().unwrap();
        let log_ref = LogRef::new(&version, GateKind::CargoTest);

        assert_eq!(log_ref.as_str(), "0.9.6/cargo_test.log");
    }

    /// An unclassified failure keeps its original detail text in the JSON.
    #[test]
    fn unclassified_preserves_legacy_detail() {
        let detail = "binary exited early: exit status: 101\n==== stdout ====\n...";
        let outcome = GateOutcome::failed(GateFailure::Unclassified {
            legacy_detail: Some(detail.into()),
        });
        let json = serde_json::to_value(&outcome).unwrap();

        let failure = &json["status"]["failure"];
        assert_eq!(failure["kind"], "unclassified");

        let stored = failure["legacy_detail"].as_str().unwrap();
        assert!(stored.contains("exit status: 101"));
    }
}
| @@ -43,9 +43,22 @@ struct TierView { | |||
| 43 | 43 | #[derive(Serialize)] | |
| 44 | 44 | struct GateView { | |
| 45 | 45 | kind: String, | |
| 46 | + | /// Shadow field — kept until step 5 (TUI typed-event handling) so | |
| 47 | + | /// the current TUI keeps rendering pass/fail without consulting | |
| 48 | + | /// `outcome`. Goes away in migration 004. | |
| 46 | 49 | passed: Option<bool>, | |
| 47 | 50 | finished_at: Option<String>, | |
| 51 | + | /// Shadow field — superseded by `outcome.status`. | |
| 48 | 52 | detail: Option<String>, | |
| 53 | + | /// `'passed' | 'failed' | 'blocked'` or NULL while in-flight. The | |
| 54 | + | /// TUI can rely on this to choose green/red/yellow rendering once | |
| 55 | + | /// step 5 lands; until then it falls back to `passed`. | |
| 56 | + | status: Option<String>, | |
| 57 | + | /// Full typed `GateOutcome` as a JSON object, when present. | |
| 58 | + | /// Deserialized lazily by the consumer; sandod doesn't re-parse it. | |
| 59 | + | outcome: Option<serde_json::Value>, | |
| 60 | + | /// Relative path under `cfg.logs_root` to the persisted stdout/stderr. | |
| 61 | + | log_ref: Option<String>, | |
| 49 | 62 | } | |
| 50 | 63 | ||
| 51 | 64 | async fn get_state(State(s): State<AppState>) -> Result<Json<StateView>> { | |
| @@ -89,7 +102,7 @@ async fn get_state(State(s): State<AppState>) -> Result<Json<StateView>> { | |||
| 89 | 102 | let gates: Vec<GateView> = if let Some(ver) = gate_version.as_ref() { | |
| 90 | 103 | // Most recent gate_runs row per gate_kind for (tier, ver). | |
| 91 | 104 | sqlx::query( | |
| 92 | - | "SELECT gate_kind, passed, finished_at, detail | |
| 105 | + | "SELECT gate_kind, passed, finished_at, detail, status, outcome_json, log_ref | |
| 93 | 106 | FROM gate_runs g | |
| 94 | 107 | WHERE tier = ?1 AND version = ?2 | |
| 95 | 108 | AND id = (SELECT MAX(id) FROM gate_runs | |
| @@ -106,6 +119,10 @@ async fn get_state(State(s): State<AppState>) -> Result<Json<StateView>> { | |||
| 106 | 119 | passed: gr.get::<Option<i64>, _>("passed").map(|v| v != 0), | |
| 107 | 120 | finished_at: gr.get("finished_at"), | |
| 108 | 121 | detail: gr.get("detail"), | |
| 122 | + | status: gr.get("status"), | |
| 123 | + | outcome: gr.get::<Option<String>, _>("outcome_json") | |
| 124 | + | .and_then(|s| serde_json::from_str(&s).ok()), | |
| 125 | + | log_ref: gr.get("log_ref"), | |
| 109 | 126 | }) | |
| 110 | 127 | .collect() | |
| 111 | 128 | } else { | |
| @@ -146,6 +163,7 @@ async fn promote( | |||
| 146 | 163 | body: Option<Json<PromoteBody>>, | |
| 147 | 164 | ) -> Result<Json<serde_json::Value>> { | |
| 148 | 165 | let body = body.map(|Json(b)| b).unwrap_or_default(); | |
| 166 | + | let tier = crate::domain::TierId::new(tier); | |
| 149 | 167 | let idx = s.topo.tiers.iter().position(|t| t.name == tier) | |
| 150 | 168 | .ok_or(crate::error::Error::NotFound)?; | |
| 151 | 169 | if idx == 0 { | |
| @@ -157,7 +175,7 @@ async fn promote( | |||
| 157 | 175 | let source = &s.topo.tiers[idx - 1]; | |
| 158 | 176 | ||
| 159 | 177 | // Resolve version: explicit if given, else the source tier's current. | |
| 160 | - | let version = match body.version { | |
| 178 | + | let version_str = match body.version { | |
| 161 | 179 | Some(v) => v, | |
| 162 | 180 | None => sqlx::query_scalar::<_, Option<String>>( | |
| 163 | 181 | "SELECT current_version FROM tier_state WHERE tier = ?", | |
| @@ -170,10 +188,12 @@ async fn promote( | |||
| 170 | 188 | format!("no version specified and tier {} has no current_version", source.name), | |
| 171 | 189 | ))?, | |
| 172 | 190 | }; | |
| 191 | + | let version = crate::domain::Version::parse(&version_str) | |
| 192 | + | .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?; | |
| 173 | 193 | ||
| 174 | 194 | // 1. Predecessor must have all of its gates green for this version (with | |
| 175 | 195 | // optional hotfix override that skips burn_in). | |
| 176 | - | let pending = unsatisfied_gates(&s.pool, &source.name, &version, body.hotfix).await?; | |
| 196 | + | let pending = unsatisfied_gates(&s.pool, source.name.as_str(), &version_str, body.hotfix).await?; | |
| 177 | 197 | if !pending.is_empty() { | |
| 178 | 198 | return Err(crate::error::Error::GateBlocked(format!( | |
| 179 | 199 | "{} gate(s) not satisfied on tier {}: {}", | |
| @@ -208,30 +228,39 @@ async fn promote( | |||
| 208 | 228 | crate::events::emit(&s.events, crate::events::Event::DeployStart { | |
| 209 | 229 | tier: target.name.clone(), node: node.name.clone(), version: version.clone(), | |
| 210 | 230 | }); | |
| 211 | - | let result = crate::deploy::deploy_node(node, &version, &staged_dir, s.cfg.primary_bin()).await; | |
| 231 | + | let result = crate::deploy::deploy_node(node, &version_str, &staged_dir, s.cfg.primary_bin()).await; | |
| 212 | 232 | let finished = chrono::Utc::now().to_rfc3339(); | |
| 213 | - | let (outcome, err_msg) = match &result { | |
| 214 | - | Ok(_) => ("ok", None), | |
| 215 | - | Err(e) => ("failed", Some(format!("{e:#}"))), | |
| 233 | + | let (outcome_obj, err_for_propagation) = match result { | |
| 234 | + | Ok(_) => (crate::outcome::DeployOutcome::ok(), None), | |
| 235 | + | Err(e) => { | |
| 236 | + | let msg = format!("{e:#}"); | |
| 237 | + | let kind = crate::classify::classify_deploy_error(&msg); | |
| 238 | + | (crate::outcome::DeployOutcome::failed(kind), Some(e)) | |
| 239 | + | } | |
| 216 | 240 | }; | |
| 241 | + | let outcome_json = serde_json::to_string(&outcome_obj) | |
| 242 | + | .unwrap_or_else(|e| format!("{{\"_serialize_error\":{e:?}}}")); | |
| 217 | 243 | sqlx::query( | |
| 218 | - | "INSERT INTO deploys (version, tier, node, started_at, finished_at, outcome, hotfix, reset_burn_in) | |
| 219 | - | VALUES (?, ?, ?, ?, ?, ?, ?, ?)", | |
| 244 | + | "INSERT INTO deploys (version, tier, node, started_at, finished_at, outcome, outcome_json, hotfix, reset_burn_in) | |
| 245 | + | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", | |
| 220 | 246 | ) | |
| 221 | 247 | .bind(&version).bind(&target.name).bind(&node.name) | |
| 222 | - | .bind(&started).bind(&finished).bind(outcome) | |
| 248 | + | .bind(&started).bind(&finished).bind(outcome_obj.status_str()) | |
| 249 | + | .bind(&outcome_json) | |
| 223 | 250 | .bind(body.hotfix as i64).bind(body.reset_burn_in as i64) | |
| 224 | 251 | .execute(&s.pool).await.map_err(crate::error::Error::Db)?; | |
| 225 | - | if let Err(e) = result { | |
| 226 | - | let msg = err_msg.unwrap_or_default(); | |
| 252 | + | if let Some(e) = err_for_propagation { | |
| 253 | + | let crate::outcome::DeployStatus::Failed { failure } = outcome_obj.status else { | |
| 254 | + | unreachable!("err_for_propagation is Some iff status is Failed"); | |
| 255 | + | }; | |
| 227 | 256 | tracing::error!( | |
| 228 | 257 | tier = %target.name, node = %node.name, version = %version, | |
| 229 | - | error = %msg, | |
| 258 | + | failure = failure.summary(), | |
| 230 | 259 | "deploy failed; current symlink left intact, tier_state not advanced" | |
| 231 | 260 | ); | |
| 232 | 261 | crate::events::emit(&s.events, crate::events::Event::DeployFailed { | |
| 233 | 262 | tier: target.name.clone(), node: node.name.clone(), | |
| 234 | - | version: version.clone(), error: msg, | |
| 263 | + | version: version.clone(), failure, | |
| 235 | 264 | }); | |
| 236 | 265 | return Err(crate::error::Error::Other(e)); | |
| 237 | 266 | } | |
| @@ -317,6 +346,7 @@ async fn rollback( | |||
| 317 | 346 | State(s): State<AppState>, | |
| 318 | 347 | Path(tier): Path<String>, | |
| 319 | 348 | ) -> Result<Json<serde_json::Value>> { | |
| 349 | + | let tier = crate::domain::TierId::new(tier); | |
| 320 | 350 | let target = s.topo.tiers.iter().find(|t| t.name == tier) | |
| 321 | 351 | .ok_or(crate::error::Error::NotFound)?; | |
| 322 | 352 | ||
| @@ -325,11 +355,15 @@ async fn rollback( | |||
| 325 | 355 | ) | |
| 326 | 356 | .bind(&tier) | |
| 327 | 357 | .fetch_optional(&s.pool).await.map_err(crate::error::Error::Db)?; | |
| 328 | - | let (Some(current), Some(previous)) = row.unwrap_or((None, None)) else { | |
| 358 | + | let (Some(current_str), Some(previous_str)) = row.unwrap_or((None, None)) else { | |
| 329 | 359 | return Err(crate::error::Error::GateBlocked( | |
| 330 | 360 | "no previous_version to roll back to".into(), | |
| 331 | 361 | )); | |
| 332 | 362 | }; | |
| 363 | + | let current = crate::domain::Version::parse(¤t_str) | |
| 364 | + | .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?; | |
| 365 | + | let previous = crate::domain::Version::parse(&previous_str) | |
| 366 | + | .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?; | |
| 333 | 367 | ||
| 334 | 368 | let bin: Option<(String,)> = sqlx::query_as( | |
| 335 | 369 | "SELECT artifact_path FROM versions WHERE version = ?", | |
| @@ -347,7 +381,7 @@ async fn rollback( | |||
| 347 | 381 | .to_path_buf(); | |
| 348 | 382 | ||
| 349 | 383 | for node in &target.nodes { | |
| 350 | - | crate::deploy::deploy_node(node, &previous, &staged_dir, s.cfg.primary_bin()) | |
| 384 | + | crate::deploy::deploy_node(node, &previous_str, &staged_dir, s.cfg.primary_bin()) | |
| 351 | 385 | .await | |
| 352 | 386 | .map_err(crate::error::Error::Other)?; | |
| 353 | 387 | } | |
| @@ -395,6 +429,11 @@ async fn rebuild( | |||
| 395 | 429 | .map_err(crate::error::Error::Other)?, | |
| 396 | 430 | }; | |
| 397 | 431 | ||
| 432 | + | // Boundary parse: a sha entering Sando must be hex of plausible length. | |
| 433 | + | // The build pipeline downstream only ever sees `GitSha`. | |
| 434 | + | let sha = crate::domain::GitSha::parse(&sha) | |
| 435 | + | .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?; | |
| 436 | + | ||
| 398 | 437 | tracing::info!(sha = %sha, "rebuild requested"); | |
| 399 | 438 | crate::events::emit(&s.events, crate::events::Event::RebuildRequested { sha: sha.clone() }); | |
| 400 | 439 | ||
| @@ -416,6 +455,7 @@ async fn rebuild( | |||
| 416 | 455 | let topo = s.topo.clone(); | |
| 417 | 456 | let events_for_task = s.events.clone(); | |
| 418 | 457 | let sha_for_task = sha.clone(); | |
| 458 | + | let sha_response = sha.to_string(); | |
| 419 | 459 | let handle = tokio::spawn(async move { | |
| 420 | 460 | if let Err(e) = crate::build::build_and_run_host(pool, cfg, topo, sha_for_task.clone(), events_for_task).await { | |
| 421 | 461 | tracing::error!(sha = %sha_for_task, error = %e, "rebuild pipeline failed"); | |
| @@ -423,7 +463,7 @@ async fn rebuild( | |||
| 423 | 463 | }); | |
| 424 | 464 | *slot = Some(handle.abort_handle()); | |
| 425 | 465 | ||
| 426 | - | Ok(Json(serde_json::json!({ "accepted": true, "sha": sha }))) | |
| 466 | + | Ok(Json(serde_json::json!({ "accepted": true, "sha": sha_response }))) | |
| 427 | 467 | } | |
| 428 | 468 | ||
| 429 | 469 | async fn confirm( | |
| @@ -433,17 +473,20 @@ async fn confirm( | |||
| 433 | 473 | // Operator-driven satisfaction of a `manual_confirm` gate. Looks up the | |
| 434 | 474 | // pending version (current MM version, or the tier's own if non-mm) and | |
| 435 | 475 | // inserts a passing gate_runs row so /promote can advance. | |
| 476 | + | let tier = crate::domain::TierId::new(tier); | |
| 436 | 477 | let target = s.topo.tiers.iter().find(|t| t.name == tier) | |
| 437 | 478 | .ok_or(crate::error::Error::NotFound)?; | |
| 438 | 479 | ||
| 439 | - | let version: Option<String> = sqlx::query_scalar( | |
| 480 | + | let version_str: Option<String> = sqlx::query_scalar( | |
| 440 | 481 | "SELECT current_version FROM tier_state WHERE tier = ?", | |
| 441 | 482 | ) | |
| 442 | 483 | .bind(&target.name) | |
| 443 | 484 | .fetch_optional(&s.pool).await.map_err(crate::error::Error::Db)?.flatten(); | |
| 444 | - | let version = version.ok_or_else(|| crate::error::Error::GateBlocked( | |
| 485 | + | let version_str = version_str.ok_or_else(|| crate::error::Error::GateBlocked( | |
| 445 | 486 | format!("tier {tier} has no current_version; nothing to confirm"), | |
| 446 | 487 | ))?; | |
| 488 | + | let version = crate::domain::Version::parse(&version_str) | |
| 489 | + | .map_err(|e| crate::error::Error::Other(anyhow::anyhow!(e)))?; | |
| 447 | 490 | ||
| 448 | 491 | let now = chrono::Utc::now().to_rfc3339(); | |
| 449 | 492 | sqlx::query( |
| @@ -1,3 +1,4 @@ | |||
| 1 | + | use crate::domain::{GateKind, NodeId, TierId}; | |
| 1 | 2 | use anyhow::{Context, Result}; | |
| 2 | 3 | use serde::{Deserialize, Serialize}; | |
| 3 | 4 | use std::path::Path; | |
| @@ -24,7 +25,7 @@ pub struct BackupConfig { | |||
| 24 | 25 | ||
| 25 | 26 | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 26 | 27 | pub struct Tier { | |
| 27 | - | pub name: String, | |
| 28 | + | pub name: TierId, | |
| 28 | 29 | #[serde(default)] | |
| 29 | 30 | pub provisioned: bool, | |
| 30 | 31 | pub gates: Vec<Gate>, | |
| @@ -36,7 +37,7 @@ pub struct Tier { | |||
| 36 | 37 | ||
| 37 | 38 | #[derive(Debug, Clone, Serialize, Deserialize)] | |
| 38 | 39 | pub struct Node { | |
| 39 | - | pub name: String, | |
| 40 | + | pub name: NodeId, | |
| 40 | 41 | pub ssh_target: String, | |
| 41 | 42 | pub release_root: String, | |
| 42 | 43 | /// systemd unit name to reload-or-restart after the symlink swap. | |
| @@ -74,6 +75,21 @@ pub enum Gate { | |||
| 74 | 75 | ManualConfirm, | |
| 75 | 76 | } | |
| 76 | 77 | ||
| 78 | + | impl Gate { | |
| 79 | + | /// The discriminant — the identifier we use in events, schema columns, | |
| 80 | + | /// and the TUI. Gate parameters (e.g. `BurnIn.hours`) stay with `Gate` | |
| 81 | + | /// and are not carried into `gate_runs` history. | |
| 82 | + | pub fn kind(&self) -> GateKind { | |
| 83 | + | match self { | |
| 84 | + | Gate::CargoTest => GateKind::CargoTest, | |
| 85 | + | Gate::MigrationDryRun => GateKind::MigrationDryRun, | |
| 86 | + | Gate::BootSmoke => GateKind::BootSmoke, | |
| 87 | + | Gate::BurnIn { .. } => GateKind::BurnIn, | |
| 88 | + | Gate::ManualConfirm => GateKind::ManualConfirm, | |
| 89 | + | } | |
| 90 | + | } | |
| 91 | + | } | |
| 92 | + | ||
| 77 | 93 | impl Topology { | |
| 78 | 94 | pub fn load(path: &Path) -> Result<Self> { | |
| 79 | 95 | let raw = std::fs::read_to_string(path) | |
| @@ -86,7 +102,7 @@ impl Topology { | |||
| 86 | 102 | fn validate(&self) -> Result<()> { | |
| 87 | 103 | anyhow::ensure!(!self.tiers.is_empty(), "topology must declare at least one tier"); | |
| 88 | 104 | for t in &self.tiers { | |
| 89 | - | if t.provisioned && t.nodes.is_empty() && t.name != "host" { | |
| 105 | + | if t.provisioned && t.nodes.is_empty() && t.name.as_str() != "host" { | |
| 90 | 106 | anyhow::bail!("tier {} is provisioned but has no nodes", t.name); | |
| 91 | 107 | } | |
| 92 | 108 | } |
| @@ -0,0 +1,213 @@ | |||
| 1 | + | # Sando observability architecture | |
| 2 | + | ||
| 3 | + | Status: draft, 2026-06-03. Argument shape, not a checklist. | |
| 4 | + | ||
| 5 | + | Goal: an error and visibility surface built on newtypes. Sando's current daemon and TUI agree by string convention — `tier: String`, `version: String`, `gate: String`, `detail: Option<String>` — and every consumer (schema, WS payloads, TUI render) reparses or pattern-matches those strings independently. The result is that gate failure classification only exists as prose in `gate_runs.detail`, and the TUI's only recourse on red is "open `/logs/<version>/<gate>` and read the tail." This document proposes the type graph that replaces the strings, the boundary parses that mint the types, and the persistence + wire shapes that follow. | |
| 6 | + | ||
| 7 | + | This is one cohesive design. The launchplan splits it into "Phase B (typed GateOutcome)" and "Phase C (live tail + remaining newtypes)"; that split is a delivery convenience, not an architectural seam. The migration order below preserves the option to ship Phase B first, but the types it introduces must already be the final shape, not a string stand-in to be newtyped later. | |
| 8 | + | ||
| 9 | + | ## What is currently stringly-typed | |
| 10 | + | ||
| 11 | + | Inventory from a read of `daemon/src/gates.rs`, `daemon/src/events.rs`, `daemon/src/topology.rs`, `daemon/migrations/001_init.sql`, and `tui/src/main.rs:186-266`: | |
| 12 | + | ||
| 13 | + | | Concept | Current shape | Where it appears | | |
| 14 | + | |---|---|---| | |
| 15 | + | | Tier | `tier: String` | `GateCtx`, every `Event` variant, `tiers.name`, `gate_runs.tier`, `tier_state.tier`, `deploys.tier`, TUI `str_v(v.get("tier"))` | | |
| 16 | + | | Node | `node: String` (sometimes `Option`) | `Tier.nodes[].name`, `deploys.node`, `Event::Deploy*` | | |
| 17 | + | | Version | `version: String` (semver `0.8.12`) | `GateCtx`, `versions.version`, every event, TUI display | | |
| 18 | + | | Git sha | `sha: String` (short or full?) | `versions.git_sha`, `Event::RebuildRequested/BuildStart/BuildOk/BuildFailed/BuildAborted`, `/logs/<sha>/<gate>` per launchplan §3 | | |
| 19 | + | | Gate kind | `Gate` enum *inside the daemon*; `gate_kind: &'static str` once it crosses the events/schema boundary; reparsed in TUI | `gates::kind_str`, `gate_runs.gate_kind`, `Event::Gate*.gate`, TUI `match kind` | | |
| 20 | + | | Gate outcome detail | `detail: Option<String>` | `GateOutcome.detail`, `gate_runs.detail`, conflates: config error, missing prerequisite, real failure with stderr tail, burn-in clock progress, "waiting on operator" | | |
| 21 | + | | Deploy outcome | `outcome TEXT DEFAULT 'in_progress'` | `deploys.outcome`; no enum on the Rust side | | |
| 22 | + | | Deploy failure | `error: String` | `Event::DeployFailed.error`; freeform | | |
| 23 | + | | Backup source | `source: String` | `backups.source`, `Event::BackupFetched.source`; an ssh URL today | | |
| 24 | + | | Canary policy | typed (`CanaryPolicy`) at config load, then `as_str()` to a `TEXT` column | `tiers.canary` | | |
| 25 | + | ||
| 26 | + | `gates::kind_str` and the TUI's `match kind` are the clearest tell — we have an enum on each side of the wire and a string in the middle. That string is the only schema, so adding a gate kind is a four-place edit with no compile-time enforcement. | |
| 27 | + | ||
| 28 | + | The same anti-pattern with higher stakes is `detail`. Four concrete examples from `gates.rs`: | |
| 29 | + | ||
| 30 | + | - `migration_dry_run` writes `"scratch_db_url unset in daemon config"` (a config bug, no point retrying). | |
| 31 | + | - `migration_dry_run` writes `"no backup fetched; call /backup/fetch first"` (a missing prerequisite, retry after `/backup/fetch`). | |
| 32 | + | - `boot_smoke` writes `"binary exited early: exit status: 101\n==== stdout ====\n…"` (a real failure carrying a tail). | |
| 33 | + | - `burn_in` writes `"47 hours remaining of 168"` (not a failure at all — the gate is correctly red but the deploy is not blocked by a defect). | |
| 34 | + | ||
| 35 | + | Today, an operator (or the TUI, or a future alerting rule) cannot tell those apart without substring-matching the prose. The point of the redesign is to make that classification a property of the value, not a property of how it's spelled. | |
| 36 | + | ||
| 37 | + | ## The type graph | |
| 38 | + | ||
| 39 | + | Domain types live in a new `daemon/src/domain.rs` (name negotiable). They are the vocabulary every other module speaks; they implement `Serialize`/`Deserialize`, `Display`, `FromStr`, and `sqlx::Type` so they round-trip through events, JSON responses, and SQLite columns without per-site conversion. | |
| 40 | + | ||
| 41 | + | ```text | |
| 42 | + | ┌──────────────┐ | |
| 43 | + | │ TierId │ newtype String, validated against topology on construction | |
| 44 | + | └──────────────┘ | |
| 45 | + | ┌──────────────┐ | |
| 46 | + | │ NodeId │ newtype String, validated against TierId's nodes | |
| 47 | + | └──────────────┘ | |
| 48 | + | ┌──────────────┐ | |
| 49 | + | │ Version │ parsed semver (semver::Version), Display = "0.8.12" | |
| 50 | + | └──────────────┘ | |
| 51 | + | ┌──────────────┐ | |
| 52 | + | │ GitSha │ enforced 40-hex or short-7-hex; one canonical form | |
| 53 | + | └──────────────┘ | |
| 54 | + | ┌──────────────┐ | |
| 55 | + | │ GateKind │ enum, replaces both `Gate` discriminant and `gate_kind` string | |
| 56 | + | │ │ Variants: CargoTest, MigrationDryRun, BootSmoke, | |
| 57 | + | │ │ BurnIn, ManualConfirm. Display = snake_case. | |
| 58 | + | └──────────────┘ | |
| 59 | + | ┌──────────────┐ | |
| 60 | + | │ GateRunId │ newtype i64, identifies one row of gate_runs | |
| 61 | + | └──────────────┘ | |
| 62 | + | ┌──────────────┐ | |
| 63 | + | │ DeployId │ newtype i64 | |
| 64 | + | └──────────────┘ | |
| 65 | + | ``` | |
| 66 | + | ||
| 67 | + | `Gate` (the topology struct with `BurnIn { hours }`) and `GateKind` (the discriminant) become distinct: `Gate` is the *config* (kind + parameters), `GateKind` is the *identifier* you use to talk about a class of gate in events and the schema. `gate_runs.gate_kind` stores `GateKind`; gate parameters at the time of the run, if we ever need them in history, become a separate column. | |
| 68 | + | ||
| 69 | + | ### `GateOutcome` redesign | |
| 70 | + | ||
| 71 | + | ```rust | |
| 72 | + | pub struct GateOutcome { | |
| 73 | + | pub status: GateStatus, | |
| 74 | + | pub log_ref: Option<LogRef>, // pointer to persisted stdout/stderr, not the tail itself | |
| 75 | + | } | |
| 76 | + | ||
| 77 | + | pub enum GateStatus { | |
| 78 | + | Passed { note: PassNote }, | |
| 79 | + | Failed(GateFailure), | |
| 80 | + | Blocked(GateBlocker), // gate cannot run yet; not a defect | |
| 81 | + | } | |
| 82 | + | ||
| 83 | + | pub enum PassNote { | |
| 84 | + | StayedUp { duration_s: u32 }, // boot_smoke | |
| 85 | + | BurnInElapsed { hours: u32 }, // burn_in | |
| 86 | + | Migrated { backup: BackupId, count: u32 }, // migration_dry_run | |
| 87 | + | TestsPassed { count: u32, duration_s: u32 }, // cargo_test | |
| 88 | + | OperatorConfirmed { at: DateTime<Utc> }, // manual_confirm | |
| 89 | + | } | |
| 90 | + | ||
| 91 | + | pub enum GateBlocker { | |
| 92 | + | BurnInClockNotStarted, | |
| 93 | + | BurnInRemaining { hours: u32, total: u32 }, | |
| 94 | + | AwaitingOperatorConfirmation, | |
| 95 | + | NoBackupAvailable, | |
| 96 | + | ScratchDbUrlUnset, | |
| 97 | + | ArtifactMissing { version: Version }, | |
| 98 | + | } | |
| 99 | + | ||
| 100 | + | pub enum GateFailure { | |
| 101 | + | CargoTest { failed_count: u32, first_failed: Option<String> }, | |
| 102 | + | MigrationDrift { migration: String }, // "previously applied but missing" | |
| 103 | + | MigrationModified { migration: String }, // "previously applied but modified" | |
| 104 | + | MigrationSqlError { migration: String, sqlstate: Option<String> }, | |
| 105 | + | RestoreFailed { kind: RestoreFailureKind }, | |
| 106 | + | BootPanic { exit_code: Option<i32> }, | |
| 107 | + | BootExitedEarly { exit_code: Option<i32> }, | |
| 108 | + | SpawnFailed { os_error: i32 }, | |
| 109 | + | Timeout { gate: GateKind, after_s: u32 }, | |
| 110 | + | Unclassified, // fallthrough; log_ref required | |
| 111 | + | } | |
| 112 | + | ``` | |
| 113 | + | ||
| 114 | + | Three things to notice about this shape: | |
| 115 | + | ||
| 116 | + | 1. **`Blocked` is its own variant.** Burn-in not yet elapsed and "scratch_db_url unset" are not failures — they are pre-conditions the operator can address out-of-band. The TUI can render them yellow instead of red. Today they're red, indistinguishably. | |
| 117 | + | 2. **`log_ref`, not `detail`.** The structured variants carry just enough to render a one-line summary (`migration 0047 modified`, `tests failed: 3`). The full tail lives on disk and the variant carries a pointer to it. This is the architectural seam between Phase B (classification) and Phase C (live tail) — once `log_ref` exists, Phase C is "ship chunks of that log over WS as they're written" rather than a redesign. | |
| 118 | + | 3. **`Unclassified` is admitted.** Classifiers are best-effort and the migration plan must work when a new failure mode shows up that no classifier matches yet. Unclassified failures still have a `log_ref`; they degrade gracefully to the current "read the tail" experience without breaking the contract. | |
| 119 | + | ||
| 120 | + | ### Classifier layer | |
| 121 | + | ||
| 122 | + | Each gate's runner produces raw output (`stdout`, `stderr`, exit status, plus any structured signals like a sqlx error). A classifier maps `(GateKind, RawOutput) -> GateStatus`. Classifiers are pure functions and live in `daemon/src/classify/` — one file per gate. The taxonomy in `plans/migration-dryrun-failures.md` is already 80% of `migration_dry_run`'s classifier; it gets ported into code, not just docs. | |
| 123 | + | ||
| 124 | + | Classifiers can be unit-tested with captured fixtures (`tests/fixtures/cargo_test_failed_compile.txt` → `GateStatus::Failed(GateFailure::CargoTest { failed_count: 0, .. })`). This is the first place Sando gets meaningful test coverage for diagnostic behavior, which `todo.md` flags as a gap. | |
| 125 | + | ||
| 126 | + | ### Where each type enters Sando | |
| 127 | + | ||
| 128 | + | Boundary discipline: every newtype is constructed once, at the edge where its string form enters the process. Internally, only the typed form moves. | |
| 129 | + | ||
| 130 | + | - `TierId`, `NodeId` — at `Topology::load`. The validator already walks tiers; it now mints `TierId`s and rejects unknown references at parse time instead of letting them surface as foreign-key failures. | |
| 131 | + | - `Version` — at the build step (`build.rs`), parsed from the server `Cargo.toml`. Stored as text in `versions.version`; the column round-trips through `sqlx::Type for Version`. | |
| 132 | + | - `GitSha` — at the post-receive hook entry point in `routes.rs::rebuild`. The hook today passes a string; the route normalizes to `GitSha` immediately. The `/logs/<sha>/<gate>` route (launchplan §3 wording) becomes `/logs/<GitSha>/<GateKind>`, but on-disk storage continues to key by `Version` until the build step has both available together. The comment at `gates.rs:432-434` already anticipates this transition. | |
| 133 | + | - `GateKind` — at `Topology` deserialization. `Gate` keeps its parameter-carrying variants; `GateKind` is derived from `Gate` and used everywhere else. | |
| 134 | + | - `GateOutcome` — produced by gate runners, persisted, emitted in events, displayed in TUI. Never lowered to a `String` along the way. | |
| 135 | + | ||
| 136 | + | ## Persistence | |
| 137 | + | ||
| 138 | + | `gate_runs` becomes: | |
| 139 | + | ||
| 140 | + | ```sql | |
| 141 | + | CREATE TABLE gate_runs ( | |
| 142 | + | id INTEGER PRIMARY KEY AUTOINCREMENT, | |
| 143 | + | version TEXT NOT NULL REFERENCES versions(version), | |
| 144 | + | tier TEXT NOT NULL REFERENCES tiers(name), | |
| 145 | + | gate_kind TEXT NOT NULL, -- GateKind, Display form | |
| 146 | + | started_at TEXT NOT NULL, | |
| 147 | + | finished_at TEXT, | |
| 148 | + | status TEXT, -- 'passed' | 'failed' | 'blocked' | NULL while in-flight | |
| 149 | + | outcome_json TEXT, -- serialized GateOutcome (PassNote / GateFailure / GateBlocker) | |
| 150 | + | log_ref TEXT -- relative path under cfg.logs_root, or NULL | |
| 151 | + | ); | |
| 152 | + | ``` | |
| 153 | + | ||
| 154 | + | `status` is denormalized for cheap indexing and `WHERE status = 'failed'` queries. `outcome_json` is the source of truth and is what the daemon reads back when serving `/state`. The migration drops `passed INTEGER` and `detail TEXT`, with a backfill that maps: | |
| 155 | + | ||
| 156 | + | - `passed = 1` → `status = 'passed'`, `outcome_json = {"kind":"passed","note":{"kind":"legacy","text":<old detail>}}` | |
| 157 | + | - `passed = 0`, detail matches a known prefix (`"burn-in"`, `"scratch_db_url unset"`, `"no backup fetched"`, `"waiting on operator"`) → `status = 'blocked'`, appropriate `GateBlocker` variant | |
| 158 | + | - `passed = 0` otherwise → `status = 'failed'`, `outcome_json = {"kind":"failed","failure":{"kind":"unclassified","legacy_detail":<old>}}` | |
| 159 | + | ||
| 160 | + | Backfill correctness is testable against the existing prod sandod sqlite — there are not many rows. | |
| 161 | + | ||
| 162 | + | `deploys.outcome` gets the same treatment in a smaller, separate migration: enum-typed (`InProgress | Ok | Failed { kind: DeployFailureKind }`), with the failure kind classifying spawn/transport/health-check distinctly from the freeform `error: String` carried by `Event::DeployFailed` today. | |
| 163 | + | ||
| 164 | + | ## Wire shape (events + `/state`) | |
| 165 | + | ||
| 166 | + | `Event::GateDone` becomes: | |
| 167 | + | ||
| 168 | + | ```rust | |
| 169 | + | GateDone { | |
| 170 | + | tier: TierId, | |
| 171 | + | version: Version, | |
| 172 | + | gate: GateKind, | |
| 173 | + | outcome: GateOutcome, | |
| 174 | + | } | |
| 175 | + | ``` | |
| 176 | + | ||
| 177 | + | The old `passed: bool` is gone — `outcome.status` carries strictly more information. The TUI's `format_event` becomes a match on the typed envelope (deserialized via `serde_json::from_str::<EventEnvelope>`, not `Value` reflection), and the per-kind render functions in `tui/src/main.rs:186-266` collapse into `Display` impls on the domain types. The `str_v` / `num_v` helpers stop being needed. | |
| 178 | + | ||
| 179 | + | `/state` (the TUI's polling endpoint) gains a `gates: Vec<GateRunSummary>` per tier, where `GateRunSummary` is the typed outcome plus timestamps and a `log_ref`. The TUI no longer needs `/logs/...` to populate its primary view; `/logs` becomes the drill-down for unclassified failures only. | |
| 180 | + | ||
| 181 | + | Phase C's live tail rides on the same `log_ref`: while a gate is in flight, `log_ref` is present and the WS emits `GateLogChunk { run_id: GateRunId, bytes: Vec<u8>, seq: u32 }` events as the runner flushes to disk. The TUI keeps a per-run ring buffer keyed by `GateRunId`. Because `GateRunId` is a newtype with a clear identity, the wire protocol does not need to invent a separate stream identifier. | |
| 182 | + | ||
| 183 | + | ## Migration order | |
| 184 | + | ||
| 185 | + | The order is constrained by what compiles together, not by the launchplan's B-then-C framing. | |
| 186 | + | ||
| 187 | + | 1. **Domain types module** — introduce `TierId`, `NodeId`, `Version`, `GitSha`, `GateKind`, `GateRunId`, `DeployId` with `Serialize` / `sqlx::Type` impls. No call sites change yet. Pure addition. | |
| 188 | + | 2. **Topology + config use the types** — `Topology::load` mints `TierId` / `NodeId`; `GateCtx` carries `TierId`, `Version`, not strings. `gate_runs.tier` and `gate_runs.version` columns stay `TEXT` (no schema migration); the change is on the Rust side only. `kind_str` is deleted. | |
| 189 | + | 3. **Events use the types** — every `Event` variant takes the newtypes. WS frames are unchanged on the wire (snake-case strings) because `Serialize` impls produce identical JSON. The TUI keeps its `Value` reflection during this step; nothing forces it to break. | |
| 190 | + | 4. **`GateOutcome` redesign + classifier layer + `gate_runs` migration** — the biggest single step. Runners produce typed outcomes; classifiers live in `daemon/src/classify/`; schema migration 003 lands with backfill. `/state` exposes typed outcomes. TUI keeps `Value` reflection but is now reading typed fields (`status`, `outcome_json`). | |
| 191 | + | 5. **TUI typed event handling** — deserialize `EventEnvelope` directly, drop `str_v` / `num_v`, render via `Display` on domain types. Yellow for `Blocked`, red for `Failed`, green for `Passed`. | |
| 192 | + | 6. **Live tail (`GateLogChunk` events, `log_ref` plumbing)** — gates write to a per-run log file as they run, not at completion; runners drop the `Vec<u8>` buffers in `boot_smoke` / `cargo_test` in favor of streaming through a `tokio::fs::File` writer that also broadcasts chunks. TUI consumes. | |
| 193 | + | 7. **Deploy outcome typing + `DeployFailed` failure-kind classifier** — same pattern, smaller scope. Closes out the rest of the string surface. | |
| 194 | + | ||
| 195 | + | Steps 1–3 can ship in one commit per step without changing observable behavior. Step 4 is where the operator visibility actually changes; it's the smallest unit that justifies a version bump and a sando deploy to itself. | |
| 196 | + | ||
| 197 | + | ## Non-goals and open questions | |
| 198 | + | ||
| 199 | + | - **Not aiming for a generic gate framework.** The `GateFailure` variants are MNW-specific (cargo, sqlx migrations, `makenotwork` binary boot). If we ever gate non-MNW projects, this enum will need to factor; until then, concrete variants are clearer than `Box<dyn>` traits. | |
| 200 | + | - **Not introducing `Result<GateOutcome, GateError>`.** Runner-internal failures (sqlite I/O, can't spawn) collapse into `GateStatus::Failed(GateFailure::SpawnFailed | …)` or `GateBlocker` as appropriate. The outer `Result` only exists for genuine "the daemon itself is broken" cases that prevent persistence at all. | |
| 201 | + | - **Open: backfill aggressiveness.** Should we run classifiers retroactively against the historical `detail` strings during the schema-3 backfill, or only forward? Forward-only is simpler and the history isn't operator-load-bearing — leaning toward forward-only with the `unclassified + legacy_detail` envelope above, but worth deciding. | |
| 202 | + | - **Open: where does `GateRunId` originate?** Today the `INSERT … RETURNING id` returns a raw i64 inside `gates::run`. Cleanest is for `gates::run` to mint a `GateRunId` and pass it back; the event bus then carries it from `GateStart` through `GateLogChunk` through `GateDone`, making client-side correlation trivial. Worth confirming this doesn't break the `manual_confirm` lookup pattern in `gates.rs:386-403`. | |
| 203 | + | - **Open: `serde(tag = "kind")` vs explicit variant shape on `GateOutcome`.** The events module already uses `#[serde(tag = "kind", rename_all = "snake_case")]`, which would give us `{"kind":"failed","failure":{"kind":"migration_drift","migration":"0047"}}`. Two layers of tagged unions is verbose but consistent. Alternative: flatten via `#[serde(flatten)]` and accept that the JSON shape diverges slightly from the Rust shape. Leaning toward the explicit two-layer form because the TUI's parser then mirrors the Rust enum 1:1. | |
| 204 | + | - **Open: do we keep `detail TEXT` for one release as a shadow column, or drop in the same migration?** Shadow is safer for rollback. Adds clutter. Probably worth one release of shadow. | |
| 205 | + | ||
| 206 | + | ## Acceptance, when this is done | |
| 207 | + | ||
| 208 | + | - `gates.rs` contains no `String` literals describing failure modes. Every failure path constructs a `GateFailure` variant. | |
| 209 | + | - The TUI's `format_event` is gone; rendering goes through `Display` on domain types. | |
| 210 | + | - An operator hitting `c /api/state` sees structured outcomes (not just `"passed": false, "detail": "..."`). | |
| 211 | + | - A new gate kind is added by extending `GateKind`, `Gate`, and a classifier; the compiler enforces every other site. | |
| 212 | + | - `/logs/<version>/<gate>` is a fallback for `Unclassified` outcomes, not the primary diagnostic path. | |
| 213 | + | - Live tail works for `boot_smoke` and `cargo_test`; the buffered `Vec<u8>` pattern in `gates.rs:317-332` is gone. |
| @@ -0,0 +1,163 @@ | |||
| 1 | + | # Session 3 — first sando-driven prod deploy | |
| 2 | + | ||
| 3 | + | Captured 2026-06-03 after the cutover. Resolves §6.5 step 8 of `launchplan_final.md`: first full sando deploy to Hetzner prod, replacing `deploy.sh` as the live deploy path. | |
| 4 | + | ||
| 5 | + | Status: **complete 2026-06-03.** Prod runs `makenotwork` 0.9.5 (sha `f0970b8`) from `/opt/mnw/current/`, deployed via `POST /promote/b {"hotfix":true}` from sandod on pop-os. Outage window 3m25s (02:50:33 → 02:53:58 UTC). All features green. See §F for outcomes and §G for the four hardcoded paths that block the eventual `rm -rf /opt/makenotwork/`. | |
| 6 | + | ||
| 7 | + | ## Background — Session 1 set the layout, Session 2 proved it on testnot, Session 3 cut prod over | |
| 8 | + | ||
| 9 | + | Session 1 redesigned the on-disk layout (`/opt/mnw/releases/<v>/` + `current` symlink; `/etc/mnw/makenotwork.env`; `/var/lib/mnw/` for state) and shipped the sando-side code that produces the full versioned bundle (binaries + static + docs + error-pages + assumptions). Session 2 reprovisioned testnot under that layout; the first remote deploy of the full bundle landed cleanly after three small gotchas (sqlx URL form, pg_ident map, `ASSUMPTIONS_PATH` mismatch — all logged in `launchplan_final.md` §6.9). | |
| 10 | + | ||
| 11 | + | Session 3 is the real-stakes one: prod was on 0.9.1 via `deploy.sh`, `/opt/makenotwork/` had eight months of accreted state (885M of backups, .env, yara-rules, ssh dir, rustdoc, sudoers entries, cron jobs, Caddyfile references). The Session 1 plan enumerated some of the move sequence but understated the surface area; the actual cutover surfaced several things worth documenting so the next major reprovision (or a disaster-recovery rebuild) doesn't re-discover them. | |
| 12 | + | ||
| 13 | + | ## A. Inventory taken before any prod write | |
| 14 | + | ||
| 15 | + | `/opt/makenotwork/` contents (`makenotwork:makenotwork` unless noted): | |
| 16 | + | ||
| 17 | + | - `makenotwork`, `mnw-admin` — 0.9.1 binaries (`root:root`) | |
| 18 | + | - `.env` (110 lines), 5× `.env.bak.*` files (`root:root`) | |
| 19 | + | - `docs/`, `static/`, `error-pages/` — content (will be replaced by release bundle) | |
| 20 | + | - `backups/` — 885M | |
| 21 | + | - `yara-rules/` — 8.5M compiled, `root:root` | |
| 22 | + | - `yara-rules-src/` — upstream YARA sources (compiled to `yara-rules/`), `root:root` | |
| 23 | + | - `rustdoc/` — generated docs, `501:staff` (uploaded from Mac via `deploy.sh`) | |
| 24 | + | - `ssh/` — `known_hosts` for build runner, `root:root` | |
| 25 | + | - `backup-db.sh` — cron'd daily at 03:00 UTC from `makenotwork`'s crontab | |
| 26 | + | - `deploy/` — `deploy.sh` staging area, `root:root` | |
| 27 | + | ||
| 28 | + | Other prod state in play: | |
| 29 | + | ||
| 30 | + | - `/opt/git/` — 99M, `git:git`. Both git user's home (`/etc/passwd` says `git:x:995:986::/opt/git:/bin/sh`) *and* the GIT_REPOS_PATH target. Conflating these turns out to matter (§F). | |
| 31 | + | - `/etc/caddy/Caddyfile` — three `root * /opt/makenotwork/error-pages` lines. | |
| 32 | + | - `/etc/sudoers.d/mnw-git-ssh` — `makenotwork ALL=(git) NOPASSWD: /opt/makenotwork/mnw-admin rebuild-keys`. | |
| 33 | + | - `/etc/sudoers.d/mnw-cli-git` — `mnw-cli ALL=(git) NOPASSWD: /usr/bin/git-*, /usr/bin/tee, /usr/bin/chmod`. No /opt path references; left alone. | |
| 34 | + | - `makenotwork` user crontab: `0 3 * * * /opt/makenotwork/backup-db.sh >> /opt/makenotwork/backups/backup.log 2>&1`. | |
| 35 | + | - Root crontab: `0 3 * * * /opt/backups/pg_backup.sh >> /var/log/pg_backup.log 2>&1` — unrelated, left alone. | |
| 36 | + | ||
| 37 | + | ## B. Pre-flight (no prod impact) | |
| 38 | + | ||
| 39 | + | 1. **`sando.toml` tier B fixed.** Was `deploy@prod-1.makenot.work` (NXDOMAIN, no port). Now `makenotwork@alpha-west-1` with port handling via `~sando/.ssh/config` Host block. Chose to keep service user as `makenotwork` rather than introduce a `deploy` user — avoids chowning 885M of backups and redoing pg peer auth that's been stable for months. The same reasoning applies to a hypothetical tier C: keep the existing user, don't introduce a new one for cosmetic uniformity with testnot. | |
| 40 | + | 2. **Sando pubkey installed** in `/home/makenotwork/.ssh/authorized_keys` (mode 0600, owned makenotwork). | |
| 41 | + | 3. **`chsh -s /bin/bash makenotwork`** — was `/usr/sbin/nologin`. SSH was rejecting connections, not key auth failing. Worth detecting/fixing in `bootstrap-node.sh` for future provisions where someone has hardened the runtime user. | |
| 42 | + | 4. **`/srv/sando/.ssh/config`** Host block for port 2200; `known_hosts` seeded via `ssh-keyscan -p 2200`. | |
| 43 | + | 5. **Dry-run rsync** from sando → prod's `/opt/mnw/releases/_probe/` succeeded (after `bootstrap-node.sh` created `/opt/mnw/`). | |
| 44 | + | ||
| 45 | + | ## C. Cutover sequence (3m25s outage) | |
| 46 | + | ||
| 47 | + | In order, with the exact reason each step exists: | |
| 48 | + | ||
| 49 | + | 1. **`systemctl stop makenotwork`** — 02:50:33 UTC. Outage window starts. | |
| 50 | + | 2. **Backups taken**: `/etc/systemd/system/makenotwork.service → /root/makenotwork.service.bak-pre-cutover`; `/opt/makenotwork/.env → /root/dotenv.bak-pre-cutover`; Caddyfile, sudoers, crontab also backed up to `/root/*.bak-pre-cutover`. Rollback path for any step failing before service restart. | |
| 51 | + | 3. **`bootstrap-node.sh`** with `SERVICE_USER=makenotwork SANDO_PUBKEY=… INSTALL_POSTGRES=0 INSTALL_CADDY=0 INSTALL_TAILSCALE=0 ENABLE_FIREWALL=0` — postgres/caddy/tailscale/UFW already configured on prod, don't touch. Created `/opt/mnw/`, `/etc/mnw/`, `/var/lib/mnw/`, the new systemd unit, the unused `deploy` user (harmless), the sudoers entry for `deploy`. The new unit references `EnvironmentFile=/etc/mnw/makenotwork.env` and `ReadWritePaths=/var/lib/mnw`, with `RestartPreventExitStatus=2` (MNW server convention: exit 2 = migration failure, don't crashloop). | |
| 52 | + | 4. **`cp /opt/makenotwork/.env /etc/mnw/makenotwork.env`** (copy, not move — original stays for one-week rollback). Set to mode `0640`, owner `root:makenotwork`. Then `sed` rewrites of `DOCS_PATH`, `ASSUMPTIONS_PATH`, `YARA_RULES_DIR`, `GIT_REPOS_PATH` for the new layout. `HOST`, `PORT`, `DATABASE_URL`, `HOST_URL` unchanged. | |
| 53 | + | 5. **`ln -s /opt/makenotwork/yara-rules /opt/mnw/yara-rules`** — yara-rules is operator-managed (independent update cadence), not in the release bundle (Session 1 layout principle: category #3). The symlink lets the new env's `YARA_RULES_DIR=/opt/mnw/yara-rules` continue to resolve. When `/opt/makenotwork/` is eventually removed, the rules dir moves to a permanent path (probably `/var/lib/mnw/yara-rules` or `/etc/mnw/yara-rules`) and the symlink retargets. | |
| 54 | + | 6. **`rsync -aHX /opt/git/ /var/lib/mnw/git/`** — preserves `git:git` ownership and hard links between files (`-H`). `chmod 0755 /var/lib/mnw` so the git user can traverse (default was 0750 makenotwork:makenotwork, which blocked the git user's `git-receive-pack` from reaching the repos). | |
| 55 | + | 7. **Caddyfile rewrite**: `sed -i 's|/opt/makenotwork/error-pages|/opt/mnw/current/error-pages|g'`. `caddy validate` before reload; `systemctl reload caddy`. | |
| 56 | + | 8. **Sudoers rewrite**: same sed pattern on `/etc/sudoers.d/mnw-git-ssh`; `visudo -c -f` to validate. | |
| 57 | + | 9. **`systemctl daemon-reload`** to pick up the new unit. | |
| 58 | + | 10. **`systemctl restart sandod`** on pop-os — sandod caches `sando.toml` at startup; the new tier B target wouldn't have taken effect without this. **First `POST /promote/b` failed with NXDOMAIN against the stale `prod-1.makenot.work` because sandod hadn't been restarted yet.** Fixed by restarting sandod and re-promoting. | |
| 59 | + | 11. **`POST /promote/b {"hotfix":true}`** — `hotfix: true` bypasses the 48h burn-in on tier A (which had just promoted to 0.9.5 ~15 min prior; burn-in not yet elapsed). Sando rsync'd the 161MB bundle to `/opt/mnw/releases/0.9.5/`, swapped the `current` symlink, called `systemctl reload-or-restart makenotwork.service`. | |
| 60 | + | 12. **Service up 02:53:55 UTC.** Outage window ends 02:53:58 once health serves 200. 733 YARA rules compiled, all integrations (S3, Stripe, MT, WAM, git, scanner, custom domain cache) live. | |
| 61 | + | 13. **External smoke checks**: `/`, `/login`, `/pricing`, `/docs`, `/docs/economics`, `/docs/roadmap`, `/docs/tiers` — all 200. | |
| 62 | + | 14. **`rebuild-keys` to regenerate `/opt/git/.ssh/authorized_keys`** — mnw-admin cannot load its env when run standalone as git: its `dotenvy` call reads `/opt/makenotwork/.env` (mode 0600 `makenotwork:makenotwork`), which git cannot read. Worked around by sourcing the env in a root shell, then `sudo -u git -E`. **Regenerated keys still contain `command="/opt/makenotwork/mnw-admin git-auth ..."`** — see §G. | |
| 63 | + | 15. **Git push test** — `git ls-remote git@ssh.makenot.work:max/meta.git` returns refs cleanly. Cutover verified end-to-end. | |
| 64 | + | ||
| 65 | + | ## D. What stayed in place (intentional) | |
| 66 | + | ||
| 67 | + | - `/opt/makenotwork/` — full contents, untouched. Soak rollback path: stop new unit, swap systemd unit back, start old binary. Plan: `rm -rf` after a week, post-0.9.6 deploy (see §G). | |
| 68 | + | - `/opt/git/` — untouched. It is the git user's home per `/etc/passwd`; mnw-admin's regenerated `authorized_keys` writes to `/opt/git/.ssh/authorized_keys` (not `/home/git/`, despite earlier confusion). The rsync to `/var/lib/mnw/git/` populated the new GIT_REPOS_PATH; the server reads from there, but git push lands in `/opt/git/` because that's the git user's home. Both paths now hold the repo bytes; that's wasteful but harmless during the soak, though the two copies drift as soon as someone pushes (see §G.3). | |
| 69 | + | - `/opt/makenotwork/backups/` — 885M of pg dumps. Script and cron still write there. Sando's backup-fetch on pop-os still pulls from there (configured pre-cutover). Migration to `/var/lib/mnw/backups/` is its own follow-up (touches script, crontab, pop-os sando config). | |
| 70 | + | - `yara-rules-src/`, `rustdoc/`, `ssh/`, `.env.bak.*` — not in any env var or systemd path. Confirmed by grepping the running 0.9.5 binary's path references. Will be swept in the post-soak cleanup. | |
| 71 | + | ||
| 72 | + | ## E. What broke and how it was caught | |
| 73 | + | ||
| 74 | + | Three small things, all caught by smoke checks: | |
| 75 | + | ||
| 76 | + | 1. **`sandod` cached `sando.toml`.** First promote attempt returned `creating remote release dir` (an in-flight progress string that became the error message). `journalctl -u sandod` showed it was still resolving `prod-1.makenot.work`. `scp sando.toml pop-os:/tmp/`, `sudo cp /tmp/sando.toml /etc/sando/sando.toml`, `sudo systemctl restart sandod`, re-promote. Worth documenting that `sandod` does not watch the file; alternative is to add an inotify or SIGHUP handler. | |
| 77 | + | 2. **First doc smoke checks were wrong URLs.** `/about/economics`, `/docs/about/economics` returned 404; panicked briefly that the cutover broke doc routing. False alarm: the route is `/docs/{slug}` where slug is the filename stem (e.g., `/docs/economics`). Verified with `grep doc_page MNW/server/src/` after the panic. **Worth fixing in any future smoke script** — use the real URL scheme, not guessed-from-filesystem paths. | |
| 78 | + | 3. **`mnw-admin rebuild-keys` needed env loading from root context.** `sudo -u git /opt/mnw/current/mnw-admin rebuild-keys` fails with `DATABASE_URL must be set: NotPresent` because the binary's `dotenvy::from_path("/opt/makenotwork/.env")` runs as git, which can't read `.env` (mode 0600 makenotwork). Workaround: `set -a; source /etc/mnw/makenotwork.env; set +a; sudo -u git -E /opt/mnw/current/mnw-admin rebuild-keys`. Cleanest long-term fix is in §G. | |
| 79 | + | ||
| 80 | + | ## F. Outcomes (verified) | |
| 81 | + | ||
| 82 | + | **Sando state after cutover:** | |
| 83 | + | ||
| 84 | + | ``` | |
| 85 | + | host cur=0.9.5 prev=0.9.5 burn_in_started=2026-06-03T02:23:28Z | |
| 86 | + | a cur=0.9.5 prev=0.8.12 burn_in_started=2026-06-03T02:38:57Z | |
| 87 | + | b cur=0.9.5 prev=None burn_in_started=2026-06-03T02:53:56Z | |
| 88 | + | c not provisioned | |
| 89 | + | ``` | |
| 90 | + | ||
| 91 | + | **Prod externally:** | |
| 92 | + | - `https://makenot.work/api/health` → `{"status":"operational","version":"0.9.5","checks":{"database":true}}`. | |
| 93 | + | - `/`, `/login`, `/pricing`, `/docs`, `/docs/economics`, `/docs/roadmap`, `/docs/tiers` → 200. | |
| 94 | + | - Git: `git ls-remote git@ssh.makenot.work:max/meta.git` → returns refs. | |
| 95 | + | ||
| 96 | + | **Prod internally:** | |
| 97 | + | - `systemctl status makenotwork` → active, PID 3123111, listening 0.0.0.0:3000. | |
| 98 | + | - 733 YARA rules compiled from `/opt/mnw/yara-rules` (symlink). | |
| 99 | + | - All integrations enabled per startup log: `s3=true, synckit_s3=false, stripe=true, scanner=true, mt=true, wam=true, git=true`. | |
| 100 | + | ||
| 101 | + | **deploy.sh path retained.** Not retired; remains as break-glass per `feedback_prefer_sando_over_deploy_sh` (sando is the preferred *default*; deploy.sh stays runnable for outages where the sando host is down). | |
| 102 | + | ||
| 103 | + | ## G. Open follow-ups | |
| 104 | + | ||
| 105 | + | ### G.1 The hardcoded `/opt/makenotwork/` paths (blocks the cleanup milestone) | |
| 106 | + | ||
| 107 | + | Session 1 outcomes claimed "`command=` prefixes auto-update on the first post-migration `rebuild-keys` run." That's wrong — confirmed during step 14. The path is a `const` in the binary, not pulled from env. Four sites need lifting before `/opt/makenotwork/` can be removed: | |
| 108 | + | ||
| 109 | + | | File | Line | Current value | Target | | |
| 110 | + | |---|---|---|---| | |
| 111 | + | | `server/src/git_ssh.rs` | 15 | `const MNW_ADMIN_PATH: &str = "/opt/makenotwork/mnw-admin"` | `/opt/mnw/current/mnw-admin` | | |
| 112 | + | | `server/src/bin/mnw-admin.rs` | 122 | `dotenvy::from_path("/opt/makenotwork/.env")` | `/etc/mnw/makenotwork.env` | | |
| 113 | + | | `server/src/build_runner.rs` | 467 | `const BUILD_SSH_KNOWN_HOSTS: &str = "/opt/makenotwork/ssh/known_hosts"` | `/etc/mnw/known_hosts` (or delete if dead — verify usage first) | | |
| 114 | + | | `server/src/routes/api/ssh_keys.rs` | 165 | `args(["-u", "git", "/opt/makenotwork/mnw-admin", "rebuild-keys"])` | `/opt/mnw/current/mnw-admin` | | |
| 115 | + | ||
| 116 | + | Ship as 0.9.6. Cleanup sequence after: deploy 0.9.6 via sando → `rebuild-keys` once (regenerates `authorized_keys` with new path in command=) → soak one week → `rm -rf /opt/makenotwork/`. | |
| 117 | + | ||
| 118 | + | ### G.2 The backups dir migration | |
| 119 | + | ||
| 120 | + | Independent of G.1. Touches: | |
| 121 | + | - `server/deploy/backup-db.sh` — hardcoded `BACKUP_DIR="/opt/makenotwork/backups"` near top. | |
| 122 | + | - `makenotwork` user crontab on prod. | |
| 123 | + | - Sando's `backup.source` URL on pop-os (currently pulls from `/opt/makenotwork/backups/latest.sql.gz` via rrsync). | |
| 124 | + | ||
| 125 | + | Easiest order: copy the existing 885M dir to `/var/lib/mnw/backups/`, edit script + crontab + sando config in one window, retire `/opt/makenotwork/backups/` after one successful daily backup lands in the new location and sando confirms it pulled cleanly. | |
| 126 | + | ||
| 127 | + | ### G.3 The `/opt/git` vs `/var/lib/mnw/git` duality | |
| 128 | + | ||
| 129 | + | Both directories currently hold the same repos. Git pushes land in `/opt/git/` (git user's home from `/etc/passwd`). Server reads from `/var/lib/mnw/git/` (GIT_REPOS_PATH). They drift the moment someone pushes. | |
| 130 | + | ||
| 131 | + | Two ways out: | |
| 132 | + | - (a) `usermod -d /var/lib/mnw/git git` to make git's home match GIT_REPOS_PATH. Single source of truth. Risk: any cron / script that reads git's home (none I found, but worth grepping) breaks. | |
| 133 | + | - (b) Revert GIT_REPOS_PATH to `/opt/git/`. Avoids the move but locks the path forever and reverts a piece of Session 1's FHS migration. | |
| 134 | + | ||
| 135 | + | (a) is the right answer. Do it during the post-0.9.6 soak window. | |
| 136 | + | ||
| 137 | + | ### G.4 `bootstrap-node.sh` polish | |
| 138 | + | ||
| 139 | + | From this cutover and Session 2: | |
| 140 | + | ||
| 141 | + | - **Detect `nologin` shell** on `SERVICE_USER` and refuse with a clear error (or auto-`chsh`). Costs ~1 min of cutover time if you don't know to check. | |
| 142 | + | - **Sibling `bootstrap-node-postgres.sh`** for the common pg_ident map case (when SERVICE_USER ≠ pg role name). Or document the manual steps in the script's "next steps" output. | |
| 143 | + | - **README-postgres.md note** on the sqlx URL form: `postgres:///db?host=/var/run/postgresql&user=name`, not `postgres://user@/db?host=...`. | |
| 144 | + | ||
| 145 | + | ### G.5 `ASSUMPTIONS_PATH` mismatch | |
| 146 | + | ||
| 147 | + | `sando-daemon.toml` puts the file at `<release>/docs/assumptions.toml`; prod's pre-existing env expected `<release>/docs/business/assumptions.toml` (matching the source layout `server/docs/business/assumptions.toml`). Worked around with an env edit during cutover but both prod and testnot now have non-canonical `ASSUMPTIONS_PATH=/opt/mnw/current/docs/assumptions.toml`. Fix: change `release_contents[3].dst` in `sando-daemon.toml` to `docs/business/assumptions.toml` and revert the env path on both nodes. Small, do it during the 0.9.6 sprint. | |
| 148 | + | ||
| 149 | + | ## H. Key paths (for orientation) | |
| 150 | + | ||
| 151 | + | - `MNW/sando/sando.toml` — tier B definition (`makenotwork@alpha-west-1`). | |
| 152 | + | - `MNW/sando/deploy/bootstrap-node.sh` — node-bootstrap; ran on prod with `SERVICE_USER=makenotwork`. | |
| 153 | + | - `MNW/sando/daemon/sando-daemon.toml` — release_contents (note §G.5 ASSUMPTIONS_PATH mismatch). | |
| 154 | + | - `MNW/server/src/{git_ssh.rs, build_runner.rs, bin/mnw-admin.rs, routes/api/ssh_keys.rs}` — the four hardcoded path sites. | |
| 155 | + | - `MNW/server/deploy/backup-db.sh` — hardcoded backup dir. | |
| 156 | + | - `/etc/systemd/system/makenotwork.service` (prod) — new FHS unit. | |
| 157 | + | - `/etc/mnw/makenotwork.env` (prod) — new env file location. | |
| 158 | + | - `/etc/sudoers.d/mnw-git-ssh` (prod) — updated to `/opt/mnw/current/mnw-admin`. | |
| 159 | + | - `/etc/caddy/Caddyfile` (prod) — three error-pages refs updated. | |
| 160 | + | - `/opt/makenotwork/` (prod) — full pre-cutover state, kept for soak rollback. | |
| 161 | + | - `launchplan_final.md` §6.5 step 8 — original plan this session closes. | |
| 162 | + | - `launchplan_final.md` §6.9 — Session 2/3 gotchas summary. | |
| 163 | + | - `launchplan_final.md` §7 — 0.9.6 path-decoupling spec. |
| @@ -61,8 +61,12 @@ gates = [ | |||
| 61 | 61 | ] | |
| 62 | 62 | [[tier.node]] | |
| 63 | 63 | name = "prod-1" | |
| 64 | - | ssh_target = "deploy@prod-1.makenot.work" | |
| 64 | + | # Tailnet name; port 2200 supplied via /srv/sando/.ssh/config Host block. | |
| 65 | + | # Service user is "makenotwork" (pre-existing on prod), not "deploy" — chose | |
| 66 | + | # not to chown 885M of backups + redo postgres peer auth for a cosmetic rename. | |
| 67 | + | ssh_target = "makenotwork@alpha-west-1" | |
| 65 | 68 | release_root = "/opt/mnw" | |
| 69 | + | service_name = "makenotwork.service" | |
| 66 | 70 | ||
| 67 | 71 | # ---- C: prod-2 (declared, not yet provisioned) ---- | |
| 68 | 72 | [[tier]] |