Skip to main content

max / makenotwork

14.6 KB · 344 lines History Blame Raw
1 //! Typed gate outcomes.
2 //!
3 //! Replaces the `(passed: bool, detail: Option<String>)` pair on
4 //! `GateOutcome`. The point is to push failure classification into the
5 //! type itself: a `GateFailure::MigrationDrift { migration }` is what it
6 //! says, not a string the operator has to parse. See
7 //! `plans/observability.md` for the full argument.
8 //!
9 //! The variants here describe what the gate runner actually observed.
10 //! Mapping raw process output (stderr tails, exit codes) to these
11 //! variants is the classifier's job — `classify.rs`.
12
13 use crate::domain::{GateKind, Version};
14 use chrono::{DateTime, Utc};
15 use serde::{Deserialize, Serialize};
16
17 /// A gate's result, persisted to `gate_runs.outcome_json` and emitted
18 /// over WS in `GateDone`.
19 #[derive(Debug, Clone, Serialize, Deserialize)]
20 pub struct GateOutcome {
21 pub status: GateStatus,
22 /// Relative path under `cfg.logs_root` to the persisted stdout/stderr
23 /// for this run. `None` for gates that don't produce process output
24 /// (burn_in, manual_confirm).
25 #[serde(skip_serializing_if = "Option::is_none", default)]
26 pub log_ref: Option<LogRef>,
27 }
28
29 impl GateOutcome {
30 pub fn passed(note: PassNote) -> Self {
31 Self { status: GateStatus::Passed { note }, log_ref: None }
32 }
33 pub fn failed(failure: GateFailure) -> Self {
34 Self { status: GateStatus::Failed { failure }, log_ref: None }
35 }
36 pub fn blocked(blocker: GateBlocker) -> Self {
37 Self { status: GateStatus::Blocked { blocker }, log_ref: None }
38 }
39 pub fn with_log_ref(mut self, log_ref: LogRef) -> Self {
40 self.log_ref = Some(log_ref);
41 self
42 }
43
44 /// True iff the gate ran and succeeded. `Blocked` is not passing:
45 /// the gate has not satisfied the pipeline, the operator just owes
46 /// it a precondition.
47 pub fn is_passed(&self) -> bool {
48 matches!(self.status, GateStatus::Passed { .. })
49 }
50
51 /// The high-level status word for the `gate_runs.status` column.
52 pub fn status_str(&self) -> &'static str {
53 match self.status {
54 GateStatus::Passed { .. } => "passed",
55 GateStatus::Failed { .. } => "failed",
56 GateStatus::Blocked { .. } => "blocked",
57 }
58 }
59 }
60
61 #[derive(Debug, Clone, Serialize, Deserialize)]
62 #[serde(tag = "kind", rename_all = "snake_case")]
63 pub enum GateStatus {
64 /// Gate ran and succeeded. The note carries gate-specific evidence
65 /// (e.g. `TestsPassed { duration_s }`).
66 Passed { note: PassNote },
67 /// Gate ran and failed. Two-layer tag: outer `kind = "failed"`, inner
68 /// `failure.kind` names the classified variant. If no classifier
69 /// matched, that's `unclassified`.
70 Failed { failure: GateFailure },
71 /// Gate cannot run yet. Burn-in clock not started, scratch DB not
72 /// configured, backup missing — pre-conditions the operator can fix
73 /// out of band. Distinguished from `Failed` so the TUI can render
74 /// these yellow rather than red.
75 Blocked { blocker: GateBlocker },
76 }
77
78 #[derive(Debug, Clone, Serialize, Deserialize)]
79 #[serde(tag = "kind", rename_all = "snake_case")]
80 pub enum PassNote {
81 /// `boot_smoke` — the binary stayed up for the smoke window.
82 StayedUp { duration_s: u32 },
83 /// `burn_in` — the configured number of hours have elapsed since
84 /// the gate's clock started.
85 BurnInElapsed { hours: u32 },
86 /// `migration_dry_run` — scratch DB restored from `backup_path` and
87 /// every migration ran without error.
88 Migrated { backup_path: String },
89 /// `cargo_test` — `cargo test --release` exited 0.
90 TestsPassed { duration_s: u32 },
91 /// `manual_confirm` — an operator inserted a passing row out-of-band.
92 OperatorConfirmed { at: DateTime<Utc> },
93 /// Legacy rows backfilled from the pre-typed schema. Carries the
94 /// original `detail` string so nothing is lost.
95 Legacy { text: String },
96 }
97
98 impl PassNote {
99 pub fn summary(&self) -> String {
100 match self {
101 PassNote::StayedUp { duration_s } => format!("stayed up for {duration_s}s"),
102 PassNote::BurnInElapsed { hours } => format!("{hours} hours elapsed"),
103 PassNote::Migrated { backup_path } => format!("restored {backup_path} + migrated"),
104 PassNote::TestsPassed { duration_s } => format!("tests passed in {duration_s}s"),
105 PassNote::OperatorConfirmed { at } => format!("operator confirmed at {at}"),
106 PassNote::Legacy { text } => text.clone(),
107 }
108 }
109 }
110
111 #[derive(Debug, Clone, Serialize, Deserialize)]
112 #[serde(tag = "kind", rename_all = "snake_case")]
113 pub enum GateBlocker {
114 /// `burn_in`: the tier's `tier_state.burn_in_started_at` is NULL.
115 BurnInClockNotStarted,
116 /// `burn_in`: clock running but not enough time elapsed yet.
117 BurnInRemaining { hours_remaining: u32, hours_total: u32 },
118 /// `manual_confirm`: no out-of-band passing row exists for this
119 /// (tier, version).
120 AwaitingOperatorConfirmation,
121 /// `migration_dry_run`: no row in `backups` to restore from.
122 NoBackupAvailable,
123 /// `migration_dry_run` / `boot_smoke` / `cargo_test`: daemon config
124 /// has no `scratch_db_url`.
125 ScratchDbUrlUnset,
126 /// `boot_smoke`: no `artifact_path` in `versions` for this version.
127 ArtifactMissing { version: Version },
128 }
129
130 impl GateBlocker {
131 pub fn summary(&self) -> String {
132 match self {
133 GateBlocker::BurnInClockNotStarted => "burn-in clock not started".into(),
134 GateBlocker::BurnInRemaining { hours_remaining, hours_total } =>
135 format!("{hours_remaining} hours remaining of {hours_total}"),
136 GateBlocker::AwaitingOperatorConfirmation => "waiting on operator confirmation".into(),
137 GateBlocker::NoBackupAvailable => "no backup fetched; call /backup/fetch first".into(),
138 GateBlocker::ScratchDbUrlUnset => "scratch_db_url unset in daemon config".into(),
139 GateBlocker::ArtifactMissing { version } => format!("no artifact for version {version}"),
140 }
141 }
142 }
143
144 #[derive(Debug, Clone, Serialize, Deserialize)]
145 #[serde(tag = "kind", rename_all = "snake_case")]
146 pub enum GateFailure {
147 /// `cargo_test` exited non-zero. `failed_count` may be 0 if the
148 /// classifier couldn't parse the count (e.g. compile error).
149 CargoTest { failed_count: u32, first_failed: Option<String> },
150 /// `migration_dry_run`: a migration that was previously applied is
151 /// no longer present in the resolved migrations directory.
152 MigrationDrift { migration: String },
153 /// `migration_dry_run`: a migration that was previously applied has
154 /// been modified (checksum mismatch).
155 MigrationModified { migration: String },
156 /// `migration_dry_run`: postgres rejected a migration's SQL.
157 MigrationSqlError { migration: String, sqlstate: Option<String> },
158 /// `migration_dry_run`: scratch DB reset or dump restore failed.
159 RestoreFailed { reason: String },
160 /// `boot_smoke`: binary exited with a non-zero status during the
161 /// smoke window. Most likely a panic; `exit_code` carries the OS
162 /// status when one is available.
163 BootPanic { exit_code: Option<i32> },
164 /// `boot_smoke`: binary exited 0 before the smoke window elapsed.
165 BootExitedEarly { exit_code: Option<i32> },
166 /// `cargo_test` / `boot_smoke`: tokio could not spawn the child.
167 SpawnFailed { message: String },
168 /// Gate took longer than the configured ceiling.
169 Timeout { gate: GateKind, after_s: u32 },
170 /// Classifier could not match the output to any known variant. The
171 /// `log_ref` on the enclosing `GateOutcome` is the diagnostic path.
172 Unclassified { legacy_detail: Option<String> },
173 }
174
175 impl GateFailure {
176 pub fn summary(&self) -> String {
177 match self {
178 GateFailure::CargoTest { failed_count, first_failed: Some(name) } =>
179 format!("{failed_count} test(s) failed; first: {name}"),
180 GateFailure::CargoTest { failed_count, first_failed: None } =>
181 format!("{failed_count} test(s) failed"),
182 GateFailure::MigrationDrift { migration } =>
183 format!("migration {migration} previously applied but missing"),
184 GateFailure::MigrationModified { migration } =>
185 format!("migration {migration} previously applied but modified"),
186 GateFailure::MigrationSqlError { migration, sqlstate: Some(s) } =>
187 format!("migration {migration} sql error ({s})"),
188 GateFailure::MigrationSqlError { migration, sqlstate: None } =>
189 format!("migration {migration} sql error"),
190 GateFailure::RestoreFailed { reason } => format!("restore: {reason}"),
191 GateFailure::BootPanic { exit_code: Some(c) } => format!("binary panicked: exit {c}"),
192 GateFailure::BootPanic { exit_code: None } => "binary panicked".into(),
193 GateFailure::BootExitedEarly { exit_code: Some(c) } => format!("binary exited early: exit {c}"),
194 GateFailure::BootExitedEarly { exit_code: None } => "binary exited early".into(),
195 GateFailure::SpawnFailed { message } => format!("spawn: {message}"),
196 GateFailure::Timeout { gate, after_s } => format!("{gate} timed out after {after_s}s"),
197 GateFailure::Unclassified { legacy_detail: Some(d) } => d.clone(),
198 GateFailure::Unclassified { legacy_detail: None } => "unclassified failure".into(),
199 }
200 }
201 }
202
203 // ---------------------------------------------------------------------
204 // Deploy outcomes (step 7)
205 // ---------------------------------------------------------------------
206
207 /// Typed outcome of one node-deploy attempt. Stored as `outcome_json` in
208 /// the `deploys` table and emitted in `Event::DeployFailed` so consumers
209 /// can distinguish a node-unreachable error (operator: check the box)
210 /// from rsync mid-transfer corruption (operator: check disk/network).
211 #[derive(Debug, Clone, Serialize, Deserialize)]
212 pub struct DeployOutcome {
213 pub status: DeployStatus,
214 }
215
216 impl DeployOutcome {
217 pub fn ok() -> Self { Self { status: DeployStatus::Ok } }
218 pub fn failed(failure: DeployFailureKind) -> Self {
219 Self { status: DeployStatus::Failed { failure } }
220 }
221 pub fn in_progress() -> Self { Self { status: DeployStatus::InProgress } }
222
223 /// `'in_progress' | 'ok' | 'failed'` — the value of the legacy
224 /// `deploys.outcome` column.
225 pub fn status_str(&self) -> &'static str {
226 match self.status {
227 DeployStatus::InProgress => "in_progress",
228 DeployStatus::Ok => "ok",
229 DeployStatus::Failed { .. } => "failed",
230 }
231 }
232 }
233
234 #[derive(Debug, Clone, Serialize, Deserialize)]
235 #[serde(tag = "kind", rename_all = "snake_case")]
236 pub enum DeployStatus {
237 InProgress,
238 Ok,
239 Failed { failure: DeployFailureKind },
240 }
241
242 #[derive(Debug, Clone, Serialize, Deserialize)]
243 #[serde(tag = "kind", rename_all = "snake_case")]
244 pub enum DeployFailureKind {
245 /// SSH to the node failed before any state changed. Typically a dead
246 /// host, network partition, or stale known_hosts.
247 NodeUnreachable { detail: String },
248 /// rsync exited non-zero mid-transfer. The on-target release dir may
249 /// be partially populated, but the `current` symlink is untouched.
250 RsyncFailed { detail: String },
251 /// Files copied successfully but the atomic symlink swap step
252 /// failed. The new release is on disk; the service is still running
253 /// the old one.
254 SymlinkSwapFailed { detail: String },
255 /// Symlink swapped but `systemctl reload-or-restart` returned
256 /// non-zero. The new code is current but the service may have
257 /// crashed on startup.
258 ServiceRestartFailed { detail: String },
259 /// Classifier couldn't match the error to a known variant. The full
260 /// anyhow chain is in `detail`.
261 Unclassified { detail: String },
262 }
263
264 impl DeployFailureKind {
265 pub fn summary(&self) -> String {
266 match self {
267 DeployFailureKind::NodeUnreachable { detail } => format!("node unreachable: {detail}"),
268 DeployFailureKind::RsyncFailed { detail } => format!("rsync: {detail}"),
269 DeployFailureKind::SymlinkSwapFailed { detail } => format!("symlink swap: {detail}"),
270 DeployFailureKind::ServiceRestartFailed { detail } => format!("service restart: {detail}"),
271 DeployFailureKind::Unclassified { detail } => detail.chars().take(200).collect(),
272 }
273 }
274 }
275
276 /// Pointer to the on-disk gate log: a path relative to `cfg.logs_root`
277 /// of the form `<version>/<gate_kind>.log`. Stored in `gate_runs.log_ref`
278 /// and surfaced in `/state` so the TUI/operator can request the full
279 /// tail via `GET /logs/<version>/<gate>` only when needed.
280 #[derive(Debug, Clone, Serialize, Deserialize)]
281 #[serde(transparent)]
282 pub struct LogRef(pub String);
283
284 impl LogRef {
285 pub fn new(version: &Version, gate: GateKind) -> Self {
286 Self(format!("{}/{}.log", version, gate.as_str()))
287 }
288 pub fn as_str(&self) -> &str { &self.0 }
289 }
290
291 impl std::fmt::Display for LogRef {
292 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { self.0.fmt(f) }
293 }
294
#[cfg(test)]
mod tests {
    use super::*;

    /// A failed outcome nests the failure's own tag inside the status tag.
    #[test]
    fn outcome_serialization_is_two_layer_tagged() {
        let failure = GateFailure::MigrationDrift {
            migration: String::from("0047_widgets"),
        };
        let json = serde_json::to_value(GateOutcome::failed(failure)).unwrap();
        let status = &json["status"];
        assert_eq!(status["kind"], "failed");
        assert_eq!(status["failure"]["kind"], "migration_drift");
        assert_eq!(status["failure"]["migration"], "0047_widgets");
    }

    /// Serializing and deserializing a passing outcome keeps it passing.
    #[test]
    fn outcome_round_trips_through_json() {
        let original = GateOutcome::passed(PassNote::TestsPassed { duration_s: 42 });
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded = serde_json::from_str::<GateOutcome>(&encoded).unwrap();
        assert!(decoded.is_passed());
        assert_eq!(decoded.status_str(), "passed");
    }

    /// A blocked gate must never be reported as passed.
    #[test]
    fn blocked_is_not_passed() {
        let outcome = GateOutcome::blocked(GateBlocker::BurnInClockNotStarted);
        assert!(!outcome.is_passed());
        assert_eq!(outcome.status_str(), "blocked");
    }

    /// `LogRef::new` yields `<version>/<gate_kind>.log`.
    #[test]
    fn log_ref_construction_matches_disk_layout() {
        let version = "0.9.6".parse::<Version>().unwrap();
        let log_ref = LogRef::new(&version, GateKind::CargoTest);
        assert_eq!(log_ref.as_str(), "0.9.6/cargo_test.log");
    }

    /// The legacy detail string survives serialization of an
    /// unclassified failure.
    #[test]
    fn unclassified_preserves_legacy_detail() {
        let legacy = "binary exited early: exit status: 101\n==== stdout ====\n...";
        let outcome = GateOutcome::failed(GateFailure::Unclassified {
            legacy_detail: Some(legacy.into()),
        });
        let json = serde_json::to_value(&outcome).unwrap();
        let failure = &json["status"]["failure"];
        assert_eq!(failure["kind"], "unclassified");
        let detail = failure["legacy_detail"].as_str().unwrap();
        assert!(detail.contains("exit status: 101"));
    }
}
344