Skip to main content

max / makenotwork

14.7 KB · 360 lines History Blame Raw
1 //! Gate-output classifiers.
2 //!
3 //! Each `classify_*` function takes the raw signals produced by a gate
4 //! runner (exit status, stdout/stderr tails, sqlx error strings) and
5 //! maps them to a typed `GateFailure`. Anything that doesn't match a
6 //! known pattern returns `GateFailure::Unclassified` with the original
7 //! detail attached — the on-disk gate log is the ultimate fallback.
8 //!
9 //! Classifiers are pure functions: no IO, no async. That makes them
10 //! fixture-testable, and it keeps the `gates.rs` runner code in charge
11 //! of side effects (process spawning, log persistence).
12
13 use crate::outcome::GateFailure;
14
15 /// `cargo_test`: derive a `CargoTest` failure with whatever counts can
16 /// be lifted out of the test runner's output.
17 ///
18 /// libtest emits a `test result: FAILED. P passed; F failed; ...` line
19 /// near the end of stdout. We grab `F` from that. If the output never
20 /// reached that line (compile error, runtime panic in the harness), we
21 /// fall through to `Unclassified`.
22 pub fn classify_cargo_test(stdout: &[u8], stderr: &[u8]) -> GateFailure {
23 let stdout_s = String::from_utf8_lossy(stdout);
24
25 let mut failed_count: u32 = 0;
26 let mut first_failed: Option<String> = None;
27
28 // `test result: FAILED. P passed; F failed; ...` lives near the
29 // end. Walk backwards to find it cheaply on very large outputs.
30 for line in stdout_s.lines().rev().take(50) {
31 if let Some(rest) = line.strip_prefix("test result: FAILED.") {
32 // Expect "P passed; F failed; ..."
33 for piece in rest.split(';') {
34 let p = piece.trim();
35 if let Some(num_str) = p.strip_suffix(" failed") {
36 if let Ok(n) = num_str.parse::<u32>() {
37 failed_count = n;
38 }
39 }
40 }
41 break;
42 }
43 }
44
45 // libtest prints "failures:\n foo::bar" near the end too. Grab
46 // the first one for the summary line.
47 if let Some(idx) = stdout_s.find("\nfailures:\n") {
48 for line in stdout_s[idx + 11..].lines() {
49 let trimmed = line.trim();
50 if trimmed.is_empty() { break; }
51 // The "failures:" block repeats — once with stdout per
52 // failure, once as a plain name list. Either way the first
53 // non-empty line is a candidate.
54 first_failed = Some(trimmed.to_string());
55 break;
56 }
57 }
58
59 if failed_count == 0 && first_failed.is_none() {
60 // Compile error or harness panic — no usable signal in stdout.
61 return GateFailure::Unclassified {
62 legacy_detail: Some(combined_tail_for_classifier(stdout, stderr)),
63 };
64 }
65
66 GateFailure::CargoTest { failed_count, first_failed }
67 }
68
69 /// `migration_dry_run` is staged: scratch reset → restore dump → run
70 /// migrator. Each stage has its own failure mode. The caller (the gate
71 /// runner) knows which stage tripped; classifiers here turn the stage's
72 /// error string into a typed variant.
73 ///
74 /// Inputs are the migration name (when known) and the error string sqlx
75 /// returned. `migration` defaults to "?" when sqlx couldn't tell us
76 /// which file blew up.
77 pub fn classify_migration_error(err: &str, migration_hint: Option<&str>) -> GateFailure {
78 // sqlx::migrate::MigrateError variants are stringified consistently.
79 // Examples from `plans/migration-dryrun-failures.md`:
80 // "migration 47 was previously applied but is missing in the resolved migrations"
81 // "migration 47 was previously applied but has been modified"
82 // sqlx::Error::Database with sqlstate (e.g. "42P01" relation does not exist)
83
84 if let Some(m) = extract_drift(err) {
85 return GateFailure::MigrationDrift { migration: m };
86 }
87 if let Some(m) = extract_modified(err) {
88 return GateFailure::MigrationModified { migration: m };
89 }
90 let sqlstate = extract_sqlstate(err);
91 let migration = migration_hint.map(str::to_owned).unwrap_or_else(|| "?".to_owned());
92 if sqlstate.is_some() {
93 return GateFailure::MigrationSqlError { migration, sqlstate };
94 }
95 GateFailure::Unclassified {
96 legacy_detail: Some(err.chars().take(4_000).collect()),
97 }
98 }
99
/// Pull the migration identifier out of a drift message of the form
/// "migration N was previously applied but is missing in the resolved migrations".
///
/// Returns the space-delimited token immediately before the marker
/// phrase (or everything before it when there is no space), and `None`
/// when the marker phrase is absent.
fn extract_drift(err: &str) -> Option<String> {
    const MARKER: &str = " was previously applied but is missing";

    let (head, _tail) = err.split_once(MARKER)?;
    // `rsplit` always yields at least one item, so the fallback is only
    // there to keep this total.
    let name = head.rsplit(' ').next().unwrap_or(head);
    Some(name.to_owned())
}
107
/// Pull the migration identifier out of a message of the form
/// "migration N was previously applied but has been modified".
///
/// Returns the space-delimited token immediately before the marker
/// phrase (or everything before it when there is no space), and `None`
/// when the marker phrase is absent.
fn extract_modified(err: &str) -> Option<String> {
    const MARKER: &str = " was previously applied but has been modified";

    let (head, _tail) = err.split_once(MARKER)?;
    // `rsplit` always yields at least one item, so the fallback is only
    // there to keep this total.
    let name = head.rsplit(' ').next().unwrap_or(head);
    Some(name.to_owned())
}
114
/// Extract the SQLSTATE from an error string.
///
/// Postgres errors surface as `... code: "42P01" ...` in the Debug form
/// sqlx produces. This returns the text between the first `code: "` and
/// the next double quote, or `None` when either delimiter is missing.
fn extract_sqlstate(err: &str) -> Option<String> {
    const OPENING: &str = "code: \"";

    let (_before, after) = err.split_once(OPENING)?;
    let (code, _rest) = after.split_once('"')?;
    Some(code.to_owned())
}
123
124 /// `boot_smoke`: process exit info is the dominant signal. If the
125 /// binary exited with a status during the smoke window, we map exit
126 /// code 101 (Rust default for panic) to `BootPanic`, everything else
127 /// to `BootExitedEarly`. If it never exited (stayed up), the caller
128 /// constructs `PassNote::StayedUp` directly without consulting this.
129 pub fn classify_boot_smoke(exit_code: Option<i32>) -> GateFailure {
130 match exit_code {
131 Some(101) => GateFailure::BootPanic { exit_code: Some(101) },
132 Some(c) if c < 0 => GateFailure::BootPanic { exit_code: Some(c) }, // killed by signal
133 Some(c) => GateFailure::BootExitedEarly { exit_code: Some(c) },
134 None => GateFailure::BootExitedEarly { exit_code: None },
135 }
136 }
137
138 /// `Event::DeployFailed`: classify an anyhow chain produced by
139 /// `deploy::deploy_node` into a typed `DeployFailureKind`.
140 ///
141 /// The anyhow chain is the `format!("{e:#}")` string the caller built,
142 /// which joins each `.context(...)` layer with ": ". We probe for the
143 /// contexts attached by `deploy_remote` (and well-known stderr patterns
144 /// from ssh/rsync) in order of specificity.
145 pub fn classify_deploy_error(err: &str) -> crate::outcome::DeployFailureKind {
146 use crate::outcome::DeployFailureKind as K;
147
148 // SSH-level transport failures bubble up under whatever context
149 // their caller attached. Probe for the canonical OpenSSH stderr
150 // patterns first so a "creating remote release dir: ... Connection
151 // refused" doesn't get filed under NodeUnreachable's prose label.
152 let unreachable_signals = [
153 "Connection refused",
154 "Connection timed out",
155 "Network is unreachable",
156 "No route to host",
157 "Could not resolve hostname",
158 "Host key verification failed",
159 "Permission denied (publickey",
160 ];
161 if unreachable_signals.iter().any(|p| err.contains(p)) {
162 return K::NodeUnreachable { detail: err.chars().take(400).collect() };
163 }
164
165 // The contexts attached by `deploy_remote` (deploy.rs) are stable
166 // strings; treat them as anchors. Order matters — "symlink swap +
167 // systemctl" appears after a successful rsync, so probe rsync first
168 // to avoid catching it under the swap heading.
169 if err.contains("rsync failed") || err.contains("spawning rsync") {
170 return K::RsyncFailed { detail: err.chars().take(400).collect() };
171 }
172 if err.contains("creating remote release dir") {
173 return K::NodeUnreachable { detail: err.chars().take(400).collect() };
174 }
175 if err.contains("symlink swap + systemctl") {
176 // Heuristic split inside the combined step: stderr containing
177 // "systemctl" suggests the swap succeeded and the restart failed.
178 if err.contains("systemctl") && !err.contains("ln:") {
179 return K::ServiceRestartFailed { detail: err.chars().take(400).collect() };
180 }
181 return K::SymlinkSwapFailed { detail: err.chars().take(400).collect() };
182 }
183 if err.contains("symlink swap failed") {
184 return K::SymlinkSwapFailed { detail: err.chars().take(400).collect() };
185 }
186
187 K::Unclassified { detail: err.chars().take(400).collect() }
188 }
189
/// Concatenate stdout + stderr tails the way the legacy runner did, so
/// `Unclassified.legacy_detail` looks like what operators are used to
/// seeing in `gate_runs.detail` today.
///
/// Layout: a `==== stdout ====` header, the stdout bytes (with a newline
/// appended when they do not already end in one), a `==== stderr ====`
/// header, then the stderr bytes. The whole buffer is decoded lossily as
/// UTF-8.
///
/// When the decoded text exceeds 4,000 bytes, only the tail is kept and
/// prefixed with "...". The cut is moved forward to the nearest
/// character boundary, so the kept tail may be up to three bytes shorter
/// than 4,000 rather than splitting a multi-byte character.
fn combined_tail_for_classifier(stdout: &[u8], stderr: &[u8]) -> String {
    const MAX_TAIL_BYTES: usize = 4_000;
    const STDOUT_HEADER: &[u8] = b"==== stdout ====\n";
    const STDERR_HEADER: &[u8] = b"==== stderr ====\n";

    let mut joined = Vec::with_capacity(
        STDOUT_HEADER.len() + stdout.len() + 1 + STDERR_HEADER.len() + stderr.len(),
    );
    joined.extend_from_slice(STDOUT_HEADER);
    joined.extend_from_slice(stdout);
    if stdout.last() != Some(&b'\n') {
        joined.push(b'\n');
    }
    joined.extend_from_slice(STDERR_HEADER);
    joined.extend_from_slice(stderr);

    let text = String::from_utf8_lossy(&joined);
    if text.len() <= MAX_TAIL_BYTES {
        return text.into_owned();
    }

    // Slicing a `str` at a byte offset inside a multi-byte character
    // panics, and non-ASCII output (or the U+FFFD replacements inserted
    // by the lossy decode) can put the naive cut point there. Step
    // forward to the next boundary; `text.len()` is always a boundary,
    // so the loop terminates.
    let mut start = text.len() - MAX_TAIL_BYTES;
    while !text.is_char_boundary(start) {
        start += 1;
    }
    format!("...{}", &text[start..])
}
203
#[cfg(test)]
mod tests {
    //! Fixture tests for the classifiers: each test feeds one
    //! representative raw signal and checks the typed variant that
    //! comes back.

    use super::*;

    #[test]
    fn cargo_test_extracts_failed_count() {
        let stdout = concat!(
            "running 12 tests\n",
            "test foo ... ok\n",
            "test bar ... FAILED\n",
            "test baz ... FAILED\n",
            "\n",
            "failures:\n",
            "foo::bar\n",
            "foo::baz\n",
            "\n",
            "test result: FAILED. 10 passed; 2 failed; 0 ignored\n",
        )
        .as_bytes();
        match classify_cargo_test(stdout, b"") {
            GateFailure::CargoTest { failed_count, first_failed } => {
                assert_eq!(failed_count, 2);
                assert_eq!(first_failed.as_deref(), Some("foo::bar"));
            }
            _ => panic!("expected CargoTest variant"),
        }
    }

    #[test]
    fn cargo_test_compile_error_is_unclassified() {
        // No "test result:" line because cargo never got to running.
        let outcome = classify_cargo_test(b"", b"error[E0382]: borrow of moved value: `x`\n");
        let GateFailure::Unclassified { legacy_detail: Some(detail) } = &outcome else {
            panic!("expected Unclassified, got {outcome:?}");
        };
        assert!(detail.contains("borrow of moved value"));
    }

    #[test]
    fn migration_drift_extracts_name() {
        let outcome = classify_migration_error(
            "migration 0047_widgets was previously applied but is missing in the resolved migrations",
            None,
        );
        let GateFailure::MigrationDrift { migration } = &outcome else {
            panic!("expected MigrationDrift, got {outcome:?}");
        };
        assert_eq!(migration, "0047_widgets");
    }

    #[test]
    fn migration_modified_extracts_name() {
        let outcome = classify_migration_error(
            "migration 0042_seed was previously applied but has been modified",
            None,
        );
        let GateFailure::MigrationModified { migration } = &outcome else {
            panic!("expected MigrationModified, got {outcome:?}");
        };
        assert_eq!(migration, "0042_seed");
    }

    #[test]
    fn migration_sql_error_extracts_sqlstate() {
        let err = r#"while executing migrations: error returned from database: code: "42P01" message: "relation \"widgets\" does not exist""#;
        let outcome = classify_migration_error(err, Some("0050_drop_widgets"));
        let GateFailure::MigrationSqlError { migration, sqlstate } = &outcome else {
            panic!("expected MigrationSqlError, got {outcome:?}");
        };
        assert_eq!(migration, "0050_drop_widgets");
        assert_eq!(sqlstate.as_deref(), Some("42P01"));
    }

    #[test]
    fn migration_unknown_error_is_unclassified() {
        let outcome = classify_migration_error("something went wrong with the universe", None);
        let GateFailure::Unclassified { legacy_detail: Some(detail) } = &outcome else {
            panic!("expected Unclassified, got {outcome:?}");
        };
        assert!(detail.contains("universe"));
    }

    #[test]
    fn boot_smoke_101_is_panic() {
        let got = classify_boot_smoke(Some(101));
        if !matches!(got, GateFailure::BootPanic { exit_code: Some(101) }) {
            panic!("expected BootPanic(101), got {got:?}");
        }
    }

    #[test]
    fn boot_smoke_signal_is_panic() {
        let got = classify_boot_smoke(Some(-9));
        if !matches!(got, GateFailure::BootPanic { exit_code: Some(-9) }) {
            panic!("expected BootPanic(-9), got {got:?}");
        }
    }

    #[test]
    fn boot_smoke_other_exit_is_exited_early() {
        let got = classify_boot_smoke(Some(2));
        if !matches!(got, GateFailure::BootExitedEarly { exit_code: Some(2) }) {
            panic!("expected BootExitedEarly(2), got {got:?}");
        }
    }

    #[test]
    fn deploy_connection_refused_is_node_unreachable() {
        use crate::outcome::DeployFailureKind as K;
        let got = classify_deploy_error(
            "creating remote release dir: ssh testnot-1 failed: ssh: connect to host testnot-1 port 22: Connection refused",
        );
        if !matches!(got, K::NodeUnreachable { .. }) {
            panic!("expected NodeUnreachable, got {got:?}");
        }
    }

    #[test]
    fn deploy_rsync_failure_is_rsync_failed() {
        use crate::outcome::DeployFailureKind as K;
        let got = classify_deploy_error(
            "rsync failed (current symlink left intact): rsync: write failed on \"/srv/.../makenotwork\": No space left on device (28)",
        );
        let K::RsyncFailed { detail } = &got else {
            panic!("expected RsyncFailed, got {got:?}");
        };
        assert!(detail.contains("No space left"));
    }

    #[test]
    fn deploy_systemctl_failure_is_service_restart_failed() {
        use crate::outcome::DeployFailureKind as K;
        // The combined "swap + restart" step where stderr mentions systemctl.
        let got = classify_deploy_error(
            "symlink swap + systemctl reload-or-restart: ssh testnot-1 failed: Failed to restart makenotwork.service: Unit makenotwork.service failed to start",
        );
        if !matches!(got, K::ServiceRestartFailed { .. }) {
            panic!("expected ServiceRestartFailed, got {got:?}");
        }
    }

    #[test]
    fn deploy_ln_failure_is_symlink_swap_failed() {
        use crate::outcome::DeployFailureKind as K;
        let got = classify_deploy_error(
            "symlink swap + systemctl reload-or-restart: ssh testnot-1 failed: ln: failed to create symbolic link: Permission denied",
        );
        if !matches!(got, K::SymlinkSwapFailed { .. }) {
            panic!("expected SymlinkSwapFailed, got {got:?}");
        }
    }

    #[test]
    fn deploy_unknown_is_unclassified() {
        use crate::outcome::DeployFailureKind as K;
        let got = classify_deploy_error("something went wrong in a way we did not anticipate");
        let K::Unclassified { detail } = &got else {
            panic!("expected Unclassified, got {got:?}");
        };
        assert!(detail.contains("anticipate"));
    }
}
360