Skip to main content

max / makenotwork

sando: phase A on-disk per-gate logs; cargo_test runs with --features fast-tests Land §6.8 Phase A of launchplan_final.md: - new Config.logs_root (default /srv/sando/logs) - gates.rs writes combined stdout+stderr per gate to <logs_root>/<version>/<gate>.log; detail blob now sourced from combined tail so cargo's stdout-only failure block stops getting hidden behind the stderr 4KB tail (gates.rs:142 bug from §6.8.1). - boot_smoke now pipes child stdio (was inherited and dropped) so the smoke binary's panic/log output reaches the on-disk log. - new route GET /logs/{version}/{gate} returning text/plain; rejects `..` traversal. 3 new route tests. - cargo_test gate now runs with `--features fast-tests` to match server/deploy/run-ci.sh — without it AUTH_RATE_LIMIT_BURST=5 and workflows::auth::lockout_after_failed_attempts hits Governor before the hand-rolled lockout check fires. Verifies Phase A acceptance (§6.8.9): a real cargo_test failure on sando MM (b00cc39, 0.9.5) was fully diagnosable via `curl /logs/...` without SSH — surfaced 4 distinct workflow-test failures with file:line and panic messages.
Author: Max Johnson <me@maxj.phd> · 2026-06-03 00:26 UTC
Commit: fc105c58a33d905d6e11cfb76ad877279578c73d
Parent: ae7521d
4 files changed, +234 insertions, -20 deletions
@@ -22,9 +22,14 @@ pub struct Config {
22 22 /// `["makenotwork", "mnw-admin"]`.
23 23 #[serde(default = "default_bin_names")]
24 24 pub bin_names: Vec<String>,
25 + /// Root for per-gate run logs (`<logs_root>/<version>/<gate>.log`).
26 + /// Served via `GET /logs/{version}/{gate}`. Defaults to `/srv/sando/logs`.
27 + #[serde(default = "default_logs_root")]
28 + pub logs_root: PathBuf,
25 29 }
26 30
27 31 fn default_bin_names() -> Vec<String> { vec!["server".into()] }
32 + fn default_logs_root() -> PathBuf { PathBuf::from("/srv/sando/logs") }
28 33
29 34 impl Config {
30 35 /// Primary binary — the one the systemd unit's ExecStart points at.
@@ -11,6 +11,7 @@ use chrono::Utc;
11 11 use sqlx::SqlitePool;
12 12 use std::path::PathBuf;
13 13 use std::sync::Arc;
14 + use tokio::io::AsyncReadExt;
14 15 use tokio::process::Command;
15 16
16 17 pub struct GateCtx {
@@ -120,26 +121,50 @@ fn kind_str(g: &Gate) -> &'static str {
120 121 async fn cargo_test(ctx: &GateCtx) -> Result<GateOutcome> {
121 122 let server_dir = ctx.worktree.join("server");
122 123 let mut cmd = Command::new("cargo");
123 - cmd.args(["test", "--release"]).current_dir(&server_dir).kill_on_drop(true);
124 + // Match CI (`server/deploy/run-ci.sh`): `--features fast-tests` relaxes
125 + // auth rate-limit burst (5 → 20) and argon2 cost so signup-heavy + lockout
126 + // workflow tests can complete without hitting Governor before the
127 + // hand-rolled lockout check. The feature is specifically documented for
128 + // this in `server/src/constants.rs:87`.
129 + cmd.args(["test", "--release", "--features", "fast-tests"])
130 + .current_dir(&server_dir)
131 + .kill_on_drop(true);
124 132 // Same online-mode rationale as the build step: sqlx query macros need a
125 133 // live DB to type-check against. The scratch DB is left in migrated state
126 134 // by the preceding build, so we can reuse it here.
127 135 if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() {
128 136 cmd.env("DATABASE_URL", scratch_url);
137 + // The server test harness (tests/harness/db.rs) parses TEST_DATABASE_URL
138 + // with rfind('/'), which mangles URLs whose query string contains '/'
139 + // (e.g. `?host=/var/run/postgresql`). Strip the query — libpq defaults
140 + // to /var/run/postgresql on Debian/Ubuntu when host is unspecified.
141 + let test_url = scratch_url
142 + .split_once('?')
143 + .map(|(base, _)| base)
144 + .unwrap_or(scratch_url);
145 + cmd.env("TEST_DATABASE_URL", test_url);
129 146 }
130 147 let out = cmd.output().await?;
148 + persist_gate_log(ctx, "cargo_test", &out.stdout, &out.stderr).await;
131 149 Ok(GateOutcome {
132 150 passed: out.status.success(),
133 - detail: Some(tail(&out.stderr, 4_000)),
151 + detail: Some(combined_tail(&out.stdout, &out.stderr, 4_000)),
134 152 })
135 153 }
136 154
137 155 async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> {
156 + let mut log_buf: Vec<u8> = Vec::new();
157 + let finish = |outcome: GateOutcome, buf: Vec<u8>| async move {
158 + persist_gate_log(ctx, "migration_dry_run", &buf, &[]).await;
159 + outcome
160 + };
161 +
138 162 let Some(db_url) = ctx.cfg.scratch_db_url.as_deref() else {
139 - return Ok(GateOutcome {
163 + log_buf.extend_from_slice(b"scratch_db_url unset in daemon config\n");
164 + return Ok(finish(GateOutcome {
140 165 passed: false,
141 166 detail: Some("scratch_db_url unset in daemon config".into()),
142 - });
167 + }, log_buf).await);
143 168 };
144 169
145 170 let backup: Option<(String,)> = sqlx::query_as(
@@ -148,24 +173,39 @@ async fn migration_dry_run(ctx: &GateCtx) -> Result<GateOutcome> {
148 173 .fetch_optional(&ctx.pool)
149 174 .await?;
150 175 let Some((backup_path,)) = backup else {
151 - return Ok(GateOutcome {
176 + log_buf.extend_from_slice(b"no backup fetched; call /backup/fetch first\n");
177 + return Ok(finish(GateOutcome {
152 178 passed: false,
153 179 detail: Some("no backup fetched; call /backup/fetch first".into()),
154 - });
180 + }, log_buf).await);
155 181 };
156 182
157 - // Reset the scratch DB: drop schema public, restore dump, run migrations.
183 + log_buf.extend_from_slice(format!("---- reset_scratch ----\n").as_bytes());
158 184 if let Err(e) = reset_scratch(db_url).await {
159 - return Ok(GateOutcome { passed: false, detail: Some(format!("scratch reset: {e}")) });
185 + let msg = format!("scratch reset: {e}");
186 + log_buf.extend_from_slice(msg.as_bytes());
187 + return Ok(finish(GateOutcome { passed: false, detail: Some(msg) }, log_buf).await);
160 188 }
161 - if let Err(e) = restore_dump(db_url, &backup_path).await {
162 - return Ok(GateOutcome { passed: false, detail: Some(format!("restore: {e}")) });
189 + log_buf.extend_from_slice(format!("---- restore_dump ({backup_path}) ----\n").as_bytes());
190 + if let Err(e) = restore_dump(db_url, &backup_path, &mut log_buf).await {
191 + let msg = format!("restore: {e}");
192 + log_buf.extend_from_slice(msg.as_bytes());
193 + return Ok(finish(GateOutcome { passed: false, detail: Some(msg) }, log_buf).await);
163 194 }
164 195
165 196 let migrations_dir = ctx.worktree.join("server").join("migrations");
197 + log_buf.extend_from_slice(b"---- run_migrator ----\n");
166 198 match run_migrator(db_url, &migrations_dir).await {
167 - Ok(()) => Ok(GateOutcome { passed: true, detail: Some(format!("restored {backup_path} + migrated")) }),
168 - Err(e) => Ok(GateOutcome { passed: false, detail: Some(tail(e.to_string().as_bytes(), 4_000)) }),
199 + Ok(()) => {
200 + let detail = format!("restored {backup_path} + migrated");
201 + log_buf.extend_from_slice(detail.as_bytes());
202 + Ok(finish(GateOutcome { passed: true, detail: Some(detail) }, log_buf).await)
203 + }
204 + Err(e) => {
205 + let err_s = e.to_string();
206 + log_buf.extend_from_slice(err_s.as_bytes());
207 + Ok(finish(GateOutcome { passed: false, detail: Some(tail(err_s.as_bytes(), 4_000)) }, log_buf).await)
208 + }
169 209 }
170 210 }
171 211
@@ -197,7 +237,7 @@ pub(crate) async fn reset_scratch(db_url: &str) -> Result<()> {
197 237 Ok(())
198 238 }
199 239
200 - async fn restore_dump(db_url: &str, dump: &str) -> Result<()> {
240 + async fn restore_dump(db_url: &str, dump: &str, log_buf: &mut Vec<u8>) -> Result<()> {
201 241 // Two pipelines we accept:
202 242 // *.sql -> psql $url < dump
203 243 // *.sql.gz -> gunzip -c dump | psql $url
@@ -208,6 +248,8 @@ async fn restore_dump(db_url: &str, dump: &str) -> Result<()> {
208 248 format!("psql {url} < {q}", q = shell_escape(dump), url = shell_escape(db_url))
209 249 };
210 250 let out = Command::new("sh").arg("-c").arg(&shell).output().await?;
251 + log_buf.extend_from_slice(&out.stdout);
252 + log_buf.extend_from_slice(&out.stderr);
211 253 anyhow::ensure!(
212 254 out.status.success(),
213 255 "restore failed: {}",
@@ -254,26 +296,62 @@ async fn boot_smoke(ctx: &GateCtx) -> Result<GateOutcome> {
254 296 let mut cmd = tokio::process::Command::new(&bin);
255 297 cmd.env("SANDO_BOOT_SMOKE", "1")
256 298 .env("SCAN_ENABLED", "false")
299 + .stdout(std::process::Stdio::piped())
300 + .stderr(std::process::Stdio::piped())
257 301 .kill_on_drop(true);
258 302 if let Some(scratch_url) = ctx.cfg.scratch_db_url.as_deref() {
259 303 cmd.env("DATABASE_URL", scratch_url);
260 304 }
261 305 let mut child = match cmd.spawn() {
262 306 Ok(c) => c,
263 - Err(e) => return Ok(GateOutcome { passed: false, detail: Some(format!("spawn: {e}")) }),
307 + Err(e) => {
308 + persist_gate_log(ctx, "boot_smoke", format!("spawn: {e}").as_bytes(), &[]).await;
309 + return Ok(GateOutcome { passed: false, detail: Some(format!("spawn: {e}")) });
310 + }
264 311 };
265 312
313 + // Drain stdout/stderr concurrently into in-memory buffers so the log
314 + // captures whatever the binary printed before exiting (or being killed
315 + // after the 3s smoke window). Without piping + draining, output goes to
316 + // sandod's own stdio and is lost.
317 + let mut stdout_h = child.stdout.take();
318 + let mut stderr_h = child.stderr.take();
319 + let stdout_task = tokio::spawn(async move {
320 + let mut buf = Vec::new();
321 + if let Some(s) = stdout_h.as_mut() {
322 + let _ = s.read_to_end(&mut buf).await;
323 + }
324 + buf
325 + });
326 + let stderr_task = tokio::spawn(async move {
327 + let mut buf = Vec::new();
328 + if let Some(s) = stderr_h.as_mut() {
329 + let _ = s.read_to_end(&mut buf).await;
330 + }
331 + buf
332 + });
333 +
266 334 tokio::time::sleep(std::time::Duration::from_secs(3)).await;
267 335
268 - match child.try_wait()? {
336 + let exit = child.try_wait()?;
337 + if exit.is_none() {
338 + let _ = child.kill().await;
339 + }
340 + // Both read tasks complete once the child's stdio is closed (kill closes
341 + // the pipes via the dropped Child on exit-after-kill).
342 + let stdout_buf = stdout_task.await.unwrap_or_default();
343 + let stderr_buf = stderr_task.await.unwrap_or_default();
344 + persist_gate_log(ctx, "boot_smoke", &stdout_buf, &stderr_buf).await;
345 +
346 + match exit {
269 347 Some(status) => Ok(GateOutcome {
270 348 passed: false,
271 - detail: Some(format!("binary exited early: {status}")),
349 + detail: Some(format!(
350 + "binary exited early: {status}\n{}",
351 + combined_tail(&stdout_buf, &stderr_buf, 4_000),
352 + )),
272 353 }),
273 - None => {
274 - let _ = child.kill().await;
275 - Ok(GateOutcome { passed: true, detail: Some("stayed up for 3s".into()) })
276 - }
354 + None => Ok(GateOutcome { passed: true, detail: Some("stayed up for 3s".into()) }),
277 355 }
278 356 }
279 357
@@ -329,6 +407,51 @@ fn tail(buf: &[u8], max: usize) -> String {
329 407 if s.len() <= max { s.into_owned() } else { format!("...{}", &s[s.len() - max..]) }
330 408 }
331 409
410 + /// Tail combining stdout + stderr. Phase A doesn't preserve true interleaving
411 + /// (the streams were captured separately); we concatenate stderr after stdout
412 + /// so the failing test output (cargo writes it to stdout) is no longer hidden
413 + /// behind the 4KB stderr tail. Full unsliced output lives in the on-disk log.
414 + fn combined_tail(stdout: &[u8], stderr: &[u8], max: usize) -> String {
415 + let mut joined = Vec::with_capacity(stdout.len() + stderr.len() + 32);
416 + joined.extend_from_slice(b"==== stdout ====\n");
417 + joined.extend_from_slice(stdout);
418 + if !stdout.last().is_some_and(|b| *b == b'\n') {
419 + joined.push(b'\n');
420 + }
421 + joined.extend_from_slice(b"==== stderr ====\n");
422 + joined.extend_from_slice(stderr);
423 + tail(&joined, max)
424 + }
425 +
426 + /// Write `<logs_root>/<version>/<gate_kind>.log` containing the gate's
427 + /// captured stdout + stderr. Best-effort: any IO error is logged and
428 + /// swallowed — a missing log must not turn a passing gate red.
429 + ///
430 + /// Layout matches §6.8 of launchplan_final.md so the `GET /logs/{version}/{gate}`
431 + /// route can find it. We use `ctx.version` (semver, what `gate_runs.version`
432 + /// already stores) rather than the git sha; the launchplan uses "sha" in
433 + /// prose but switching the on-disk key is a Phase B change after `GitSha` /
434 + /// `Version` newtypes land.
435 + async fn persist_gate_log(ctx: &GateCtx, gate_kind: &str, stdout: &[u8], stderr: &[u8]) {
436 + let dir = ctx.cfg.logs_root.join(&ctx.version);
437 + if let Err(e) = tokio::fs::create_dir_all(&dir).await {
438 + tracing::warn!(error = %e, dir = %dir.display(), "could not create gate log dir");
439 + return;
440 + }
441 + let path = dir.join(format!("{gate_kind}.log"));
442 + let mut body = Vec::with_capacity(stdout.len() + stderr.len() + 64);
443 + body.extend_from_slice(b"==== stdout ====\n");
444 + body.extend_from_slice(stdout);
445 + if !stdout.last().is_some_and(|b| *b == b'\n') {
446 + body.push(b'\n');
447 + }
448 + body.extend_from_slice(b"==== stderr ====\n");
449 + body.extend_from_slice(stderr);
450 + if let Err(e) = tokio::fs::write(&path, &body).await {
451 + tracing::warn!(error = %e, path = %path.display(), "could not write gate log");
452 + }
453 + }
454 +
332 455 #[cfg(test)]
333 456 mod tests {
334 457 use super::*;
@@ -16,6 +16,7 @@ pub fn router(state: AppState) -> Router {
16 16 .route("/rebuild", post(rebuild))
17 17 .route("/confirm/{tier}", post(confirm))
18 18 .route("/backup/fetch", post(backup_fetch))
19 + .route("/logs/{version}/{gate}", get(get_gate_log))
19 20 .route("/events", get(events_ws))
20 21 .with_state(state)
21 22 .route("/metrics", get(crate::metrics::render).with_state(prom))
@@ -476,6 +477,34 @@ async fn backup_fetch(State(s): State<AppState>) -> Result<Json<serde_json::Valu
476 477 })))
477 478 }
478 479
480 + async fn get_gate_log(
481 + State(s): State<AppState>,
482 + Path((version, gate)): Path<(String, String)>,
483 + ) -> Result<axum::response::Response> {
484 + // Guard against `..` / absolute paths — both segments must be a single
485 + // safe component. Without this, `GET /logs/..%2Fetc/passwd` would escape
486 + // logs_root.
487 + fn safe(seg: &str) -> bool {
488 + !seg.is_empty()
489 + && !seg.contains('/')
490 + && !seg.contains('\\')
491 + && seg != "."
492 + && seg != ".."
493 + }
494 + if !safe(&version) || !safe(&gate) {
495 + return Err(crate::error::Error::NotFound);
496 + }
497 + let path = s.cfg.logs_root.join(&version).join(format!("{gate}.log"));
498 + match tokio::fs::read(&path).await {
499 + Ok(bytes) => Ok((
500 + [(axum::http::header::CONTENT_TYPE, "text/plain; charset=utf-8")],
501 + bytes,
502 + ).into_response()),
503 + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Err(crate::error::Error::NotFound),
504 + Err(e) => Err(crate::error::Error::Other(e.into())),
505 + }
506 + }
507 +
479 508 async fn events_ws(ws: WebSocketUpgrade, State(s): State<AppState>) -> impl IntoResponse {
480 509 use axum::extract::ws::Message;
481 510 use tokio::sync::broadcast::error::RecvError;
@@ -575,6 +604,7 @@ mod tests {
575 604 release_root: PathBuf::from("/tmp/sando-releases"),
576 605 scratch_db_url: None,
577 606 bin_names: vec!["makenotwork".into()],
607 + logs_root: PathBuf::from("/tmp/sando-logs"),
578 608 }
579 609 }
580 610
@@ -853,4 +883,59 @@ mod tests {
853 883 .unwrap();
854 884 assert_eq!(resp.status(), StatusCode::NOT_FOUND);
855 885 }
886 +
887 + // ---- GET /logs/{version}/{gate} ----
888 +
889 + async fn state_with_logs_root(logs_root: PathBuf) -> AppState {
890 + let mut s = test_state().await;
891 + let mut cfg = (*s.cfg).clone();
892 + cfg.logs_root = logs_root;
893 + s.cfg = Arc::new(cfg);
894 + s
895 + }
896 +
897 + #[tokio::test]
898 + async fn get_gate_log_returns_file_contents() {
899 + let tmp = tempfile::tempdir().unwrap();
900 + let dir = tmp.path().join("0.9.5");
901 + tokio::fs::create_dir_all(&dir).await.unwrap();
902 + tokio::fs::write(dir.join("cargo_test.log"), b"hello sandod\n").await.unwrap();
903 +
904 + let state = state_with_logs_root(tmp.path().to_path_buf()).await;
905 + let app = router(state);
906 + let resp = app
907 + .oneshot(Request::builder().uri("/logs/0.9.5/cargo_test").body(Body::empty()).unwrap())
908 + .await
909 + .unwrap();
910 + assert_eq!(resp.status(), StatusCode::OK);
911 + assert_eq!(body_string(resp).await, "hello sandod\n");
912 + }
913 +
914 + #[tokio::test]
915 + async fn get_gate_log_404s_when_missing() {
916 + let tmp = tempfile::tempdir().unwrap();
917 + let state = state_with_logs_root(tmp.path().to_path_buf()).await;
918 + let app = router(state);
919 + let resp = app
920 + .oneshot(Request::builder().uri("/logs/0.9.5/cargo_test").body(Body::empty()).unwrap())
921 + .await
922 + .unwrap();
923 + assert_eq!(resp.status(), StatusCode::NOT_FOUND);
924 + }
925 +
926 + /// Path-traversal guard: a `..` segment must not escape logs_root.
927 + /// axum's `{name}` param already rejects a literal `/` in the value, but
928 + /// `..` as a whole segment is structurally valid and must be blocked at
929 + /// the handler.
930 + #[tokio::test]
931 + async fn get_gate_log_rejects_dotdot_segments() {
932 + let tmp = tempfile::tempdir().unwrap();
933 + let state = state_with_logs_root(tmp.path().to_path_buf()).await;
934 + let app = router(state);
935 + let resp = app
936 + .oneshot(Request::builder().uri("/logs/../etc/passwd").body(Body::empty()).unwrap())
937 + .await
938 + .unwrap();
939 + assert_eq!(resp.status(), StatusCode::NOT_FOUND);
940 + }
856 941 }
@@ -8,3 +8,4 @@ workdir = "/srv/sando/work"
8 8 release_root = "/srv/sando"
9 9 scratch_db_url = "postgres:///sando_scratch?host=/var/run/postgresql"
10 10 bin_names = ["makenotwork", "mnw-admin"]
11 + logs_root = "/srv/sando/logs"