Skip to main content

max / makenotwork

14.1 KB · 395 lines History Blame Raw
1 //! Build-run tracking: one `build_runs` row per `/rebuild`, updated as the
2 //! pipeline moves through its phases, terminating in passed/failed/aborted.
3 //!
4 //! This is the resource that makes Sando driveable headlessly. `/state` only
5 //! ever reflects the last *successful* deploy, so on a red pipeline a poller
6 //! of `/state` sees stale-green for the whole build (the 0.10.2 incident). A
7 //! `RunId` returned by `/rebuild` + `GET /runs/{id}` gives a non-TUI caller
8 //! one pollable resource tied to the build it triggered, carrying the phase,
9 //! the per-gate status, and — the highest-value bit — a `failure_summary`
10 //! (first compile error / first failed gate) so the cause is in the API, not
11 //! behind `sudo journalctl`.
12 //!
13 //! Terminal writes (`mark_passed`/`mark_failed`/`mark_aborted`) are guarded on
14 //! `result = 'building'`, so whichever site settles the run first wins: a
15 //! build-step compile error, the first red gate, or the task-level catch for
16 //! pre-build bails. Later writes are silent no-ops.
17
18 use crate::domain::{RunId, Version};
19 use anyhow::Result;
20 use chrono::Utc;
21 use serde::Serialize;
22 use sqlx::{Row, SqlitePool};
23
/// In-flight sub-state of a build run.
///
/// The database stores the phase as a plain string; this enum names the
/// values that go through `set_phase` so call sites can't typo them. The
/// initial `'queued'` and terminal `'done'` values of the same column are
/// written as SQL literals by `create` and the `mark_*` functions and have no
/// variant here.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Phase {
    Fetching,
    Compiling,
    Staging,
    Gating,
}

impl Phase {
    /// The exact string stored in `build_runs.phase` for this variant.
    ///
    /// `const` so the label can also be used in constant contexts.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Fetching => "fetching",
            Self::Compiling => "compiling",
            Self::Staging => "staging",
            Self::Gating => "gating",
        }
    }
}
44
45 /// Insert a fresh `building` run for `sha` and return its id.
46 pub async fn create(pool: &SqlitePool, sha: &str) -> Result<RunId> {
47 let id: i64 = sqlx::query_scalar(
48 "INSERT INTO build_runs (sha, phase, result, started_at)
49 VALUES (?, 'queued', 'building', ?) RETURNING id",
50 )
51 .bind(sha)
52 .bind(Utc::now().to_rfc3339())
53 .fetch_one(pool)
54 .await?;
55 Ok(RunId(id))
56 }
57
58 /// Advance the in-flight phase. No-op once the run is terminal so a late
59 /// phase write can't resurrect a finished row.
60 pub async fn set_phase(pool: &SqlitePool, run_id: RunId, phase: Phase) -> Result<()> {
61 sqlx::query("UPDATE build_runs SET phase = ? WHERE id = ? AND result = 'building'")
62 .bind(phase.as_str())
63 .bind(run_id.0)
64 .execute(pool)
65 .await?;
66 Ok(())
67 }
68
69 /// Record the version once it's been read from the worktree's Cargo.toml.
70 pub async fn set_version(pool: &SqlitePool, run_id: RunId, version: &Version) -> Result<()> {
71 sqlx::query("UPDATE build_runs SET version = ? WHERE id = ? AND result = 'building'")
72 .bind(version.to_string())
73 .bind(run_id.0)
74 .execute(pool)
75 .await?;
76 Ok(())
77 }
78
79 /// Settle the run green. First terminal write wins (guarded on `building`).
80 pub async fn mark_passed(pool: &SqlitePool, run_id: RunId) -> Result<()> {
81 sqlx::query(
82 "UPDATE build_runs SET result = 'passed', phase = 'done', finished_at = ?
83 WHERE id = ? AND result = 'building'",
84 )
85 .bind(Utc::now().to_rfc3339())
86 .bind(run_id.0)
87 .execute(pool)
88 .await?;
89 Ok(())
90 }
91
92 /// Settle the run red with a human-readable cause. First terminal write wins,
93 /// so the most specific failure (build compile error, first red gate) recorded
94 /// before the task-level catch is the one that sticks.
95 pub async fn mark_failed(pool: &SqlitePool, run_id: RunId, summary: &str) -> Result<()> {
96 // Bound the stored summary — it's a headline, not the log. The full output
97 // is at the gate's log_ref / journald.
98 let summary: String = summary.chars().take(600).collect();
99 sqlx::query(
100 "UPDATE build_runs SET result = 'failed', phase = 'done', failure_summary = ?, finished_at = ?
101 WHERE id = ? AND result = 'building'",
102 )
103 .bind(&summary)
104 .bind(Utc::now().to_rfc3339())
105 .bind(run_id.0)
106 .execute(pool)
107 .await?;
108 Ok(())
109 }
110
111 /// Settle the run as superseded by a newer `/rebuild`.
112 pub async fn mark_aborted(pool: &SqlitePool, run_id: RunId) -> Result<()> {
113 sqlx::query(
114 "UPDATE build_runs SET result = 'aborted', phase = 'done',
115 failure_summary = 'superseded by a newer /rebuild', finished_at = ?
116 WHERE id = ? AND result = 'building'",
117 )
118 .bind(Utc::now().to_rfc3339())
119 .bind(run_id.0)
120 .execute(pool)
121 .await?;
122 Ok(())
123 }
124
125 /// One gate's status within a run view.
126 #[derive(Debug, Serialize)]
127 pub struct RunGateView {
128 pub kind: String,
129 /// `'passed' | 'failed' | 'blocked'` or NULL while in-flight.
130 pub status: Option<String>,
131 /// Relative path under `cfg.logs_root` for the full byte stream.
132 pub log_ref: Option<String>,
133 }
134
135 /// The `GET /runs/{id}` payload.
136 #[derive(Debug, Serialize)]
137 pub struct RunView {
138 pub run_id: i64,
139 pub sha: String,
140 pub version: Option<String>,
141 pub phase: String,
142 /// `'building' | 'passed' | 'failed' | 'aborted'`.
143 pub result: String,
144 pub started_at: String,
145 pub finished_at: Option<String>,
146 /// Headline cause when `result = 'failed'`: first compile error or first
147 /// red gate. NULL otherwise.
148 pub failure_summary: Option<String>,
149 /// Gates run on the host tier for this run's version, latest row per kind.
150 /// Empty until the run reaches a version + the gating phase.
151 pub gates: Vec<RunGateView>,
152 }
153
154 /// Load a run plus its host-tier gate statuses. `None` if the id is unknown.
155 pub async fn get(pool: &SqlitePool, run_id: RunId) -> Result<Option<RunView>> {
156 let Some(row) = sqlx::query(
157 "SELECT id, sha, version, phase, result, started_at, finished_at, failure_summary
158 FROM build_runs WHERE id = ?",
159 )
160 .bind(run_id.0)
161 .fetch_optional(pool)
162 .await?
163 else {
164 return Ok(None);
165 };
166
167 let version: Option<String> = row.get("version");
168 // Gates are keyed by (tier, version); a build run drives the `host` tier.
169 // Latest row per gate_kind, matching `/state`'s per-tier query shape.
170 let gates: Vec<RunGateView> = if let Some(ver) = version.as_deref() {
171 sqlx::query(
172 "SELECT gate_kind, status, log_ref
173 FROM gate_runs g
174 WHERE tier = 'host' AND version = ?1
175 AND id = (SELECT MAX(id) FROM gate_runs
176 WHERE tier = 'host' AND version = ?1 AND gate_kind = g.gate_kind)
177 ORDER BY gate_kind",
178 )
179 .bind(ver)
180 .fetch_all(pool)
181 .await?
182 .into_iter()
183 .map(|gr| RunGateView {
184 kind: gr.get("gate_kind"),
185 status: gr.get("status"),
186 log_ref: gr.get("log_ref"),
187 })
188 .collect()
189 } else {
190 Vec::new()
191 };
192
193 Ok(Some(RunView {
194 run_id: row.get("id"),
195 sha: row.get("sha"),
196 version,
197 phase: row.get("phase"),
198 result: row.get("result"),
199 started_at: row.get("started_at"),
200 finished_at: row.get("finished_at"),
201 failure_summary: row.get("failure_summary"),
202 gates,
203 }))
204 }
205
206 /// Compact view of the latest build run for `/state`'s liveness line.
207 #[derive(Debug, Serialize)]
208 pub struct BuildSummary {
209 pub run_id: i64,
210 pub sha: String,
211 pub version: Option<String>,
212 pub phase: String,
213 pub result: String,
214 pub failure_summary: Option<String>,
215 /// Seconds from start to finish (or to now while building). Lets a
216 /// `/state` poller show "building <ver>, phase=<x>, elapsed Ns" instead of
217 /// a version frozen at the last success for the whole ~10-min build.
218 pub elapsed_s: i64,
219 }
220
221 /// The most recent build run, for `/state`. `None` until the first `/rebuild`.
222 pub async fn latest_summary(pool: &SqlitePool) -> Result<Option<BuildSummary>> {
223 let Some(row) = sqlx::query(
224 "SELECT id, sha, version, phase, result, failure_summary, started_at, finished_at
225 FROM build_runs ORDER BY id DESC LIMIT 1",
226 )
227 .fetch_optional(pool)
228 .await?
229 else {
230 return Ok(None);
231 };
232 let started_at: String = row.get("started_at");
233 let finished_at: Option<String> = row.get("finished_at");
234 Ok(Some(BuildSummary {
235 run_id: row.get("id"),
236 sha: row.get("sha"),
237 version: row.get("version"),
238 phase: row.get("phase"),
239 result: row.get("result"),
240 failure_summary: row.get("failure_summary"),
241 elapsed_s: elapsed_seconds(&started_at, finished_at.as_deref()),
242 }))
243 }
244
245 /// Seconds between an rfc3339 `started_at` and (`finished_at` or now), clamped
246 /// at 0. A parse failure yields 0 rather than erroring the whole `/state` call.
247 fn elapsed_seconds(started_at: &str, finished_at: Option<&str>) -> i64 {
248 let Ok(start) = chrono::DateTime::parse_from_rfc3339(started_at) else {
249 return 0;
250 };
251 let end = match finished_at {
252 Some(f) => chrono::DateTime::parse_from_rfc3339(f)
253 .map(|d| d.with_timezone(&Utc))
254 .unwrap_or_else(|_| Utc::now()),
255 None => Utc::now(),
256 };
257 (end - start.with_timezone(&Utc)).num_seconds().max(0)
258 }
259
260 /// The summary of the first failed gate for `version` on the host tier, if
261 /// any — used by the build pipeline to populate `failure_summary` when
262 /// `run_all` reports a red pipeline. Reads the typed `outcome_json` so the
263 /// stored headline matches what the TUI renders.
264 pub async fn first_failed_gate_summary(pool: &SqlitePool, version: &Version) -> Option<String> {
265 let row = sqlx::query(
266 "SELECT gate_kind, outcome_json FROM gate_runs
267 WHERE tier = 'host' AND version = ? AND status = 'failed'
268 ORDER BY id ASC LIMIT 1",
269 )
270 .bind(version.to_string())
271 .fetch_optional(pool)
272 .await
273 .ok()
274 .flatten()?;
275 let kind: String = row.get("gate_kind");
276 let outcome_json: Option<String> = row.get("outcome_json");
277 let summary = outcome_json
278 .and_then(|s| serde_json::from_str::<crate::outcome::GateOutcome>(&s).ok())
279 .map(|o| match o.status {
280 crate::outcome::GateStatus::Failed { failure } => failure.summary(),
281 other => format!("{:?}", other),
282 })
283 .unwrap_or_else(|| "gate failed".to_string());
284 Some(format!("{kind}: {summary}"))
285 }
286
#[cfg(test)]
mod tests {
    use super::*;
    use sqlx::sqlite::SqlitePoolOptions;

    /// Fresh in-memory database with the project's migrations applied.
    /// A single connection keeps every query on the same in-memory database.
    async fn pool() -> SqlitePool {
        let pool = SqlitePoolOptions::new()
            .max_connections(1)
            .connect("sqlite::memory:")
            .await
            .unwrap();
        crate::db::migrate(&pool).await.unwrap();
        pool
    }

    #[tokio::test]
    async fn create_then_get_roundtrips_building() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        let v = get(&pool, id).await.unwrap().expect("run exists");
        assert_eq!(v.sha, "abc1234");
        assert_eq!(v.result, "building");
        assert_eq!(v.phase, "queued");
        assert!(v.version.is_none());
        assert!(v.gates.is_empty());
        assert!(v.failure_summary.is_none());
    }

    #[tokio::test]
    async fn phase_and_version_advance_then_pass() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        set_phase(&pool, id, Phase::Compiling).await.unwrap();
        let ver: Version = "0.10.2".parse().unwrap();
        set_version(&pool, id, &ver).await.unwrap();
        mark_passed(&pool, id).await.unwrap();

        let v = get(&pool, id).await.unwrap().unwrap();
        assert_eq!(v.result, "passed");
        assert_eq!(v.phase, "done");
        assert_eq!(v.version.as_deref(), Some("0.10.2"));
        assert!(v.finished_at.is_some());
    }

    #[tokio::test]
    async fn first_terminal_write_wins() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        mark_failed(&pool, id, "error[E0063]: missing field user_pages_host")
            .await
            .unwrap();
        // A later pass attempt (e.g. the task catch racing a build-step error)
        // must not overwrite the recorded failure.
        mark_passed(&pool, id).await.unwrap();
        // And a second failure summary doesn't clobber the first.
        mark_failed(&pool, id, "something else").await.unwrap();

        let v = get(&pool, id).await.unwrap().unwrap();
        assert_eq!(v.result, "failed");
        assert_eq!(
            v.failure_summary.as_deref(),
            Some("error[E0063]: missing field user_pages_host")
        );
    }

    #[tokio::test]
    async fn aborted_run_is_terminal() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        mark_aborted(&pool, id).await.unwrap();
        // A failure reported after the abort must not replace it.
        mark_failed(&pool, id, "late failure").await.unwrap();

        let v = get(&pool, id).await.unwrap().unwrap();
        assert_eq!(v.result, "aborted");
        assert_eq!(v.phase, "done");
        assert_eq!(
            v.failure_summary.as_deref(),
            Some("superseded by a newer /rebuild")
        );
        assert!(v.finished_at.is_some());
    }

    #[tokio::test]
    async fn phase_write_after_terminal_is_noop() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        mark_passed(&pool, id).await.unwrap();
        set_phase(&pool, id, Phase::Gating).await.unwrap();
        let v = get(&pool, id).await.unwrap().unwrap();
        assert_eq!(v.phase, "done", "a late phase write must not move a finished run");
    }

    #[test]
    fn elapsed_seconds_uses_finished_when_present() {
        // Both timestamps present → exact span, no wall-clock dependency.
        let s = elapsed_seconds("2026-06-13T00:00:00Z", Some("2026-06-13T00:02:05Z"));
        assert_eq!(s, 125);
        // Unparseable start → 0, never a panic / negative.
        assert_eq!(elapsed_seconds("not-a-date", None), 0);
    }

    #[test]
    fn elapsed_seconds_clamps_negative_span_to_zero() {
        // A finish that precedes the start must not yield a negative duration.
        let s = elapsed_seconds("2026-06-13T00:02:05Z", Some("2026-06-13T00:00:00Z"));
        assert_eq!(s, 0);
    }

    #[test]
    fn elapsed_seconds_normalizes_offsets() {
        // 02:00:00+02:00 is 00:00:00Z, so the span to 00:00:30Z is 30 seconds.
        let s = elapsed_seconds("2026-06-13T02:00:00+02:00", Some("2026-06-13T00:00:30Z"));
        assert_eq!(s, 30);
    }

    #[tokio::test]
    async fn latest_summary_reports_most_recent_run() {
        let pool = pool().await;
        assert!(latest_summary(&pool).await.unwrap().is_none());
        let _old = create(&pool, "old1234").await.unwrap();
        let new = create(&pool, "new5678").await.unwrap();
        set_phase(&pool, new, Phase::Compiling).await.unwrap();
        let sum = latest_summary(&pool).await.unwrap().expect("a run exists");
        assert_eq!(sum.run_id, new.0);
        assert_eq!(sum.sha, "new5678");
        assert_eq!(sum.phase, "compiling");
        assert_eq!(sum.result, "building");
    }

    #[tokio::test]
    async fn get_unknown_id_is_none() {
        let pool = pool().await;
        assert!(get(&pool, RunId(999)).await.unwrap().is_none());
    }

    #[tokio::test]
    async fn failure_summary_is_bounded() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        mark_failed(&pool, id, &"x".repeat(5_000)).await.unwrap();
        let v = get(&pool, id).await.unwrap().unwrap();
        // Exactly the first 600 characters survive — not fewer, not more.
        assert_eq!(v.failure_summary.as_deref(), Some("x".repeat(600).as_str()));
    }

    #[tokio::test]
    async fn failure_summary_bound_counts_chars_not_bytes() {
        let pool = pool().await;
        let id = create(&pool, "abc1234").await.unwrap();
        // 'é' is two bytes in UTF-8; the bound is on characters, so the
        // stored headline is 600 chars (1200 bytes) and is never cut
        // mid-character.
        mark_failed(&pool, id, &"é".repeat(1_000)).await.unwrap();
        let v = get(&pool, id).await.unwrap().unwrap();
        let stored = v.failure_summary.expect("summary stored");
        assert_eq!(stored.chars().count(), 600);
        assert!(stored.chars().all(|c| c == 'é'));
    }
}
395