Skip to main content

max / makenotwork

Add backup verification task to PoM New backup_checks table (migration 9), filesystem scanner for pg_dump .sql.gz files, alert integration for stale/recovery transitions, configurable via target config. 129 tests pass. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-04-16 02:18 UTC
Commit: b4c4cf8be785333aef590a69a758d6db3fbbdfb5
Parent: 820e559
12 files changed, +509 insertions, -28 deletions
@@ -543,6 +543,70 @@ impl Alerter {
543 543 self.record_alert(&alert_key, AlertCategory::TestDurationDrift, None, None, Some(drift_message)).await;
544 544 }
545 545
546 + #[instrument(skip_all)]
547 + pub async fn send_backup_stale_alert(
548 + &self,
549 + target: &str,
550 + label: &str,
551 + database: &str,
552 + status: &str,
553 + age_hours: Option<i64>,
554 + ) {
555 + let alert_key = format!("backup:{target}:{database}");
556 + if self.is_within_cooldown(&alert_key).await {
557 + info!("alert cooldown active for {alert_key}, skipping");
558 + return;
559 + }
560 +
561 + let detail = match (status, age_hours) {
562 + ("stale", Some(hours)) => format!("last backup is {hours}h old"),
563 + ("missing", _) => "no backup files found".to_string(),
564 + ("error", _) => "backup check failed".to_string(),
565 + _ => format!("status: {status}"),
566 + };
567 +
568 + let subject = format!("[PoM] {label}: {database} backup {status}");
569 + let body = format!(
570 + "Target: {label} ({target})\n\
571 + Database: {database}\n\
572 + Status: {status}\n\
573 + Detail: {detail}\n\
574 + Instance: {}\n\
575 + Time: {}\n\n\
576 + - PoM",
577 + self.instance_name,
578 + chrono::Utc::now().to_rfc3339(),
579 + );
580 +
581 + self.send_email(&subject, &body).await;
582 + self.record_alert(&alert_key, AlertCategory::BackupStale, None, Some(status), None).await;
583 + }
584 +
585 + #[instrument(skip_all)]
586 + pub async fn send_backup_recovery(
587 + &self,
588 + target: &str,
589 + label: &str,
590 + database: &str,
591 + ) {
592 + // No cooldown on recovery — always send
593 + let alert_key = format!("backup:{target}:{database}");
594 + let subject = format!("[PoM] {label}: {database} backup recovered");
595 + let body = format!(
596 + "Target: {label} ({target})\n\
597 + Database: {database}\n\
598 + Backup is now current.\n\
599 + Instance: {}\n\
600 + Time: {}\n\n\
601 + - PoM",
602 + self.instance_name,
603 + chrono::Utc::now().to_rfc3339(),
604 + );
605 +
606 + self.send_email(&subject, &body).await;
607 + self.record_alert(&alert_key, AlertCategory::BackupRecovery, None, Some("ok"), None).await;
608 + }
609 +
546 610 /// All monitored targets are unreachable — likely a network issue with PoM itself.
547 611 #[instrument(skip_all)]
548 612 pub async fn send_monitoring_offline_alert(&self, target_count: usize) {
@@ -0,0 +1,184 @@
1 + //! Local filesystem backup verification — scan for PostgreSQL backup files
2 + //! and check their recency.
3 +
4 + use std::path::Path;
5 +
6 + use tracing::instrument;
7 +
8 + use crate::types::BackupCheckResult;
9 +
10 + /// Scan `directory` for backup files matching `{database}_*.sql.gz`, find the
11 + /// newest one by modification time, and return a status based on its age.
12 + ///
13 + /// Returns "ok" if the newest backup is younger than `max_age_hours`, "stale"
14 + /// if older, "missing" if no matching files exist, or "error" on I/O failures.
15 + #[instrument(skip_all)]
16 + pub fn check_backup(
17 + target_name: &str,
18 + directory: &str,
19 + database: &str,
20 + max_age_hours: u64,
21 + ) -> BackupCheckResult {
22 + let checked_at = chrono::Utc::now().to_rfc3339();
23 + let dir = Path::new(directory);
24 +
25 + if !dir.exists() {
26 + return BackupCheckResult {
27 + target: target_name.to_string(),
28 + database_name: database.to_string(),
29 + status: "error".to_string(),
30 + last_backup_at: None,
31 + size_bytes: None,
32 + age_hours: None,
33 + checked_at,
34 + error: Some(format!("backup directory does not exist: {directory}")),
35 + };
36 + }
37 +
38 + let prefix = format!("{database}_");
39 + let suffix = ".sql.gz";
40 +
41 + let entries = match std::fs::read_dir(dir) {
42 + Ok(entries) => entries,
43 + Err(e) => {
44 + return BackupCheckResult {
45 + target: target_name.to_string(),
46 + database_name: database.to_string(),
47 + status: "error".to_string(),
48 + last_backup_at: None,
49 + size_bytes: None,
50 + age_hours: None,
51 + checked_at,
52 + error: Some(format!("failed to read backup directory: {e}")),
53 + };
54 + }
55 + };
56 +
57 + let mut newest: Option<(std::time::SystemTime, u64, String)> = None;
58 +
59 + for entry in entries.flatten() {
60 + let name = entry.file_name();
61 + let name_str = name.to_string_lossy();
62 + if !name_str.starts_with(&prefix) || !name_str.ends_with(suffix) {
63 + continue;
64 + }
65 +
66 + let metadata = match entry.metadata() {
67 + Ok(m) => m,
68 + Err(_) => continue,
69 + };
70 +
71 + if !metadata.is_file() {
72 + continue;
73 + }
74 +
75 + let modified = match metadata.modified() {
76 + Ok(t) => t,
77 + Err(_) => continue,
78 + };
79 +
80 + let size = metadata.len();
81 +
82 + if newest.as_ref().is_none_or(|(best_time, _, _)| modified > *best_time) {
83 + newest = Some((modified, size, name_str.into_owned()));
84 + }
85 + }
86 +
87 + let Some((modified_time, size, _filename)) = newest else {
88 + return BackupCheckResult {
89 + target: target_name.to_string(),
90 + database_name: database.to_string(),
91 + status: "missing".to_string(),
92 + last_backup_at: None,
93 + size_bytes: None,
94 + age_hours: None,
95 + checked_at,
96 + error: None,
97 + };
98 + };
99 +
100 + let modified_chrono = chrono::DateTime::<chrono::Utc>::from(modified_time);
101 + let age = chrono::Utc::now().signed_duration_since(modified_chrono);
102 + let age_hours = age.num_hours();
103 +
104 + let status = if age_hours < max_age_hours as i64 {
105 + "ok"
106 + } else {
107 + "stale"
108 + };
109 +
110 + BackupCheckResult {
111 + target: target_name.to_string(),
112 + database_name: database.to_string(),
113 + status: status.to_string(),
114 + last_backup_at: Some(modified_chrono.to_rfc3339()),
115 + size_bytes: Some(size as i64),
116 + age_hours: Some(age_hours),
117 + checked_at,
118 + error: None,
119 + }
120 + }
121 +
122 + #[cfg(test)]
123 + mod tests {
124 + use super::*;
125 +
126 + #[test]
127 + fn missing_directory_returns_error() {
128 + let result = check_backup("test", "/nonexistent/path/123", "testdb", 25);
129 + assert_eq!(result.status, "error");
130 + assert!(result.error.as_ref().unwrap().contains("does not exist"));
131 + }
132 +
133 + #[test]
134 + fn empty_directory_returns_missing() {
135 + let dir = std::env::temp_dir().join("pom_backup_test_empty");
136 + let _ = std::fs::create_dir_all(&dir);
137 + let result = check_backup("test", dir.to_str().unwrap(), "testdb", 25);
138 + assert_eq!(result.status, "missing");
139 + assert!(result.error.is_none());
140 + let _ = std::fs::remove_dir_all(&dir);
141 + }
142 +
143 + #[test]
144 + fn recent_backup_returns_ok() {
145 + let dir = std::env::temp_dir().join("pom_backup_test_ok");
146 + let _ = std::fs::create_dir_all(&dir);
147 + let file = dir.join("mydb_2026-04-15.sql.gz");
148 + std::fs::write(&file, b"fake backup").unwrap();
149 +
150 + let result = check_backup("test", dir.to_str().unwrap(), "mydb", 25);
151 + assert_eq!(result.status, "ok");
152 + assert!(result.last_backup_at.is_some());
153 + assert!(result.size_bytes.unwrap() > 0);
154 + assert_eq!(result.age_hours.unwrap(), 0);
155 +
156 + let _ = std::fs::remove_dir_all(&dir);
157 + }
158 +
159 + #[test]
160 + fn wrong_prefix_not_matched() {
161 + let dir = std::env::temp_dir().join("pom_backup_test_prefix");
162 + let _ = std::fs::create_dir_all(&dir);
163 + let file = dir.join("otherdb_2026-04-15.sql.gz");
164 + std::fs::write(&file, b"wrong db").unwrap();
165 +
166 + let result = check_backup("test", dir.to_str().unwrap(), "mydb", 25);
167 + assert_eq!(result.status, "missing");
168 +
169 + let _ = std::fs::remove_dir_all(&dir);
170 + }
171 +
172 + #[test]
173 + fn wrong_suffix_not_matched() {
174 + let dir = std::env::temp_dir().join("pom_backup_test_suffix");
175 + let _ = std::fs::create_dir_all(&dir);
176 + let file = dir.join("mydb_2026-04-15.sql");
177 + std::fs::write(&file, b"wrong suffix").unwrap();
178 +
179 + let result = check_backup("test", dir.to_str().unwrap(), "mydb", 25);
180 + assert_eq!(result.status, "missing");
181 +
182 + let _ = std::fs::remove_dir_all(&dir);
183 + }
184 + }
@@ -1,3 +1,4 @@
1 + pub mod backup;
1 2 pub mod cors;
2 3 pub mod dns;
3 4 pub mod http;
@@ -62,6 +62,7 @@ pub(crate) async fn cmd_serve(
62 62 handles.extend(tasks::spawn_dns_tasks(config, pool, &token, &alerter));
63 63 handles.extend(tasks::spawn_whois_tasks(config, pool, &token, &alerter));
64 64 handles.extend(tasks::spawn_cors_tasks(config, pool, &token, &alerter));
65 + handles.extend(tasks::spawn_backup_tasks(config, pool, &token, &alerter));
65 66 handles.push(tasks::spawn_prune_task(pool, prune_days, &token));
66 67
67 68 // Spawn peer heartbeat tasks
@@ -0,0 +1,98 @@
1 + use tokio::task::JoinHandle;
2 + use tracing::info;
3 +
4 + use pom::alerts::Alerter;
5 + use pom::checks::backup;
6 + use pom::config::Config;
7 + use pom::db;
8 +
9 + pub(crate) fn spawn_backup_tasks(
10 + config: &Config,
11 + pool: &sqlx::SqlitePool,
12 + cancel: &tokio_util::sync::CancellationToken,
13 + alerter: &Option<Alerter>,
14 + ) -> Vec<JoinHandle<()>> {
15 + let mut handles = Vec::new();
16 +
17 + for name in config.target_names() {
18 + let target_config = config.get_target(&name).unwrap().clone();
19 + if let Some(backup_config) = target_config.backups {
20 + let pool = pool.clone();
21 + let name = name.clone();
22 + let label = target_config.label.clone();
23 + let alerter = alerter.clone();
24 + let cancel = cancel.clone();
25 + let interval_secs = backup_config.interval_secs;
26 +
27 + info!(
28 + "{name}: backup check every {interval_secs}s (dir={}, databases={:?})",
29 + backup_config.directory, backup_config.databases
30 + );
31 +
32 + handles.push(tokio::spawn(async move {
33 + let mut interval = tokio::time::interval(
34 + std::time::Duration::from_secs(interval_secs),
35 + );
36 + interval.tick().await; // consume immediate first tick
37 + loop {
38 + tokio::select! {
39 + _ = cancel.cancelled() => break,
40 + _ = interval.tick() => {}
41 + }
42 + for database in &backup_config.databases {
43 + let result = backup::check_backup(
44 + &name,
45 + &backup_config.directory,
46 + database,
47 + backup_config.max_age_hours,
48 + );
49 + info!(
50 + "{}: backup {} — {} (age: {}h)",
51 + name,
52 + database,
53 + result.status,
54 + result.age_hours.unwrap_or(-1),
55 + );
56 +
57 + // Check for status transitions before storing
58 + let previous = db::get_latest_backup_check(&pool, &name, database)
59 + .await
60 + .ok()
61 + .flatten();
62 +
63 + if let Err(e) = db::insert_backup_check(&pool, &result).await {
64 + tracing::error!("{name}: failed to store backup check for {database}: {e}");
65 + }
66 +
67 + // Fire alerts on status transitions
68 + if let Some(ref alerter) = alerter {
69 + let prev_status = previous.as_ref().map(|p| p.status.as_str());
70 + let was_ok = prev_status.is_none_or(|s| s == "ok");
71 + let now_ok = result.status == "ok";
72 +
73 + if was_ok && !now_ok {
74 + // Transitioned to a bad state
75 + alerter.send_backup_stale_alert(
76 + &name,
77 + &label,
78 + database,
79 + &result.status,
80 + result.age_hours,
81 + ).await;
82 + } else if !was_ok && now_ok {
83 + // Recovered
84 + alerter.send_backup_recovery(
85 + &name,
86 + &label,
87 + database,
88 + ).await;
89 + }
90 + }
91 + }
92 + }
93 + }));
94 + }
95 + }
96 +
97 + handles
98 + }
@@ -1,3 +1,4 @@
1 + mod backup;
1 2 mod cors;
2 3 mod dns;
3 4 mod health;
@@ -7,6 +8,7 @@ mod routes;
7 8 mod tls;
8 9 mod whois;
9 10
11 + pub(crate) use backup::spawn_backup_tasks;
10 12 pub(crate) use cors::spawn_cors_tasks;
11 13 pub(crate) use dns::spawn_dns_tasks;
12 14 pub(crate) use health::spawn_health_tasks;
@@ -22,7 +22,7 @@ pub(crate) fn spawn_prune_task(
22 22 _ = interval.tick() => {}
23 23 }
24 24 match db::prune_old_records(&pool, prune_days).await {
25 - Ok(r) => info!("Pruned {} health checks, {} test runs, {} test details, {} peer heartbeats, {} alerts, {} TLS checks, {} incidents, {} route checks, {} DNS checks, {} WHOIS checks", r.health, r.tests, r.test_details, r.heartbeats, r.alerts, r.tls, r.incidents, r.routes, r.dns, r.whois),
25 + Ok(r) => info!("Pruned {} health checks, {} test runs, {} test details, {} peer heartbeats, {} alerts, {} TLS checks, {} incidents, {} route checks, {} DNS checks, {} WHOIS checks, {} backup checks", r.health, r.tests, r.test_details, r.heartbeats, r.alerts, r.tls, r.incidents, r.routes, r.dns, r.whois, r.backups),
26 26 Err(e) => tracing::error!("Prune failed: {e}"),
27 27 }
28 28 }
@@ -175,6 +175,8 @@ pub struct TargetConfig {
175 175 /// CORS preflight checks. Empty disables CORS checks.
176 176 #[serde(default)]
177 177 pub cors: Vec<CorsCheck>,
178 + /// Local filesystem backup verification. `None` disables backup checks.
179 + pub backups: Option<BackupConfig>,
178 180 }
179 181
180 182 #[derive(Debug, Clone, Deserialize)]
@@ -216,6 +218,30 @@ fn default_cors_method() -> String {
216 218 }
217 219
218 220 #[derive(Debug, Clone, Deserialize)]
221 + pub struct BackupConfig {
222 + /// Filesystem directory containing backup files (e.g. "/opt/backups/postgres").
223 + pub directory: String,
224 + /// Database names to check for backups (e.g. ["makenotwork", "multithreaded"]).
225 + pub databases: Vec<String>,
226 + /// Maximum age in hours before a backup is considered stale.
227 + #[serde(default = "default_max_age_hours")]
228 + pub max_age_hours: u64,
229 + /// Seconds between backup verification checks.
230 + #[serde(default = "default_backup_interval")]
231 + pub interval_secs: u64,
232 + }
233 +
234 + fn default_max_age_hours() -> u64 {
235 + // 25 hours: allows for some cron drift from the daily 03:00 UTC schedule
236 + 25
237 + }
238 +
239 + fn default_backup_interval() -> u64 {
240 + // 1 hour: backups are daily, hourly checks are sufficient
241 + 3600
242 + }
243 +
244 + #[derive(Debug, Clone, Deserialize)]
219 245 pub struct TlsConfig {
220 246 /// Hostname to connect to for the TLS check.
221 247 pub host: String,
M pom/src/db.rs +77 -2
@@ -11,7 +11,7 @@ use std::str::FromStr;
11 11 use tracing::{info, instrument};
12 12
13 13 use crate::error::Result;
14 - use crate::types::{CorsCheckResult, DnsCheckResult, HealthDetails, HealthSnapshot, HealthStatus, TestDetail, TestRun, TestRunId, TestSummary, TlsStatus, WhoisResult};
14 + use crate::types::{BackupCheckResult, CorsCheckResult, DnsCheckResult, HealthDetails, HealthSnapshot, HealthStatus, TestDetail, TestRun, TestRunId, TestSummary, TlsStatus, WhoisResult};
15 15
16 16 /// Each migration is a (version, description, SQL) tuple. Versions start at 1.
17 17 /// The SQL may contain multiple statements separated by semicolons.
@@ -160,6 +160,20 @@ const MIGRATIONS: &[(i64, &str, &str)] = &[
160 160 );
161 161 CREATE INDEX IF NOT EXISTS idx_cors_target_id ON cors_checks(target, id DESC);
162 162 "#),
163 + (9, "add backup_checks table", r#"
164 + CREATE TABLE IF NOT EXISTS backup_checks (
165 + id INTEGER PRIMARY KEY AUTOINCREMENT,
166 + target TEXT NOT NULL,
167 + database_name TEXT NOT NULL,
168 + status TEXT NOT NULL,
169 + last_backup_at TEXT,
170 + size_bytes INTEGER,
171 + age_hours INTEGER,
172 + checked_at TEXT NOT NULL,
173 + error TEXT
174 + );
175 + CREATE INDEX IF NOT EXISTS idx_backup_checks_target ON backup_checks(target, id DESC);
176 + "#),
163 177 ];
164 178
165 179 #[instrument(skip_all)]
@@ -1033,6 +1047,60 @@ pub async fn get_latest_cors_checks(
1033 1047 .collect())
1034 1048 }
1035 1049
1050 + // --- Backup check queries ---
1051 +
1052 + #[instrument(skip_all)]
1053 + pub async fn insert_backup_check(
1054 + pool: &SqlitePool,
1055 + result: &BackupCheckResult,
1056 + ) -> Result<i64> {
1057 + let row = sqlx::query(
1058 + "INSERT INTO backup_checks (target, database_name, status, last_backup_at, size_bytes, age_hours, checked_at, error)
1059 + VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
1060 + )
1061 + .bind(&result.target)
1062 + .bind(&result.database_name)
1063 + .bind(&result.status)
1064 + .bind(&result.last_backup_at)
1065 + .bind(result.size_bytes)
1066 + .bind(result.age_hours)
1067 + .bind(&result.checked_at)
1068 + .bind(&result.error)
1069 + .execute(pool)
1070 + .await?;
1071 + Ok(row.last_insert_rowid())
1072 + }
1073 +
1074 + /// Get the latest backup check for a specific target and database.
1075 + #[instrument(skip_all)]
1076 + pub async fn get_latest_backup_check(
1077 + pool: &SqlitePool,
1078 + target: &str,
1079 + database_name: &str,
1080 + ) -> Result<Option<BackupCheckRow>> {
1081 + Ok(sqlx::query_as::<_, BackupCheckRow>(
1082 + "SELECT id, target, database_name, status, last_backup_at, size_bytes, age_hours, checked_at, error
1083 + FROM backup_checks WHERE target = ? AND database_name = ? ORDER BY id DESC LIMIT 1",
1084 + )
1085 + .bind(target)
1086 + .bind(database_name)
1087 + .fetch_optional(pool)
1088 + .await?)
1089 + }
1090 +
1091 + #[derive(Debug, Clone, sqlx::FromRow, serde::Serialize)]
1092 + pub struct BackupCheckRow {
1093 + pub id: i64,
1094 + pub target: String,
1095 + pub database_name: String,
1096 + pub status: String,
1097 + pub last_backup_at: Option<String>,
1098 + pub size_bytes: Option<i64>,
1099 + pub age_hours: Option<i64>,
1100 + pub checked_at: String,
1101 + pub error: Option<String>,
1102 + }
1103 +
1036 1104 // --- Stale data cleanup ---
1037 1105
1038 1106 /// Delete route_checks for paths no longer in the config.
@@ -1112,6 +1180,7 @@ pub struct PruneResult {
1112 1180 pub routes: u64,
1113 1181 pub dns: u64,
1114 1182 pub whois: u64,
1183 + pub backups: u64,
1115 1184 }
1116 1185
1117 1186 /// Delete records older than `days` from all tables.
@@ -1124,7 +1193,7 @@ pub async fn prune_old_records(
1124 1193 // Guard: days <= 0 would set cutoff to now (or the future), deleting
1125 1194 // everything. Treat this as a no-op instead.
1126 1195 if days <= 0 {
1127 - return Ok(PruneResult { health: 0, tests: 0, test_details: 0, heartbeats: 0, alerts: 0, tls: 0, incidents: 0, routes: 0, dns: 0, whois: 0 });
1196 + return Ok(PruneResult { health: 0, tests: 0, test_details: 0, heartbeats: 0, alerts: 0, tls: 0, incidents: 0, routes: 0, dns: 0, whois: 0, backups: 0 });
1128 1197 }
1129 1198
1130 1199 let cutoff = chrono::Utc::now() - chrono::Duration::days(days);
@@ -1182,6 +1251,11 @@ pub async fn prune_old_records(
1182 1251 .execute(pool)
1183 1252 .await?;
1184 1253
1254 + let backups_result = sqlx::query("DELETE FROM backup_checks WHERE checked_at < ?")
1255 + .bind(&cutoff_str)
1256 + .execute(pool)
1257 + .await?;
1258 +
1185 1259 Ok(PruneResult {
1186 1260 health: health_result.rows_affected(),
1187 1261 tests: test_result.rows_affected(),
@@ -1193,6 +1267,7 @@ pub async fn prune_old_records(
1193 1267 routes: routes_result.rows_affected(),
1194 1268 dns: dns_result.rows_affected(),
1195 1269 whois: whois_result.rows_affected(),
1270 + backups: backups_result.rows_affected(),
1196 1271 })
1197 1272 }
1198 1273
@@ -286,9 +286,9 @@ pub fn format_dns_results(dns_results: &[DnsCheckResult], whois_results: &[Whois
286 286 /// Format prune results for CLI display.
287 287 pub fn format_prune(result: &PruneResult, days: i64) -> String {
288 288 format!(
289 - "Pruned {} health checks, {} test runs, {} test details, {} peer heartbeats, {} alerts, {} TLS checks, {} incidents, {} route checks, {} DNS checks, {} WHOIS checks older than {} days.\n",
289 + "Pruned {} health checks, {} test runs, {} test details, {} peer heartbeats, {} alerts, {} TLS checks, {} incidents, {} route checks, {} DNS checks, {} WHOIS checks, {} backup checks older than {} days.\n",
290 290 result.health, result.tests, result.test_details, result.heartbeats, result.alerts, result.tls,
291 - result.incidents, result.routes, result.dns, result.whois, days
291 + result.incidents, result.routes, result.dns, result.whois, result.backups, days
292 292 )
293 293 }
294 294
@@ -876,12 +876,12 @@ mod tests {
876 876 fn prune_formatting() {
877 877 let result = PruneResult {
878 878 health: 5, tests: 3, test_details: 15, heartbeats: 10, alerts: 2, tls: 1,
879 - incidents: 4, routes: 0, dns: 8, whois: 2,
879 + incidents: 4, routes: 0, dns: 8, whois: 2, backups: 1,
880 880 };
881 881 let out = format_prune(&result, 30);
882 882 assert_eq!(
883 883 out,
884 - "Pruned 5 health checks, 3 test runs, 15 test details, 10 peer heartbeats, 2 alerts, 1 TLS checks, 4 incidents, 0 route checks, 8 DNS checks, 2 WHOIS checks older than 30 days.\n"
884 + "Pruned 5 health checks, 3 test runs, 15 test details, 10 peer heartbeats, 2 alerts, 1 TLS checks, 4 incidents, 0 route checks, 8 DNS checks, 2 WHOIS checks, 1 backup checks older than 30 days.\n"
885 885 );
886 886 }
887 887
@@ -889,10 +889,10 @@ mod tests {
889 889 fn prune_zero_records() {
890 890 let result = PruneResult {
891 891 health: 0, tests: 0, test_details: 0, heartbeats: 0, alerts: 0, tls: 0,
892 - incidents: 0, routes: 0, dns: 0, whois: 0,
892 + incidents: 0, routes: 0, dns: 0, whois: 0, backups: 0,
893 893 };
894 894 let out = format_prune(&result, 7);
895 - assert!(out.contains("Pruned 0 health checks, 0 test runs, 0 test details, 0 peer heartbeats, 0 alerts, 0 TLS checks, 0 incidents, 0 route checks, 0 DNS checks, 0 WHOIS checks older than 7 days."));
895 + assert!(out.contains("Pruned 0 health checks, 0 test runs, 0 test details, 0 peer heartbeats, 0 alerts, 0 TLS checks, 0 incidents, 0 route checks, 0 DNS checks, 0 WHOIS checks, 0 backup checks older than 7 days."));
896 896 }
897 897
898 898 // --- format_mesh ---
@@ -27,6 +27,8 @@ pub enum AlertCategory {
27 27 TestDurationDrift,
28 28 CorsFailure,
29 29 CorsRecovery,
30 + BackupStale,
31 + BackupRecovery,
30 32 MonitoringOffline,
31 33 MonitoringRecovery,
32 34 }
@@ -52,6 +54,8 @@ impl fmt::Display for AlertCategory {
52 54 Self::TestDurationDrift => write!(f, "test_duration_drift"),
53 55 Self::CorsFailure => write!(f, "cors_failure"),
54 56 Self::CorsRecovery => write!(f, "cors_recovery"),
57 + Self::BackupStale => write!(f, "backup_stale"),
58 + Self::BackupRecovery => write!(f, "backup_recovery"),
55 59 Self::MonitoringOffline => write!(f, "monitoring_offline"),
56 60 Self::MonitoringRecovery => write!(f, "monitoring_recovery"),
57 61 }
@@ -80,6 +84,8 @@ impl std::str::FromStr for AlertCategory {
80 84 "test_duration_drift" => Ok(Self::TestDurationDrift),
81 85 "cors_failure" => Ok(Self::CorsFailure),
82 86 "cors_recovery" => Ok(Self::CorsRecovery),
87 + "backup_stale" => Ok(Self::BackupStale),
88 + "backup_recovery" => Ok(Self::BackupRecovery),
83 89 "monitoring_offline" => Ok(Self::MonitoringOffline),
84 90 "monitoring_recovery" => Ok(Self::MonitoringRecovery),
85 91 other => Err(format!("unknown alert category: {other}")),
@@ -407,6 +413,26 @@ pub struct CorsCheckResult {
407 413 pub error: Option<String>,
408 414 }
409 415
416 + #[derive(Debug, Clone, Serialize, Deserialize)]
417 + pub struct BackupCheckResult {
418 + /// Config key identifying the monitored target.
419 + pub target: String,
420 + /// Name of the PostgreSQL database (e.g. "makenotwork").
421 + pub database_name: String,
422 + /// Check outcome: "ok", "stale", "missing", or "error".
423 + pub status: String,
424 + /// Modification time of the most recent backup file, in RFC 3339 format.
425 + pub last_backup_at: Option<String>,
426 + /// Size of the most recent backup file in bytes.
427 + pub size_bytes: Option<i64>,
428 + /// Age of the most recent backup in whole hours.
429 + pub age_hours: Option<i64>,
430 + /// When this check was performed, in RFC 3339 format (UTC).
431 + pub checked_at: String,
432 + /// Error message if the filesystem scan failed.
433 + pub error: Option<String>,
434 + }
435 +
410 436 impl LatencyStats {
411 437 /// Compute latency statistics from a slice of response times.
412 438 /// Returns `None` if the slice is empty.
@@ -636,6 +662,8 @@ mod tests {
636 662 AlertCategory::TestDurationDrift,
637 663 AlertCategory::CorsFailure,
638 664 AlertCategory::CorsRecovery,
665 + AlertCategory::BackupStale,
666 + AlertCategory::BackupRecovery,
639 667 AlertCategory::MonitoringOffline,
640 668 AlertCategory::MonitoringRecovery,
641 669 ] {
@@ -113,7 +113,7 @@ async fn test_run_insert_and_query() {
113 113 };
114 114
115 115 let id = db::insert_test_run(&pool, &run).await.unwrap();
116 - assert!(id > 0);
116 + assert!(id.0 > 0);
117 117
118 118 let latest = db::get_latest_test_run(&pool, "mnw").await.unwrap();
119 119 assert!(latest.is_some());
@@ -376,7 +376,7 @@ async fn migration_fresh_db_reaches_latest_version() {
376 376 // A fresh in-memory DB should run all migrations and reach the latest version.
377 377 let pool = db::connect_in_memory().await.unwrap();
378 378 let version = db::get_schema_version(&pool).await.unwrap();
379 - assert_eq!(version, 8);
379 + assert_eq!(version, 9);
380 380
381 381 // Verify the schema_version table has entries for each migration
382 382 let rows = sqlx::query_as::<_, (i64, String)>(
@@ -385,7 +385,7 @@ async fn migration_fresh_db_reaches_latest_version() {
385 385 .fetch_all(&pool)
386 386 .await
387 387 .unwrap();
388 - assert_eq!(rows.len(), 8);
388 + assert_eq!(rows.len(), 9);
389 389 assert_eq!(rows[0].0, 1);
390 390 assert_eq!(rows[0].1, "initial schema");
391 391 assert_eq!(rows[1].0, 2);
@@ -402,6 +402,8 @@ async fn migration_fresh_db_reaches_latest_version() {
402 402 assert_eq!(rows[6].1, "add test_details table");
403 403 assert_eq!(rows[7].0, 8);
404 404 assert_eq!(rows[7].1, "add cors_checks table");
405 + assert_eq!(rows[8].0, 9);
406 + assert_eq!(rows[8].1, "add backup_checks table");
405 407
406 408 // Verify actual tables were created by inserting data
407 409 let snapshot = HealthSnapshot {
@@ -421,18 +423,18 @@ async fn migration_fresh_db_reaches_latest_version() {
421 423 async fn migration_already_current_is_idempotent() {
422 424 // Running migrations on an already-migrated DB should be a no-op.
423 425 let pool = db::connect_in_memory().await.unwrap();
424 - assert_eq!(db::get_schema_version(&pool).await.unwrap(), 8);
426 + assert_eq!(db::get_schema_version(&pool).await.unwrap(), 9);
425 427
426 428 // Run migrations again
427 429 db::run_migrations(&pool).await.unwrap();
428 - assert_eq!(db::get_schema_version(&pool).await.unwrap(), 8);
430 + assert_eq!(db::get_schema_version(&pool).await.unwrap(), 9);
429 431
430 - // schema_version should still have exactly eight entries (not duplicated)
432 + // schema_version should still have exactly nine entries (not duplicated)
431 433 let count = sqlx::query_as::<_, (i64,)>("SELECT COUNT(*) FROM schema_version")
432 434 .fetch_one(&pool)
433 435 .await
434 436 .unwrap();
435 - assert_eq!(count.0, 8);
437 + assert_eq!(count.0, 9);
436 438 }
437 439
438 440 #[tokio::test]
@@ -491,8 +493,8 @@ async fn migration_detects_pre_migration_database() {
491 493 // Now run migrations — should detect existing tables, stamp as v1, then run v2+v3+v4+v5+v6
492 494 db::run_migrations(&pool).await.unwrap();
493 495
494 - // Version should be 6 (stamped v1 + ran v2 + ran v3 + ran v4 + ran v5 + ran v6)
495 - assert_eq!(db::get_schema_version(&pool).await.unwrap(), 8);
496 + // Version should be 9 (stamped v1 + ran v2..v9)
497 + assert_eq!(db::get_schema_version(&pool).await.unwrap(), 9);
496 498
497 499 // Description should indicate pre-existing
498 500 let row = sqlx::query_as::<_, (String,)>(
@@ -793,7 +795,7 @@ async fn tool_run_tests_no_test_config() {
793 795 async fn migration_v2_creates_alerts_table() {
794 796 let pool = db::connect_in_memory().await.unwrap();
795 797 let version = db::get_schema_version(&pool).await.unwrap();
796 - assert_eq!(version, 8);
798 + assert_eq!(version, 9);
797 799
798 800 // Verify alerts table exists by inserting
799 801 let id = db::insert_alert(&pool, "mnw", "health", Some("operational"), Some("error"), None)
@@ -861,7 +863,7 @@ async fn prune_removes_old_alerts() {
861 863 async fn migration_v3_creates_tls_checks_table() {
862 864 let pool = db::connect_in_memory().await.unwrap();
863 865 let version = db::get_schema_version(&pool).await.unwrap();
864 - assert_eq!(version, 8);
866 + assert_eq!(version, 9);
865 867
866 868 // Verify tls_checks table exists by inserting
867 869 let status = pom::types::TlsStatus {
@@ -1076,7 +1078,7 @@ cooldown_secs = 120
1076 1078 async fn migration_v4_creates_incidents_table() {
1077 1079 let pool = db::connect_in_memory().await.unwrap();
1078 1080 let version = db::get_schema_version(&pool).await.unwrap();
1079 - assert_eq!(version, 8);
1081 + assert_eq!(version, 9);
1080 1082
1081 1083 // Verify incidents table exists by inserting
1082 1084 let id = db::insert_incident(&pool, "mnw", "operational", "degraded")
@@ -1205,7 +1207,7 @@ async fn api_status_no_incidents_omits_fields() {
1205 1207 async fn migration_v5_creates_route_checks_table() {
1206 1208 let pool = db::connect_in_memory().await.unwrap();
1207 1209 let version = db::get_schema_version(&pool).await.unwrap();
1208 - assert_eq!(version, 8);
1210 + assert_eq!(version, 9);
1209 1211
1210 1212 // Verify route_checks table exists by inserting
1211 1213 let result = pom::checks::routes::RouteCheckResult {
@@ -2205,7 +2207,7 @@ async fn peer_uuid_mismatch_updates_db_identity() {
2205 2207 async fn migration_v6_creates_dns_and_whois_tables() {
2206 2208 let pool = db::connect_in_memory().await.unwrap();
2207 2209 let version = db::get_schema_version(&pool).await.unwrap();
2208 - assert_eq!(version, 8);
2210 + assert_eq!(version, 9);
2209 2211
2210 2212 // Verify dns_checks table exists
2211 2213 let dns_result = DnsCheckResult {
@@ -3195,7 +3197,7 @@ async fn prune_cascades_test_details_with_deleted_run() {
3195 3197
3196 3198 // Verify details exist
3197 3199 let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM test_details WHERE run_id = ?")
3198 - .bind(run_id)
3200 + .bind(run_id.0)
3199 3201 .fetch_one(&pool)
3200 3202 .await
3201 3203 .unwrap();
@@ -3207,7 +3209,7 @@ async fn prune_cascades_test_details_with_deleted_run() {
3207 3209
3208 3210 // Verify details are gone (removed by ON DELETE CASCADE)
3209 3211 let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM test_details WHERE run_id = ?")
3210 - .bind(run_id)
3212 + .bind(run_id.0)
3211 3213 .fetch_one(&pool)
3212 3214 .await
3213 3215 .unwrap();
@@ -3243,7 +3245,7 @@ async fn prune_cleans_up_orphaned_test_details() {
3243 3245 // Disable FK enforcement temporarily to delete the run without cascading
3244 3246 sqlx::query("PRAGMA foreign_keys = OFF").execute(&pool).await.unwrap();
3245 3247 sqlx::query("DELETE FROM test_runs WHERE id = ?")
3246 - .bind(run_id)
3248 + .bind(run_id.0)
3247 3249 .execute(&pool)
3248 3250 .await
3249 3251 .unwrap();
@@ -3251,7 +3253,7 @@ async fn prune_cleans_up_orphaned_test_details() {
3251 3253
3252 3254 // Verify orphan exists
3253 3255 let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM test_details WHERE run_id = ?")
3254 - .bind(run_id)
3256 + .bind(run_id.0)
3255 3257 .fetch_one(&pool)
3256 3258 .await
3257 3259 .unwrap();
@@ -3262,7 +3264,7 @@ async fn prune_cleans_up_orphaned_test_details() {
3262 3264 assert_eq!(result.test_details, 1);
3263 3265
3264 3266 let count: (i64,) = sqlx::query_as("SELECT COUNT(*) FROM test_details WHERE run_id = ?")
3265 - .bind(run_id)
3267 + .bind(run_id.0)
3266 3268 .fetch_one(&pool)
3267 3269 .await
3268 3270 .unwrap();