Skip to main content

max / makenotwork

39.6 KB · 1142 lines History Blame Raw
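//! Alerting via WAM tickets and Postmark email.
//!
//! Failure alerts (health, TLS, peers, routes, DNS, WHOIS, CORS, latency, test
//! duration, backups, scan pipeline, monitoring) request a WAM ticket when a
//! `wam_url` is configured; recovery notices are emailed through the Postmark API.
//! If no `postmark_token` is configured, emails are logged via `tracing` instead
//! of being sent.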
5
6 use sqlx::SqlitePool;
7 use tracing::{info, instrument, warn};
8
9 use crate::config::AlertConfig;
10 use crate::db;
11 use crate::types::AlertCategory;
12
/// Map the destination health status of a transition to a WAM ticket priority.
///
/// `"error"` and `"unreachable"` are `"critical"`, `"degraded"` is `"high"`,
/// and every other status is `"medium"`.
fn health_status_priority(to_status: &str) -> &'static str {
    if matches!(to_status, "error" | "unreachable") {
        return "critical";
    }
    if to_status == "degraded" {
        "high"
    } else {
        "medium"
    }
}
21
/// Map the number of days until a TLS certificate expires to a WAM ticket priority.
///
/// Three days or fewer (including negative values) is `"critical"`, four to
/// seven days is `"high"`, anything later is `"medium"`.
fn tls_expiry_priority(days: i64) -> &'static str {
    match days {
        i64::MIN..=3 => "critical",
        4..=7 => "high",
        _ => "medium",
    }
}
26
/// Map the number of days until a domain registration expires to a WAM ticket priority.
///
/// Seven days or fewer (including negative values) is `"critical"`, eight to
/// fourteen days is `"high"`, anything later is `"medium"`.
fn whois_expiry_priority(days: i64) -> &'static str {
    match days {
        i64::MIN..=7 => "critical",
        8..=14 => "high",
        _ => "medium",
    }
}
31
/// Map a non-healthy backup status to a WAM ticket priority.
///
/// Only `"missing"` is `"critical"`; every other status is `"high"`.
fn backup_status_priority(status: &str) -> &'static str {
    match status {
        "missing" => "critical",
        _ => "high",
    }
}
36
/// Produce the human-readable detail line used in backup status alerts.
///
/// A `"stale"` status with a known age reports that age in hours; `"missing"`
/// and `"error"` have fixed texts; anything else (including `"stale"` without
/// an age) falls back to echoing the raw status.
fn backup_status_detail(status: &str, age_hours: Option<i64>) -> String {
    if let ("stale", Some(hours)) = (status, age_hours) {
        return format!("last backup is {hours}h old");
    }
    match status {
        "missing" => String::from("no backup files found"),
        "error" => String::from("backup check failed"),
        _ => format!("status: {status}"),
    }
}
46
47 #[derive(Clone)]
48 pub struct Alerter {
49 config: AlertConfig,
50 client: reqwest::Client,
51 pool: SqlitePool,
52 instance_name: String,
53 wam_url: Option<String>,
54 }
55
56 impl Alerter {
57 pub fn new(config: AlertConfig, pool: SqlitePool, instance_name: String) -> Self {
58 let client = reqwest::Client::builder()
59 .timeout(std::time::Duration::from_secs(10))
60 .build()
61 .unwrap_or_default();
62 let wam_url = config.wam_url.clone();
63 Self { config, client, pool, instance_name, wam_url }
64 }
65
66 #[instrument(skip_all)]
67 pub async fn send_health_alert(
68 &self,
69 target: &str,
70 label: &str,
71 from_status: &str,
72 to_status: &str,
73 error: Option<&str>,
74 ) {
75 let alert_key = format!("health:{target}");
76 if self.is_within_cooldown(&alert_key).await {
77 info!("alert cooldown active for {alert_key}, skipping");
78 return;
79 }
80
81 let subject = format!("[PoM] {target}: {from_status} -> {to_status}");
82 let mut body = format!(
83 "Target: {label} ({target})\n\
84 Status: {from_status} -> {to_status}\n\
85 Instance: {}\n\
86 Time: {}\n",
87 self.instance_name,
88 chrono::Utc::now().to_rfc3339(),
89 );
90 if let Some(err) = error {
91 body.push_str(&format!("Error: {err}\n"));
92 }
93 body.push_str("\n- PoM");
94
95 let priority = health_status_priority(to_status);
96 self.wam_ticket(&subject, &body, priority, "pom-health", Some(target)).await;
97 self.record_alert(&alert_key, AlertCategory::Health, Some(from_status), Some(to_status), error).await;
98 }
99
100 #[instrument(skip_all)]
101 pub async fn send_health_recovery(
102 &self,
103 target: &str,
104 label: &str,
105 from_status: &str,
106 ) {
107 let alert_key = format!("health:{target}");
108 // No cooldown on recovery — always send
109 let subject = format!("[PoM] {target}: recovered");
110 let body = format!(
111 "Target: {label} ({target})\n\
112 Status: {from_status} -> operational\n\
113 Instance: {}\n\
114 Time: {}\n\n\
115 - PoM",
116 self.instance_name,
117 chrono::Utc::now().to_rfc3339(),
118 );
119
120 self.send_email(&subject, &body).await;
121 self.record_alert(&alert_key, AlertCategory::Recovery, Some(from_status), Some("operational"), None).await;
122 }
123
124 #[instrument(skip_all)]
125 pub async fn send_tls_expiry_alert(
126 &self,
127 target: &str,
128 host: &str,
129 days_remaining: i64,
130 not_after: &str,
131 ) {
132 let alert_key = format!("tls:{target}");
133 if self.is_within_cooldown(&alert_key).await {
134 info!("alert cooldown active for {alert_key}, skipping");
135 return;
136 }
137
138 let subject = format!("[PoM] {target}: TLS cert expires in {days_remaining} days");
139 let body = format!(
140 "Target: {target}\n\
141 Host: {host}\n\
142 Days remaining: {days_remaining}\n\
143 Expires: {not_after}\n\
144 Instance: {}\n\
145 Time: {}\n\n\
146 - PoM",
147 self.instance_name,
148 chrono::Utc::now().to_rfc3339(),
149 );
150
151 let priority = tls_expiry_priority(days_remaining);
152 self.wam_ticket(&subject, &body, priority, "pom-tls", Some(&format!("{target}:{host}"))).await;
153 self.record_alert(&alert_key, AlertCategory::TlsExpiry, None, None, None).await;
154 }
155
156 #[instrument(skip_all)]
157 pub async fn send_tls_error_alert(
158 &self,
159 target: &str,
160 host: &str,
161 error: &str,
162 ) {
163 let alert_key = format!("tls:{target}");
164 if self.is_within_cooldown(&alert_key).await {
165 info!("alert cooldown active for {alert_key}, skipping");
166 return;
167 }
168
169 let subject = format!("[PoM] {target}: TLS check failed");
170 let body = format!(
171 "Target: {target}\n\
172 Host: {host}\n\
173 Error: {error}\n\
174 Instance: {}\n\
175 Time: {}\n\n\
176 - PoM",
177 self.instance_name,
178 chrono::Utc::now().to_rfc3339(),
179 );
180
181 self.wam_ticket(&subject, &body, "high", "pom-tls", Some(&format!("{target}:{host}"))).await;
182 self.record_alert(&alert_key, AlertCategory::TlsError, None, None, Some(error)).await;
183 }
184
185 #[instrument(skip_all)]
186 pub async fn send_tls_recovery(
187 &self,
188 target: &str,
189 label: &str,
190 days_remaining: i64,
191 ) {
192 let alert_key = format!("tls:{target}");
193 // No cooldown on recovery — always send
194 let subject = format!("[PoM] {target}: TLS cert renewed");
195 let body = format!(
196 "Target: {label} ({target})\n\
197 Days remaining: {days_remaining}\n\
198 Instance: {}\n\
199 Time: {}\n\n\
200 - PoM",
201 self.instance_name,
202 chrono::Utc::now().to_rfc3339(),
203 );
204
205 self.send_email(&subject, &body).await;
206 self.record_alert(&alert_key, AlertCategory::TlsRecovery, None, None, None).await;
207 }
208
209 #[instrument(skip_all)]
210 pub async fn send_peer_missing(
211 &self,
212 peer_name: &str,
213 address: &str,
214 consecutive_failures: u32,
215 ) {
216 let alert_key = format!("peer:{peer_name}");
217 if self.is_within_cooldown(&alert_key).await {
218 info!("alert cooldown active for {alert_key}, skipping");
219 return;
220 }
221
222 let subject = format!("[PoM] peer {peer_name}: missing");
223 let body = format!(
224 "Peer: {peer_name}\n\
225 Address: {address}\n\
226 Consecutive failures: {consecutive_failures}\n\
227 Instance: {}\n\
228 Time: {}\n\n\
229 - PoM",
230 self.instance_name,
231 chrono::Utc::now().to_rfc3339(),
232 );
233
234 self.wam_ticket(&subject, &body, "high", "pom-peer", Some(peer_name)).await;
235 self.record_alert(&alert_key, AlertCategory::PeerMissing, None, None, None).await;
236 }
237
238 #[instrument(skip_all)]
239 pub async fn send_peer_recovery(
240 &self,
241 peer_name: &str,
242 address: &str,
243 ) {
244 let subject = format!("[PoM] peer {peer_name}: recovered");
245 let body = format!(
246 "Peer: {peer_name}\n\
247 Address: {address}\n\
248 Instance: {}\n\
249 Time: {}\n\n\
250 - PoM",
251 self.instance_name,
252 chrono::Utc::now().to_rfc3339(),
253 );
254
255 let alert_key = format!("peer:{peer_name}");
256 self.send_email(&subject, &body).await;
257 self.record_alert(&alert_key, AlertCategory::PeerRecovery, None, None, None).await;
258 }
259
260 #[instrument(skip_all)]
261 pub async fn send_route_failure_alert(
262 &self,
263 target: &str,
264 label: &str,
265 failed_paths: &[String],
266 ) {
267 let alert_key = format!("route:{target}");
268 if self.is_within_cooldown(&alert_key).await {
269 info!("alert cooldown active for {alert_key}, skipping");
270 return;
271 }
272
273 let n = failed_paths.len();
274 let subject = format!("[PoM] {label}: {n} route(s) failing");
275 let body = format!(
276 "Target: {label} ({target})\n\
277 Failed routes:\n{}\n\
278 Instance: {}\n\
279 Time: {}\n\n\
280 - PoM",
281 failed_paths.iter().map(|p| format!(" - {p}")).collect::<Vec<_>>().join("\n"),
282 self.instance_name,
283 chrono::Utc::now().to_rfc3339(),
284 );
285
286 self.wam_ticket(&subject, &body, "high", "pom-routes", Some(target)).await;
287 self.record_alert(&alert_key, AlertCategory::RouteFailure, None, None, None).await;
288 }
289
290 #[instrument(skip_all)]
291 pub async fn send_route_recovery_alert(
292 &self,
293 target: &str,
294 label: &str,
295 recovered_paths: &[String],
296 ) {
297 // No cooldown on recovery — always send
298 let alert_key = format!("route:{target}");
299 let subject = format!("[PoM] {label}: routes recovered");
300 let body = format!(
301 "Target: {label} ({target})\n\
302 Recovered routes:\n{}\n\
303 Instance: {}\n\
304 Time: {}\n\n\
305 - PoM",
306 recovered_paths.iter().map(|p| format!(" - {p}")).collect::<Vec<_>>().join("\n"),
307 self.instance_name,
308 chrono::Utc::now().to_rfc3339(),
309 );
310
311 self.send_email(&subject, &body).await;
312 self.record_alert(&alert_key, AlertCategory::RouteRecovery, None, None, None).await;
313 }
314
315 #[instrument(skip_all)]
316 pub async fn send_dns_mismatch_alert(
317 &self,
318 target: &str,
319 label: &str,
320 mismatches: &[crate::types::DnsCheckResult],
321 ) {
322 let alert_key = format!("dns:{target}");
323 if self.is_within_cooldown(&alert_key).await {
324 info!("alert cooldown active for {alert_key}, skipping");
325 return;
326 }
327
328 let n = mismatches.len();
329 let subject = format!("[PoM] {label}: {n} DNS record(s) mismatched");
330 let details: Vec<String> = mismatches
331 .iter()
332 .map(|m| {
333 if let Some(ref err) = m.error {
334 format!(" - {} {}: {err}", m.name, m.record_type)
335 } else {
336 format!(
337 " - {} {}: expected {:?}, got {:?}",
338 m.name, m.record_type, m.expected, m.actual
339 )
340 }
341 })
342 .collect();
343 let body = format!(
344 "Target: {label} ({target})\n\
345 DNS mismatches:\n{}\n\
346 Instance: {}\n\
347 Time: {}\n\n\
348 - PoM",
349 details.join("\n"),
350 self.instance_name,
351 chrono::Utc::now().to_rfc3339(),
352 );
353
354 self.wam_ticket(&subject, &body, "high", "pom-dns", Some(target)).await;
355 self.record_alert(&alert_key, AlertCategory::DnsMismatch, None, None, None).await;
356 }
357
358 #[instrument(skip_all)]
359 pub async fn send_dns_recovery_alert(
360 &self,
361 target: &str,
362 label: &str,
363 ) {
364 // No cooldown on recovery — always send
365 let alert_key = format!("dns:{target}");
366 let subject = format!("[PoM] {label}: DNS records recovered");
367 let body = format!(
368 "Target: {label} ({target})\n\
369 All DNS records now match expected values.\n\
370 Instance: {}\n\
371 Time: {}\n\n\
372 - PoM",
373 self.instance_name,
374 chrono::Utc::now().to_rfc3339(),
375 );
376
377 self.send_email(&subject, &body).await;
378 self.record_alert(&alert_key, AlertCategory::DnsRecovery, None, None, None).await;
379 }
380
381 #[instrument(skip_all)]
382 pub async fn send_whois_expiry_alert(
383 &self,
384 target: &str,
385 label: &str,
386 domain: &str,
387 days_remaining: i64,
388 ) {
389 let alert_key = format!("whois:{target}");
390 if self.is_within_cooldown(&alert_key).await {
391 info!("alert cooldown active for {alert_key}, skipping");
392 return;
393 }
394
395 let subject = format!("[PoM] {label}: domain {domain} expires in {days_remaining} days");
396 let body = format!(
397 "Target: {label} ({target})\n\
398 Domain: {domain}\n\
399 Days remaining: {days_remaining}\n\
400 Instance: {}\n\
401 Time: {}\n\n\
402 - PoM",
403 self.instance_name,
404 chrono::Utc::now().to_rfc3339(),
405 );
406
407 let priority = whois_expiry_priority(days_remaining);
408 self.wam_ticket(&subject, &body, priority, "pom-whois", Some(&format!("{target}:{domain}"))).await;
409 self.record_alert(&alert_key, AlertCategory::WhoisExpiry, None, None, None).await;
410 }
411
412 #[instrument(skip_all)]
413 pub async fn send_whois_error_alert(
414 &self,
415 target: &str,
416 label: &str,
417 domain: &str,
418 error: &str,
419 ) {
420 let alert_key = format!("whois:{target}");
421 if self.is_within_cooldown(&alert_key).await {
422 info!("alert cooldown active for {alert_key}, skipping");
423 return;
424 }
425
426 let subject = format!("[PoM] {label}: WHOIS check failed for {domain}");
427 let body = format!(
428 "Target: {label} ({target})\n\
429 Domain: {domain}\n\
430 Error: {error}\n\
431 Instance: {}\n\
432 Time: {}\n\n\
433 - PoM",
434 self.instance_name,
435 chrono::Utc::now().to_rfc3339(),
436 );
437
438 self.wam_ticket(&subject, &body, "high", "pom-whois", Some(&format!("{target}:{domain}"))).await;
439 self.record_alert(&alert_key, AlertCategory::WhoisError, None, None, Some(error)).await;
440 }
441
442 #[instrument(skip_all)]
443 pub async fn send_cors_failure_alert(
444 &self,
445 target: &str,
446 label: &str,
447 failures: &[crate::types::CorsCheckResult],
448 ) {
449 let alert_key = format!("cors:{target}");
450 if self.is_within_cooldown(&alert_key).await {
451 info!("alert cooldown active for {alert_key}, skipping");
452 return;
453 }
454
455 let n = failures.len();
456 let subject = format!("[PoM] {label}: {n} CORS preflight(s) failing");
457 let details: Vec<String> = failures
458 .iter()
459 .map(|f| {
460 if let Some(ref err) = f.error {
461 format!(" - {} {} from {}: {err}", f.method, f.url, f.origin)
462 } else {
463 format!(" - {} {} from {}: no CORS headers", f.method, f.url, f.origin)
464 }
465 })
466 .collect();
467 let body = format!(
468 "Target: {label} ({target})\n\
469 CORS preflight failures:\n{}\n\
470 Instance: {}\n\
471 Time: {}\n\n\
472 Browser-side uploads will silently fail without CORS.\n\n\
473 - PoM",
474 details.join("\n"),
475 self.instance_name,
476 chrono::Utc::now().to_rfc3339(),
477 );
478
479 self.wam_ticket(&subject, &body, "high", "pom-cors", Some(target)).await;
480 self.record_alert(&alert_key, AlertCategory::CorsFailure, None, None, None).await;
481 }
482
483 #[instrument(skip_all)]
484 pub async fn send_cors_recovery_alert(
485 &self,
486 target: &str,
487 label: &str,
488 ) {
489 // No cooldown on recovery — always send
490 let alert_key = format!("cors:{target}");
491 let subject = format!("[PoM] {label}: CORS preflights recovered");
492 let body = format!(
493 "Target: {label} ({target})\n\
494 All CORS preflight checks passing.\n\
495 Instance: {}\n\
496 Time: {}\n\n\
497 - PoM",
498 self.instance_name,
499 chrono::Utc::now().to_rfc3339(),
500 );
501
502 self.send_email(&subject, &body).await;
503 self.record_alert(&alert_key, AlertCategory::CorsRecovery, None, None, None).await;
504 }
505
506 #[instrument(skip_all)]
507 pub async fn send_latency_drift_alert(
508 &self,
509 target: &str,
510 label: &str,
511 drift_message: &str,
512 ) {
513 let alert_key = format!("latency:{target}");
514 if self.is_within_cooldown(&alert_key).await {
515 info!("alert cooldown active for {alert_key}, skipping");
516 return;
517 }
518
519 let subject = format!("[PoM] {target}: latency drift detected");
520 let body = format!(
521 "Target: {label} ({target})\n\
522 {drift_message}\n\
523 Instance: {}\n\
524 Time: {}\n\n\
525 - PoM",
526 self.instance_name,
527 chrono::Utc::now().to_rfc3339(),
528 );
529
530 self.wam_ticket(&subject, &body, "medium", "pom-latency", Some(target)).await;
531 self.record_alert(&alert_key, AlertCategory::LatencyDrift, None, None, Some(drift_message)).await;
532 }
533
534 #[instrument(skip_all)]
535 pub async fn send_latency_recovery(
536 &self,
537 target: &str,
538 label: &str,
539 ) {
540 // No cooldown on recovery — always send
541 let alert_key = format!("latency:{target}");
542 let subject = format!("[PoM] {target}: latency recovered");
543 let body = format!(
544 "Target: {label} ({target})\n\
545 Latency returned to normal.\n\
546 Instance: {}\n\
547 Time: {}\n\n\
548 - PoM",
549 self.instance_name,
550 chrono::Utc::now().to_rfc3339(),
551 );
552
553 self.send_email(&subject, &body).await;
554 self.record_alert(&alert_key, AlertCategory::LatencyRecovery, None, None, None).await;
555 }
556
557 #[instrument(skip_all)]
558 pub async fn send_test_duration_drift_alert(
559 &self,
560 target: &str,
561 label: &str,
562 drift_message: &str,
563 ) {
564 let alert_key = format!("test_duration:{target}");
565 if self.is_within_cooldown(&alert_key).await {
566 info!("alert cooldown active for {alert_key}, skipping");
567 return;
568 }
569
570 let subject = format!("[PoM] {target}: test duration drift detected");
571 let body = format!(
572 "Target: {label} ({target})\n\
573 {drift_message}\n\
574 Instance: {}\n\
575 Time: {}\n\n\
576 - PoM",
577 self.instance_name,
578 chrono::Utc::now().to_rfc3339(),
579 );
580
581 self.wam_ticket(&subject, &body, "medium", "pom-test-duration", Some(target)).await;
582 self.record_alert(&alert_key, AlertCategory::TestDurationDrift, None, None, Some(drift_message)).await;
583 }
584
585 #[instrument(skip_all)]
586 pub async fn send_backup_stale_alert(
587 &self,
588 target: &str,
589 label: &str,
590 database: &str,
591 status: &str,
592 age_hours: Option<i64>,
593 ) {
594 let alert_key = format!("backup:{target}:{database}");
595 if self.is_within_cooldown(&alert_key).await {
596 info!("alert cooldown active for {alert_key}, skipping");
597 return;
598 }
599
600 let detail = backup_status_detail(status, age_hours);
601
602 let subject = format!("[PoM] {label}: {database} backup {status}");
603 let body = format!(
604 "Target: {label} ({target})\n\
605 Database: {database}\n\
606 Status: {status}\n\
607 Detail: {detail}\n\
608 Instance: {}\n\
609 Time: {}\n\n\
610 - PoM",
611 self.instance_name,
612 chrono::Utc::now().to_rfc3339(),
613 );
614
615 let priority = backup_status_priority(status);
616 self.wam_ticket(&subject, &body, priority, "pom-backup", Some(&format!("{target}:{database}"))).await;
617 self.record_alert(&alert_key, AlertCategory::BackupStale, None, Some(status), None).await;
618 }
619
620 #[instrument(skip_all)]
621 pub async fn send_backup_recovery(
622 &self,
623 target: &str,
624 label: &str,
625 database: &str,
626 ) {
627 // No cooldown on recovery — always send
628 let alert_key = format!("backup:{target}:{database}");
629 let subject = format!("[PoM] {label}: {database} backup recovered");
630 let body = format!(
631 "Target: {label} ({target})\n\
632 Database: {database}\n\
633 Backup is now current.\n\
634 Instance: {}\n\
635 Time: {}\n\n\
636 - PoM",
637 self.instance_name,
638 chrono::Utc::now().to_rfc3339(),
639 );
640
641 self.send_email(&subject, &body).await;
642 self.record_alert(&alert_key, AlertCategory::BackupRecovery, None, Some("ok"), None).await;
643 }
644
645 /// Fire when the scan pipeline transitions from operational → degraded or
646 /// unreachable. Includes the audit-doc threshold issues that fired.
647 #[instrument(skip_all)]
648 pub async fn send_scan_pipeline_alert(
649 &self,
650 target: &str,
651 label: &str,
652 status: &str,
653 issues: &[String],
654 ) {
655 let alert_key = format!("scan_pipeline:{target}");
656 if self.is_within_cooldown(&alert_key).await {
657 info!("alert cooldown active for {alert_key}, skipping");
658 return;
659 }
660
661 let subject = format!("[PoM] {label}: scan pipeline {status}");
662 let body = format!(
663 "Target: {label} ({target})\n\
664 Status: {status}\n\
665 Issues:\n{}\n\
666 Instance: {}\n\
667 Time: {}\n\n\
668 Dashboard: <https://{}/admin/uploads>\n\n\
669 - PoM",
670 issues.iter().map(|i| format!(" - {i}")).collect::<Vec<_>>().join("\n"),
671 self.instance_name,
672 chrono::Utc::now().to_rfc3339(),
673 target,
674 );
675
676 let priority = if status == "unreachable" { "high" } else { "medium" };
677 self.wam_ticket(&subject, &body, priority, "pom-scan-pipeline", Some(target)).await;
678 self.record_alert(&alert_key, AlertCategory::ScanPipelineDegraded, None, Some(status), None).await;
679 }
680
681 /// Fire on recovery from a degraded / unreachable scan-pipeline state.
682 #[instrument(skip_all)]
683 pub async fn send_scan_pipeline_recovery(&self, target: &str, label: &str) {
684 let alert_key = format!("scan_pipeline:{target}");
685 let subject = format!("[PoM] {label}: scan pipeline recovered");
686 let body = format!(
687 "Target: {label} ({target})\n\
688 Scan pipeline is operational.\n\
689 Instance: {}\n\
690 Time: {}\n\n\
691 - PoM",
692 self.instance_name,
693 chrono::Utc::now().to_rfc3339(),
694 );
695
696 self.send_email(&subject, &body).await;
697 self.record_alert(&alert_key, AlertCategory::ScanPipelineRecovery, None, Some("operational"), None).await;
698 }
699
700 /// All monitored targets are unreachable — likely a network issue with PoM itself.
701 #[instrument(skip_all)]
702 pub async fn send_monitoring_offline_alert(&self, target_count: usize) {
703 let alert_key = "monitoring:self";
704 if self.is_within_cooldown(alert_key).await {
705 info!("alert cooldown active for {alert_key}, skipping");
706 return;
707 }
708
709 let subject = format!("[PoM] all {target_count} targets unreachable");
710 let body = format!(
711 "All {target_count} monitored targets are non-operational.\n\
712 This likely indicates a network issue with the PoM instance itself,\n\
713 not an actual outage of all targets.\n\n\
714 Instance: {}\n\
715 Time: {}\n\n\
716 - PoM",
717 self.instance_name,
718 chrono::Utc::now().to_rfc3339(),
719 );
720
721 self.wam_ticket(&subject, &body, "critical", "pom-monitoring", Some("self")).await;
722 self.record_alert(alert_key, AlertCategory::MonitoringOffline, None, None, None).await;
723 }
724
725 /// At least one target is reachable again after a monitoring-offline event.
726 #[instrument(skip_all)]
727 pub async fn send_monitoring_recovery(&self) {
728 let alert_key = "monitoring:self";
729 let subject = "[PoM] monitoring recovered".to_string();
730 let body = format!(
731 "At least one target is reachable again.\n\
732 Instance: {}\n\
733 Time: {}\n\n\
734 - PoM",
735 self.instance_name,
736 chrono::Utc::now().to_rfc3339(),
737 );
738
739 self.send_email(&subject, &body).await;
740 self.record_alert(alert_key, AlertCategory::MonitoringRecovery, None, None, None).await;
741 }
742
743 async fn is_within_cooldown(&self, target: &str) -> bool {
744 let latest = match db::get_latest_alert_for_target(&self.pool, target).await {
745 Ok(Some(row)) => row,
746 _ => return false,
747 };
748
749 let sent_at = match chrono::DateTime::parse_from_rfc3339(&latest.sent_at) {
750 Ok(dt) => dt,
751 Err(_) => return false,
752 };
753
754 let elapsed = chrono::Utc::now().signed_duration_since(sent_at);
755 elapsed.num_seconds() < self.config.cooldown_secs as i64
756 }
757
758 async fn send_email(&self, subject: &str, body: &str) {
759 let Some(ref token) = self.config.postmark_token else {
760 info!("[dev] alert: {subject}");
761 info!("[dev] {body}");
762 return;
763 };
764
765 let payload = serde_json::json!({
766 "From": self.config.from,
767 "To": self.config.to,
768 "Subject": subject,
769 "TextBody": body,
770 });
771
772 let send_fut = self.client
773 .post("https://api.postmarkapp.com/email")
774 .header("X-Postmark-Server-Token", token)
775 .header("Content-Type", "application/json")
776 .header("Accept", "application/json")
777 .json(&payload)
778 .send();
779
780 // Wrap in a 30-second timeout to prevent Postmark latency from blocking
781 // the alert task. The reqwest client has its own 10s timeout, but this
782 // guards against DNS resolution stalls and connection pool exhaustion.
783 match tokio::time::timeout(std::time::Duration::from_secs(30), send_fut).await {
784 Ok(Ok(resp)) if resp.status().is_success() => {
785 info!("alert sent: {subject}");
786 }
787 Ok(Ok(resp)) => {
788 let status = resp.status();
789 let text = resp.text().await.unwrap_or_default();
790 warn!("postmark error ({status}): {text}");
791 }
792 Ok(Err(e)) => {
793 warn!("failed to send alert: {e}");
794 }
795 Err(_) => {
796 warn!("alert send timed out after 30s: {subject}");
797 }
798 }
799 }
800
801 async fn record_alert(
802 &self,
803 target: &str,
804 alert_type: AlertCategory,
805 from_status: Option<&str>,
806 to_status: Option<&str>,
807 error: Option<&str>,
808 ) {
809 let alert_type_str = alert_type.to_string();
810 if let Err(e) = db::insert_alert(&self.pool, target, &alert_type_str, from_status, to_status, error).await {
811 warn!("failed to record alert: {e}");
812 }
813 }
814
815 /// Create a WAM ticket (best-effort, fire-and-forget).
816 async fn wam_ticket(
817 &self,
818 title: &str,
819 body: &str,
820 priority: &str,
821 source: &str,
822 source_ref: Option<&str>,
823 ) {
824 let Some(ref base_url) = self.wam_url else { return };
825 let url = format!("{base_url}/tickets");
826
827 let mut payload = serde_json::json!({
828 "title": title,
829 "body": body,
830 "priority": priority,
831 "source": source,
832 });
833 if let Some(r) = source_ref {
834 payload["source_ref"] = serde_json::json!(r);
835 }
836
837 match self.client.post(&url).json(&payload).send().await {
838 Ok(resp) if resp.status().is_success() => {
839 info!("WAM ticket created: {title}");
840 }
841 Ok(resp) => {
842 warn!("WAM ticket creation returned {}: {title}", resp.status());
843 }
844 Err(e) => {
845 warn!("WAM unreachable: {e}");
846 }
847 }
848 }
849 }
850
851 #[cfg(test)]
852 mod tests {
853 use super::*;
854
855 fn test_alerter(pool: SqlitePool) -> Alerter {
856 let config = AlertConfig {
857 postmark_token: None, // dev mode
858 to: "test@example.com".to_string(),
859 from: "PoM Alerts <pom-alerts@makenot.work>".to_string(),
860 cooldown_secs: 300,
861 wam_url: None,
862 };
863 Alerter::new(config, pool, "test-instance".to_string())
864 }
865
866 #[tokio::test]
867 async fn cooldown_prevents_duplicate_alerts() {
868 let pool = db::connect_in_memory().await.unwrap();
869 let alerter = test_alerter(pool.clone());
870
871 // First alert — not in cooldown
872 assert!(!alerter.is_within_cooldown("health:mnw").await);
873
874 // Record an alert
875 db::insert_alert(&pool, "health:mnw", "health", Some("operational"), Some("error"), None)
876 .await
877 .unwrap();
878
879 // Now should be in cooldown
880 assert!(alerter.is_within_cooldown("health:mnw").await);
881 }
882
883 #[tokio::test]
884 async fn cooldown_does_not_affect_other_targets() {
885 let pool = db::connect_in_memory().await.unwrap();
886 let alerter = test_alerter(pool.clone());
887
888 db::insert_alert(&pool, "health:mnw", "health", None, None, None)
889 .await
890 .unwrap();
891
892 // Different target should not be in cooldown
893 assert!(!alerter.is_within_cooldown("health:other").await);
894 }
895
896 #[tokio::test]
897 async fn dev_mode_does_not_send_http() {
898 let pool = db::connect_in_memory().await.unwrap();
899 let alerter = test_alerter(pool.clone());
900
901 // This should log instead of making HTTP calls (no panic, no error)
902 alerter.send_health_alert("mnw", "MakeNotWork", "operational", "error", None).await;
903
904 // Verify alert was recorded in DB with the prefixed key (health:mnw),
905 // matching the cooldown lookup key format.
906 let latest = db::get_latest_alert_for_target(&pool, "health:mnw").await.unwrap();
907 assert!(latest.is_some());
908 let row = latest.unwrap();
909 assert_eq!(row.alert_type, "health");
910 assert_eq!(row.from_status.as_deref(), Some("operational"));
911 assert_eq!(row.to_status.as_deref(), Some("error"));
912 }
913
914 #[tokio::test]
915 async fn route_alert_cooldown_key() {
916 let pool = db::connect_in_memory().await.unwrap();
917 let alerter = test_alerter(pool.clone());
918
919 assert!(!alerter.is_within_cooldown("route:mnw").await);
920
921 alerter.send_route_failure_alert("mnw", "MakeNotWork", &["/docs/faq".to_string()]).await;
922
923 assert!(alerter.is_within_cooldown("route:mnw").await);
924 assert!(!alerter.is_within_cooldown("route:mt").await);
925 }
926
927 #[tokio::test]
928 async fn recovery_does_not_start_cooldown_for_next_failure() {
929 let pool = db::connect_in_memory().await.unwrap();
930 let alerter = test_alerter(pool.clone());
931
932 // Send a failure alert — starts cooldown
933 alerter.send_health_alert("mnw", "MakeNotWork", "operational", "error", None).await;
934 assert!(alerter.is_within_cooldown("health:mnw").await);
935
936 // Send a recovery alert (always sends, no cooldown check)
937 alerter.send_health_recovery("mnw", "MakeNotWork", "error").await;
938
939 // The recovery alert should NOT reset cooldown for failures.
940 // is_within_cooldown now excludes recovery-type alerts, so it checks
941 // the original failure alert's timestamp — which is still within cooldown.
942 assert!(alerter.is_within_cooldown("health:mnw").await);
943 }
944
945 #[tokio::test]
946 async fn dns_alert_cooldown_key() {
947 let pool = db::connect_in_memory().await.unwrap();
948 let alerter = test_alerter(pool.clone());
949
950 assert!(!alerter.is_within_cooldown("dns:mnw").await);
951
952 let mismatches = vec![crate::types::DnsCheckResult {
953 target: "mnw".to_string(),
954 name: "makenot.work".to_string(),
955 record_type: crate::types::DnsRecordType::A,
956 expected: vec!["1.2.3.4".to_string()],
957 actual: vec!["5.6.7.8".to_string()],
958 matches: false,
959 checked_at: chrono::Utc::now().to_rfc3339(),
960 error: None,
961 }];
962 alerter.send_dns_mismatch_alert("mnw", "MakeNotWork", &mismatches).await;
963
964 assert!(alerter.is_within_cooldown("dns:mnw").await);
965 assert!(!alerter.is_within_cooldown("dns:other").await);
966 }
967
968 #[tokio::test]
969 async fn whois_alert_cooldown_key() {
970 let pool = db::connect_in_memory().await.unwrap();
971 let alerter = test_alerter(pool.clone());
972
973 assert!(!alerter.is_within_cooldown("whois:mnw").await);
974
975 alerter.send_whois_expiry_alert("mnw", "MakeNotWork", "makenot.work", 15).await;
976
977 assert!(alerter.is_within_cooldown("whois:mnw").await);
978 assert!(!alerter.is_within_cooldown("whois:other").await);
979 }
980
981 #[tokio::test]
982 async fn health_alert_cooldown_key_matches_record_key() {
983 let pool = db::connect_in_memory().await.unwrap();
984 let alerter = test_alerter(pool.clone());
985
986 // Not in cooldown initially
987 assert!(!alerter.is_within_cooldown("health:example.com").await);
988
989 // Send an alert for "example.com"
990 alerter.send_health_alert("example.com", "Example", "operational", "error", None).await;
991
992 // Same target should now be in cooldown
993 assert!(alerter.is_within_cooldown("health:example.com").await);
994
995 // Different target should NOT be in cooldown
996 assert!(!alerter.is_within_cooldown("health:other.com").await);
997 }
998
999 #[tokio::test]
1000 async fn tls_expiry_alert_cooldown_key() {
1001 let pool = db::connect_in_memory().await.unwrap();
1002 let alerter = test_alerter(pool.clone());
1003
1004 assert!(!alerter.is_within_cooldown("tls:mnw").await);
1005 alerter.send_tls_expiry_alert("mnw", "makenot.work", 10, "2026-04-01T00:00:00Z").await;
1006 assert!(alerter.is_within_cooldown("tls:mnw").await);
1007 }
1008
1009 #[tokio::test]
1010 async fn tls_error_alert_cooldown_key() {
1011 let pool = db::connect_in_memory().await.unwrap();
1012 let alerter = test_alerter(pool.clone());
1013
1014 assert!(!alerter.is_within_cooldown("tls:mnw").await);
1015 alerter.send_tls_error_alert("mnw", "makenot.work", "certificate expired").await;
1016 assert!(alerter.is_within_cooldown("tls:mnw").await);
1017 }
1018
1019 #[tokio::test]
1020 async fn latency_drift_alert_cooldown_key() {
1021 let pool = db::connect_in_memory().await.unwrap();
1022 let alerter = test_alerter(pool.clone());
1023
1024 assert!(!alerter.is_within_cooldown("latency:mnw").await);
1025 alerter.send_latency_drift_alert("mnw", "MakeNotWork", "avg 500ms, baseline 100ms").await;
1026 assert!(alerter.is_within_cooldown("latency:mnw").await);
1027 }
1028
1029 #[tokio::test]
1030 async fn test_duration_drift_alert_cooldown_key() {
1031 let pool = db::connect_in_memory().await.unwrap();
1032 let alerter = test_alerter(pool.clone());
1033
1034 assert!(!alerter.is_within_cooldown("test_duration:mnw").await);
1035 alerter.send_test_duration_drift_alert("mnw", "MakeNotWork", "drift: 120s vs 60s baseline").await;
1036 assert!(alerter.is_within_cooldown("test_duration:mnw").await);
1037 }
1038
1039 #[tokio::test]
1040 async fn monitoring_offline_alert_cooldown_key() {
1041 let pool = db::connect_in_memory().await.unwrap();
1042 let alerter = test_alerter(pool.clone());
1043
1044 assert!(!alerter.is_within_cooldown("monitoring:self").await);
1045 alerter.send_monitoring_offline_alert(3).await;
1046 assert!(alerter.is_within_cooldown("monitoring:self").await);
1047 }
1048
1049 #[tokio::test]
1050 async fn route_recovery_does_not_start_cooldown() {
1051 let pool = db::connect_in_memory().await.unwrap();
1052 let alerter = test_alerter(pool.clone());
1053
1054 alerter.send_route_recovery_alert("mnw", "MakeNotWork", &["/health".to_string()]).await;
1055 // Recovery alerts are excluded from cooldown lookups, so sending a recovery
1056 // should NOT put the key into cooldown.
1057 assert!(!alerter.is_within_cooldown("route:mnw").await);
1058 }
1059
1060 #[tokio::test]
1061 async fn dns_recovery_does_not_start_cooldown() {
1062 let pool = db::connect_in_memory().await.unwrap();
1063 let alerter = test_alerter(pool.clone());
1064
1065 alerter.send_dns_recovery_alert("mnw", "MakeNotWork").await;
1066 assert!(!alerter.is_within_cooldown("dns:mnw").await);
1067 }
1068
1069 #[tokio::test]
1070 async fn tls_recovery_does_not_start_cooldown() {
1071 let pool = db::connect_in_memory().await.unwrap();
1072 let alerter = test_alerter(pool.clone());
1073
1074 alerter.send_tls_recovery("mnw", "MakeNotWork", 90).await;
1075 assert!(!alerter.is_within_cooldown("tls:mnw").await);
1076 }
1077
1078 // ── Pure priority/severity helpers (pin the <= boundaries) ──
1079
#[test]
fn tls_expiry_priority_boundaries() {
    // A negative day count falls into the top tier as well.
    assert_eq!(tls_expiry_priority(-5), "critical", "negative days = already expired");

    // (days, expected priority), with both edges of every tier listed side by side:
    // critical is `days <= 3`, high is `4..=7`, medium is everything above 7.
    let cases: [(i64, &str); 6] = [
        (0, "critical"),
        (3, "critical"),
        (4, "high"),
        (7, "high"),
        (8, "medium"),
        (90, "medium"),
    ];
    for (days, expected) in cases {
        assert_eq!(tls_expiry_priority(days), expected, "days = {days}");
    }
}
1093
#[test]
fn whois_expiry_priority_boundaries() {
    // (days, expected priority), with both edges of every tier listed side by side:
    // critical is `days <= 7`, high is `8..=14`, medium is everything above 14.
    let cases: [(i64, &str); 6] = [
        (-1, "critical"),
        (7, "critical"),
        (8, "high"),
        (14, "high"),
        (15, "medium"),
        (180, "medium"),
    ];
    for (days, expected) in cases {
        assert_eq!(whois_expiry_priority(days), expected, "days = {days}");
    }
}
1106
#[test]
fn backup_status_priority_missing_is_critical() {
    // Only the exact status "missing" is escalated to critical.
    assert_eq!(backup_status_priority("missing"), "critical");

    // Every other status, including unknown and empty strings, maps to high.
    for status in ["stale", "error", "anything-else", ""] {
        assert_eq!(backup_status_priority(status), "high", "status = {status:?}");
    }
}
1115
#[test]
fn backup_status_detail_arms() {
    // (status, age in hours, expected detail text): one row per combination checked.
    let cases: [(&str, Option<i64>, &str); 7] = [
        ("stale", Some(12), "last backup is 12h old"),
        // `stale` without an age is not special-cased and gets the generic text.
        ("stale", None, "status: stale"),
        // `missing` and `error` ignore the age entirely.
        ("missing", None, "no backup files found"),
        ("missing", Some(5), "no backup files found"),
        ("error", None, "backup check failed"),
        ("error", Some(99), "backup check failed"),
        // Unknown statuses are echoed back in the generic text.
        ("weird", None, "status: weird"),
    ];
    for (status, age_hours, expected) in cases {
        assert_eq!(
            backup_status_detail(status, age_hours),
            expected,
            "status = {status:?}, age_hours = {age_hours:?}"
        );
    }
}
1130
#[test]
fn health_status_priority_arms() {
    // Both hard-failure statuses share the critical tier.
    for status in ["error", "unreachable"] {
        assert_eq!(health_status_priority(status), "critical", "status = {status:?}");
    }

    assert_eq!(health_status_priority("degraded"), "high");

    // Everything else (operational, unrecognised values, the empty string)
    // lands on the medium default.
    for status in ["operational", "flapping", ""] {
        assert_eq!(health_status_priority(status), "medium", "status = {status:?}");
    }
}
1141 }
1142