Skip to main content

max / makenotwork

2.0 KB · 65 lines History Blame Raw
1 use tokio::task::JoinHandle;
2 use tracing::info;
3
4 use pom::alerts::Alerter;
5 use pom::config::Config;
6 use pom::db;
7 use pom::types::HealthStatus;
8
9 pub(crate) fn spawn_meta_alert_task(
10 config: &Config,
11 pool: &sqlx::SqlitePool,
12 default_interval: u64,
13 cancel: &tokio_util::sync::CancellationToken,
14 alerter: &Option<Alerter>,
15 ) -> Option<JoinHandle<()>> {
16 let health_target_names: Vec<String> = config.target_names()
17 .into_iter()
18 .filter(|n| config.get_target(n).is_some_and(|t| t.health.is_some()))
19 .collect();
20
21 if health_target_names.len() < 2 {
22 return None;
23 }
24 let alerter = alerter.clone()?;
25
26 let pool = pool.clone();
27 let cancel = cancel.clone();
28 let meta_interval_secs = default_interval * 2;
29
30 info!("Meta-alert: monitoring-offline check every {meta_interval_secs}s ({} targets)", health_target_names.len());
31
32 Some(tokio::spawn(async move {
33 let mut interval = tokio::time::interval(
34 std::time::Duration::from_secs(meta_interval_secs),
35 );
36 interval.tick().await; // consume immediate first tick
37 let mut was_all_down = false;
38
39 loop {
40 tokio::select! {
41 _ = cancel.cancelled() => break,
42 _ = interval.tick() => {}
43 }
44
45 let mut all_down = true;
46 for name in &health_target_names {
47 if let Ok(Some(snap)) = db::get_latest_health(&pool, name).await
48 && (snap.status == HealthStatus::Operational
49 || snap.status == HealthStatus::Degraded)
50 {
51 all_down = false;
52 break;
53 }
54 }
55
56 if all_down && !was_all_down {
57 alerter.send_monitoring_offline_alert(health_target_names.len()).await;
58 } else if !all_down && was_all_down {
59 alerter.send_monitoring_recovery().await;
60 }
61 was_all_down = all_down;
62 }
63 }))
64 }
65