| 1 |
|
| 2 |
|
| 3 |
|
| 4 |
|
| 5 |
|
| 6 |
|
| 7 |
|
| 8 |
mod pom; |
| 9 |
use pom::*; |
| 10 |
|
| 11 |
use std::sync::Arc; |
| 12 |
|
| 13 |
use axum::extract::State; |
| 14 |
use axum::http::StatusCode; |
| 15 |
use axum::response::IntoResponse; |
| 16 |
use axum::Json; |
| 17 |
use tower_sessions::Session; |
| 18 |
|
| 19 |
use crate::{ |
| 20 |
db, |
| 21 |
error::Result, |
| 22 |
helpers::get_csrf_token, |
| 23 |
templates::*, |
| 24 |
AppState, |
| 25 |
}; |
| 26 |
|
| 27 |
|
| 28 |
/// Renders a process uptime as a compact, human-readable string.
///
/// Only whole minutes and larger units are shown; leftover seconds are
/// dropped. The shape depends on the largest non-zero unit:
/// `"{d}d {h}h {m}m"`, `"{h}h {m}m"`, or `"{m}m"`.
fn format_uptime(d: std::time::Duration) -> String {
    const SECS_PER_MINUTE: u64 = 60;
    const SECS_PER_HOUR: u64 = 3_600;
    const SECS_PER_DAY: u64 = 86_400;

    let elapsed = d.as_secs();
    let breakdown = (
        elapsed / SECS_PER_DAY,
        elapsed % SECS_PER_DAY / SECS_PER_HOUR,
        elapsed % SECS_PER_HOUR / SECS_PER_MINUTE,
    );

    match breakdown {
        // Under an hour: minutes only.
        (0, 0, minutes) => format!("{}m", minutes),
        // Under a day: hours and minutes.
        (0, hours, minutes) => format!("{}h {}m", hours, minutes),
        (days, hours, minutes) => format!("{}d {}h {}m", days, hours, minutes),
    }
}
| 41 |
|
| 42 |
|
| 43 |
/// Coarse, three-level summary of system health.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OverallStatus {
    Operational,
    Degraded,
    Error,
}

impl OverallStatus {
    /// Single lookup table behind the three accessors below.
    ///
    /// Returns `(human label, CSS class, API label)` for the variant.
    fn texts(self) -> (&'static str, &'static str, &'static str) {
        match self {
            OverallStatus::Operational => ("All systems operational", "status-ok", "operational"),
            OverallStatus::Degraded => ("Degraded performance", "status-warn", "degraded"),
            OverallStatus::Error => ("Issues detected", "status-error", "error"),
        }
    }

    /// Sentence shown to people on the status page.
    fn label(self) -> &'static str {
        let (label, _, _) = self.texts();
        label
    }

    /// CSS class used to colour the status in the HTML view.
    fn css_class(self) -> &'static str {
        let (_, css_class, _) = self.texts();
        css_class
    }

    /// Lower-case identifier emitted in the JSON health body.
    fn api_label(self) -> &'static str {
        let (_, _, api_label) = self.texts();
        api_label
    }
}
| 75 |
|
| 76 |
|
| 77 |
#[allow(dead_code)] |
| 78 |
struct HealthData { |
| 79 |
|
| 80 |
overall: OverallStatus, |
| 81 |
uptime: String, |
| 82 |
version: String, |
| 83 |
check_duration_ms: u64, |
| 84 |
|
| 85 |
|
| 86 |
db_ok: bool, |
| 87 |
db_status: &'static str, |
| 88 |
db_status_class: &'static str, |
| 89 |
db_pool_active: u32, |
| 90 |
db_pool_max: u32, |
| 91 |
stats: db::health::DbHealthStats, |
| 92 |
|
| 93 |
|
| 94 |
session_ok: bool, |
| 95 |
session_status: &'static str, |
| 96 |
session_status_class: &'static str, |
| 97 |
|
| 98 |
|
| 99 |
storage_configured: bool, |
| 100 |
s3_reachable: bool, |
| 101 |
storage_status: &'static str, |
| 102 |
storage_status_class: &'static str, |
| 103 |
storage_bucket: String, |
| 104 |
storage_region: String, |
| 105 |
|
| 106 |
|
| 107 |
stripe_configured: bool, |
| 108 |
stripe_status: &'static str, |
| 109 |
stripe_status_class: &'static str, |
| 110 |
stripe_mode: &'static str, |
| 111 |
|
| 112 |
|
| 113 |
#[allow(dead_code)] |
| 114 |
email_configured: bool, |
| 115 |
email_status: &'static str, |
| 116 |
email_status_class: &'static str, |
| 117 |
email_provider: &'static str, |
| 118 |
|
| 119 |
|
| 120 |
synckit_configured: bool, |
| 121 |
synckit_status: &'static str, |
| 122 |
synckit_status_class: &'static str, |
| 123 |
|
| 124 |
|
| 125 |
admin_configured: bool, |
| 126 |
|
| 127 |
|
| 128 |
monitor_enabled: bool, |
| 129 |
monitor_interval_secs: u64, |
| 130 |
alerts_configured: bool, |
| 131 |
uptime_24h: Option<f64>, |
| 132 |
uptime_7d: Option<f64>, |
| 133 |
last_incident: Option<String>, |
| 134 |
recent_snapshots: Vec<db::monitor::DbHealthSnapshot>, |
| 135 |
|
| 136 |
|
| 137 |
environment: &'static str, |
| 138 |
host: Arc<str>, |
| 139 |
started_at: String, |
| 140 |
|
| 141 |
|
| 142 |
public_tests: Vec<HealthTest>, |
| 143 |
db_tests: Vec<HealthTest>, |
| 144 |
|
| 145 |
|
| 146 |
pom_available: bool, |
| 147 |
pom_status: Option<String>, |
| 148 |
pom_status_class: Option<String>, |
| 149 |
pom_response_time_ms: Option<i64>, |
| 150 |
pom_checked_at: Option<String>, |
| 151 |
pom_uptime_24h: Option<f64>, |
| 152 |
pom_uptime_7d: Option<f64>, |
| 153 |
pom_recent: Vec<PomSnapshotJson>, |
| 154 |
pom_incident: Option<PomIncidentJson>, |
| 155 |
pom_recent_incidents: Vec<PomIncidentJson>, |
| 156 |
pom_avg_latency: Option<String>, |
| 157 |
pom_p95_latency: Option<String>, |
| 158 |
pom_routes_total: usize, |
| 159 |
pom_routes_ok: usize, |
| 160 |
pom_routes_failed: Vec<String>, |
| 161 |
|
| 162 |
|
| 163 |
privacy_jobs: Vec<db::scheduler_jobs::SchedulerJobRun>, |
| 164 |
} |
| 165 |
|
| 166 |
|
| 167 |
fn format_privacy_jobs( |
| 168 |
jobs: &[db::scheduler_jobs::SchedulerJobRun], |
| 169 |
now: chrono::DateTime<chrono::Utc>, |
| 170 |
) -> Vec<PrivacyJobDisplay> { |
| 171 |
|
| 172 |
let job_meta: &[(&str, &str, i64)] = &[ |
| 173 |
("ip_scrub", "Session IP scrub (30-day)", 26), |
| 174 |
("session_prune", "Session prune (90-day)", 26), |
| 175 |
("terminated_account_cleanup", "Terminated account cleanup (30-day)", 26), |
| 176 |
("content_removal_cleanup", "Content removal cleanup (90-day)", 26), |
| 177 |
]; |
| 178 |
|
| 179 |
job_meta |
| 180 |
.iter() |
| 181 |
.map(|(key, description, max_hours)| { |
| 182 |
let run = jobs.iter().find(|j| j.job_name == *key); |
| 183 |
match run { |
| 184 |
Some(r) => { |
| 185 |
let age = now.signed_duration_since(r.last_ran_at); |
| 186 |
let last_ran = if age.num_hours() < 1 { |
| 187 |
format!("{}m ago", age.num_minutes().max(0)) |
| 188 |
} else if age.num_hours() < 48 { |
| 189 |
format!("{}h ago", age.num_hours()) |
| 190 |
} else { |
| 191 |
format!("{}d ago", age.num_days()) |
| 192 |
}; |
| 193 |
let status_class = if age.num_hours() <= *max_hours { |
| 194 |
"status-ok" |
| 195 |
} else { |
| 196 |
"status-warn" |
| 197 |
}; |
| 198 |
PrivacyJobDisplay { |
| 199 |
name: key.to_string(), |
| 200 |
description: description.to_string(), |
| 201 |
last_ran, |
| 202 |
rows_affected: r.rows_affected.to_string(), |
| 203 |
status_class: status_class.to_string(), |
| 204 |
} |
| 205 |
} |
| 206 |
None => PrivacyJobDisplay { |
| 207 |
name: key.to_string(), |
| 208 |
description: description.to_string(), |
| 209 |
last_ran: "never".to_string(), |
| 210 |
rows_affected: "-".to_string(), |
| 211 |
status_class: "status-unknown".to_string(), |
| 212 |
}, |
| 213 |
} |
| 214 |
}) |
| 215 |
.collect() |
| 216 |
} |
| 217 |
|
| 218 |
|
| 219 |
|
| 220 |
async fn collect_health(state: &AppState) -> HealthData { |
| 221 |
use std::time::Instant; |
| 222 |
|
| 223 |
let check_start = Instant::now(); |
| 224 |
|
| 225 |
|
| 226 |
async fn run_test<F, Fut>(name: &str, f: F) -> HealthTest |
| 227 |
where |
| 228 |
F: FnOnce() -> Fut, |
| 229 |
Fut: std::future::Future<Output = bool>, |
| 230 |
{ |
| 231 |
let start = Instant::now(); |
| 232 |
let passed = f().await; |
| 233 |
HealthTest { |
| 234 |
name: name.to_string(), |
| 235 |
passed, |
| 236 |
latency_ms: start.elapsed().as_millis() as u64, |
| 237 |
} |
| 238 |
} |
| 239 |
|
| 240 |
|
| 241 |
let db_test_users = run_test("Count users", || async { |
| 242 |
sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM users") |
| 243 |
.fetch_one(&state.db) |
| 244 |
.await |
| 245 |
.is_ok() |
| 246 |
}).await; |
| 247 |
|
| 248 |
let db_test_projects = run_test("Count projects", || async { |
| 249 |
sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM projects") |
| 250 |
.fetch_one(&state.db) |
| 251 |
.await |
| 252 |
.is_ok() |
| 253 |
}).await; |
| 254 |
|
| 255 |
let db_test_items = run_test("Count items", || async { |
| 256 |
sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM items") |
| 257 |
.fetch_one(&state.db) |
| 258 |
.await |
| 259 |
.is_ok() |
| 260 |
}).await; |
| 261 |
|
| 262 |
let db_test_transactions = run_test("Count transactions", || async { |
| 263 |
sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM transactions") |
| 264 |
.fetch_one(&state.db) |
| 265 |
.await |
| 266 |
.is_ok() |
| 267 |
}).await; |
| 268 |
|
| 269 |
|
| 270 |
let stats = db::health::get_health_stats(&state.db).await.unwrap_or(db::health::DbHealthStats { |
| 271 |
user_count: 0, |
| 272 |
project_count: 0, |
| 273 |
item_count: 0, |
| 274 |
active_session_count: 0, |
| 275 |
active_creator_count: 0, |
| 276 |
transaction_count: 0, |
| 277 |
blog_post_count: 0, |
| 278 |
sync_app_count: 0, |
| 279 |
sync_device_count: 0, |
| 280 |
sync_log_entries: 0, |
| 281 |
}); |
| 282 |
|
| 283 |
|
| 284 |
let db_ok = db_test_users.passed && db_test_projects.passed; |
| 285 |
let db_status = if db_ok { "Connected" } else { "Error" }; |
| 286 |
let db_status_class = if db_ok { "status-ok" } else { "status-error" }; |
| 287 |
|
| 288 |
|
| 289 |
let pool_max = state.db.size(); |
| 290 |
let pool_idle = state.db.num_idle(); |
| 291 |
let pool_active = pool_max.saturating_sub(pool_idle as u32); |
| 292 |
|
| 293 |
|
| 294 |
let storage_configured = state.s3.is_some(); |
| 295 |
let s3_reachable = if let Some(ref s3) = state.s3 { |
| 296 |
s3.check_connectivity().await.is_ok() |
| 297 |
} else { |
| 298 |
false |
| 299 |
}; |
| 300 |
let (storage_status, storage_status_class) = if storage_configured && s3_reachable { |
| 301 |
("Connected", "status-ok") |
| 302 |
} else if storage_configured { |
| 303 |
("Configured (unreachable)", "status-warn") |
| 304 |
} else { |
| 305 |
("Not configured", "status-warn") |
| 306 |
}; |
| 307 |
let (storage_bucket, storage_region) = if let Some(ref storage) = state.config.storage { |
| 308 |
(storage.bucket.clone(), storage.region.clone()) |
| 309 |
} else { |
| 310 |
(String::new(), String::new()) |
| 311 |
}; |
| 312 |
|
| 313 |
|
| 314 |
let stripe_configured = state.stripe.is_some(); |
| 315 |
let stripe_status = if stripe_configured { "Configured" } else { "Not configured" }; |
| 316 |
let stripe_status_class = if stripe_configured { "status-ok" } else { "status-warn" }; |
| 317 |
let stripe_mode = if stripe_configured { |
| 318 |
if state.config.stripe.as_ref().map(|s| s.secret_key.starts_with("sk_live")).unwrap_or(false) { |
| 319 |
"Live" |
| 320 |
} else { |
| 321 |
"Test" |
| 322 |
} |
| 323 |
} else { |
| 324 |
"-" |
| 325 |
}; |
| 326 |
|
| 327 |
|
| 328 |
let email_configured = std::env::var("POSTMARK_TOKEN").is_ok(); |
| 329 |
let email_status = if email_configured { "Configured" } else { "Dev mode (logging)" }; |
| 330 |
let email_status_class = if email_configured { "status-ok" } else { "status-warn" }; |
| 331 |
let email_provider = if email_configured { "Postmark" } else { "Console" }; |
| 332 |
|
| 333 |
|
| 334 |
let session_ok = stats.active_session_count > 0; |
| 335 |
let session_status = if session_ok { "Active" } else { "Error" }; |
| 336 |
let session_status_class = if session_ok { "status-ok" } else { "status-error" }; |
| 337 |
|
| 338 |
|
| 339 |
let synckit_configured = state.config.synckit_jwt_secret.is_some(); |
| 340 |
let synckit_status = if synckit_configured { "Configured" } else { "Not configured" }; |
| 341 |
let synckit_status_class = if synckit_configured { "status-ok" } else { "status-warn" }; |
| 342 |
|
| 343 |
|
| 344 |
let admin_configured = state.config.admin_user_id.is_some(); |
| 345 |
|
| 346 |
|
| 347 |
let overall = if !db_ok || !session_ok { |
| 348 |
OverallStatus::Error |
| 349 |
} else if storage_configured && !s3_reachable { |
| 350 |
OverallStatus::Degraded |
| 351 |
} else { |
| 352 |
OverallStatus::Operational |
| 353 |
}; |
| 354 |
|
| 355 |
|
| 356 |
let environment = if cfg!(debug_assertions) { "Development" } else { "Production" }; |
| 357 |
let host = state.config.host_url.clone(); |
| 358 |
|
| 359 |
|
| 360 |
let uptime = format_uptime(state.start_instant.elapsed()); |
| 361 |
let started_at = state.started_at.format("%Y-%m-%d %H:%M:%S UTC").to_string(); |
| 362 |
|
| 363 |
|
| 364 |
let version = match option_env!("GIT_HASH") { |
| 365 |
Some(hash) if !hash.is_empty() => format!("{} ({})", env!("CARGO_PKG_VERSION"), hash), |
| 366 |
_ => env!("CARGO_PKG_VERSION").to_string(), |
| 367 |
}; |
| 368 |
|
| 369 |
|
| 370 |
let public_tests: Vec<HealthTest> = vec![]; |
| 371 |
|
| 372 |
let db_tests = vec![ |
| 373 |
db_test_users, |
| 374 |
db_test_projects, |
| 375 |
db_test_items, |
| 376 |
db_test_transactions, |
| 377 |
]; |
| 378 |
|
| 379 |
let check_duration_ms = check_start.elapsed().as_millis() as u64; |
| 380 |
|
| 381 |
|
| 382 |
let pom = fetch_pom_status().await; |
| 383 |
let (pom_available, pom_status, pom_status_class, pom_response_time_ms, pom_checked_at, pom_uptime_24h, pom_uptime_7d, pom_recent, pom_incident, pom_recent_incidents, pom_avg_latency, pom_p95_latency, pom_routes_total, pom_routes_ok, pom_routes_failed) = |
| 384 |
if let Some(ref pom) = pom { |
| 385 |
let latest = pom.latest.as_ref(); |
| 386 |
let status = latest.map(|s| s.status.clone()); |
| 387 |
let status_class = status.as_deref().map(|s| match s { |
| 388 |
"operational" => "status-ok".to_string(), |
| 389 |
"degraded" => "status-warn".to_string(), |
| 390 |
_ => "status-error".to_string(), |
| 391 |
}); |
| 392 |
let avg_latency = pom.latency_24h.as_ref().map(|l| format!("{:.0}ms", l.avg_ms)); |
| 393 |
let p95_latency = pom.latency_24h.as_ref().map(|l| format!("{}ms", l.p95_ms)); |
| 394 |
let routes_total = pom.route_status.len(); |
| 395 |
let routes_ok = pom.route_status.iter().filter(|r| r.ok).count(); |
| 396 |
let routes_failed: Vec<String> = pom.route_status.iter().filter(|r| !r.ok).map(|r| r.path.clone()).collect(); |
| 397 |
( |
| 398 |
true, |
| 399 |
status, |
| 400 |
status_class, |
| 401 |
latest.map(|s| s.response_time_ms), |
| 402 |
latest.map(|s| format_pom_timestamp(&s.checked_at)), |
| 403 |
pom.uptime_24h, |
| 404 |
pom.uptime_7d, |
| 405 |
pom.recent.clone(), |
| 406 |
pom.current_incident.clone(), |
| 407 |
pom.incidents.iter().filter(|i| i.ended_at.is_some()).cloned().collect(), |
| 408 |
avg_latency, |
| 409 |
p95_latency, |
| 410 |
routes_total, |
| 411 |
routes_ok, |
| 412 |
routes_failed, |
| 413 |
) |
| 414 |
} else { |
| 415 |
(false, None, None, None, None, None, None, Vec::new(), None, Vec::new(), None, None, 0, 0, Vec::new()) |
| 416 |
}; |
| 417 |
|
| 418 |
|
| 419 |
let monitor_interval_secs = std::env::var("HEALTH_CHECK_INTERVAL_SECS") |
| 420 |
.ok() |
| 421 |
.and_then(|v| v.parse::<u64>().ok()) |
| 422 |
.unwrap_or(crate::constants::HEALTH_CHECK_INTERVAL_SECS); |
| 423 |
|
| 424 |
let alerts_configured = std::env::var("ALERT_EMAIL").is_ok(); |
| 425 |
|
| 426 |
let uptime_24h = db::monitor::get_health_uptime_percent(&state.db, 24) |
| 427 |
.await |
| 428 |
.unwrap_or(None); |
| 429 |
let uptime_7d = db::monitor::get_health_uptime_percent(&state.db, 168) |
| 430 |
.await |
| 431 |
.unwrap_or(None); |
| 432 |
let last_incident = db::monitor::get_last_incident(&state.db) |
| 433 |
.await |
| 434 |
.unwrap_or(None) |
| 435 |
.map(|dt| dt.format("%Y-%m-%d %H:%M UTC").to_string()); |
| 436 |
let recent_snapshots = db::monitor::get_recent_health_history(&state.db, 10) |
| 437 |
.await |
| 438 |
.unwrap_or_default(); |
| 439 |
|
| 440 |
HealthData { |
| 441 |
overall, |
| 442 |
uptime, |
| 443 |
version, |
| 444 |
check_duration_ms, |
| 445 |
db_ok, |
| 446 |
db_status, |
| 447 |
db_status_class, |
| 448 |
db_pool_active: pool_active, |
| 449 |
db_pool_max: pool_max, |
| 450 |
stats, |
| 451 |
session_ok, |
| 452 |
session_status, |
| 453 |
session_status_class, |
| 454 |
storage_configured, |
| 455 |
s3_reachable, |
| 456 |
storage_status, |
| 457 |
storage_status_class, |
| 458 |
storage_bucket, |
| 459 |
storage_region, |
| 460 |
stripe_configured, |
| 461 |
stripe_status, |
| 462 |
stripe_status_class, |
| 463 |
stripe_mode, |
| 464 |
email_configured, |
| 465 |
email_status, |
| 466 |
email_status_class, |
| 467 |
email_provider, |
| 468 |
synckit_configured, |
| 469 |
synckit_status, |
| 470 |
synckit_status_class, |
| 471 |
admin_configured, |
| 472 |
monitor_enabled: true, |
| 473 |
monitor_interval_secs, |
| 474 |
alerts_configured, |
| 475 |
uptime_24h, |
| 476 |
uptime_7d, |
| 477 |
last_incident, |
| 478 |
recent_snapshots, |
| 479 |
environment, |
| 480 |
host, |
| 481 |
started_at, |
| 482 |
public_tests, |
| 483 |
db_tests, |
| 484 |
pom_available, |
| 485 |
pom_status, |
| 486 |
pom_status_class, |
| 487 |
pom_response_time_ms, |
| 488 |
pom_checked_at, |
| 489 |
pom_uptime_24h, |
| 490 |
pom_uptime_7d, |
| 491 |
pom_recent, |
| 492 |
pom_incident, |
| 493 |
pom_recent_incidents, |
| 494 |
pom_avg_latency, |
| 495 |
pom_p95_latency, |
| 496 |
pom_routes_total, |
| 497 |
pom_routes_ok, |
| 498 |
pom_routes_failed, |
| 499 |
privacy_jobs: db::scheduler_jobs::get_job_runs(&state.db).await.unwrap_or_default(), |
| 500 |
} |
| 501 |
} |
| 502 |
|
| 503 |
|
| 504 |
#[tracing::instrument(skip_all, name = "health::health")] |
| 505 |
pub(super) async fn health( |
| 506 |
State(state): State<AppState>, |
| 507 |
session: Session, |
| 508 |
) -> Result<impl IntoResponse> { |
| 509 |
let data = collect_health(&state).await; |
| 510 |
let now = chrono::Utc::now(); |
| 511 |
|
| 512 |
let pool_utilization = if data.db_pool_max > 0 { |
| 513 |
format!("{}%", (data.db_pool_active as f64 / data.db_pool_max as f64 * 100.0) as u32) |
| 514 |
} else { |
| 515 |
"0%".to_string() |
| 516 |
}; |
| 517 |
|
| 518 |
Ok(HealthTemplate { |
| 519 |
csrf_token: get_csrf_token(&session).await, |
| 520 |
session_user: None, |
| 521 |
overall_status: data.overall.label().to_string(), |
| 522 |
overall_status_class: data.overall.css_class().to_string(), |
| 523 |
uptime: data.uptime, |
| 524 |
version: data.version, |
| 525 |
check_duration_ms: data.check_duration_ms, |
| 526 |
db_status: data.db_status.to_string(), |
| 527 |
db_status_class: data.db_status_class.to_string(), |
| 528 |
db_pool_size: data.db_pool_max.to_string(), |
| 529 |
db_pool_max: data.db_pool_max.to_string(), |
| 530 |
db_pool_utilization: pool_utilization, |
| 531 |
db_active_connections: data.db_pool_active.to_string(), |
| 532 |
user_count: data.stats.user_count.to_string(), |
| 533 |
project_count: data.stats.project_count.to_string(), |
| 534 |
item_count: data.stats.item_count.to_string(), |
| 535 |
transaction_count: data.stats.transaction_count.to_string(), |
| 536 |
blog_post_count: data.stats.blog_post_count.to_string(), |
| 537 |
session_status: data.session_status.to_string(), |
| 538 |
session_status_class: data.session_status_class.to_string(), |
| 539 |
active_sessions: data.stats.active_session_count.to_string(), |
| 540 |
storage_status: data.storage_status.to_string(), |
| 541 |
storage_status_class: data.storage_status_class.to_string(), |
| 542 |
storage_configured: data.storage_configured, |
| 543 |
storage_bucket: data.storage_bucket, |
| 544 |
storage_region: data.storage_region, |
| 545 |
stripe_status: data.stripe_status.to_string(), |
| 546 |
stripe_status_class: data.stripe_status_class.to_string(), |
| 547 |
stripe_configured: data.stripe_configured, |
| 548 |
stripe_mode: data.stripe_mode.to_string(), |
| 549 |
connected_creators: data.stats.active_creator_count.to_string(), |
| 550 |
email_status: data.email_status.to_string(), |
| 551 |
email_status_class: data.email_status_class.to_string(), |
| 552 |
email_provider: data.email_provider.to_string(), |
| 553 |
synckit_status: data.synckit_status.to_string(), |
| 554 |
synckit_status_class: data.synckit_status_class.to_string(), |
| 555 |
synckit_configured: data.synckit_configured, |
| 556 |
synckit_app_count: data.stats.sync_app_count.to_string(), |
| 557 |
synckit_device_count: data.stats.sync_device_count.to_string(), |
| 558 |
synckit_log_entries: data.stats.sync_log_entries.to_string(), |
| 559 |
admin_status: if data.admin_configured { "Configured".to_string() } else { "Not configured".to_string() }, |
| 560 |
monitor_enabled: data.monitor_enabled, |
| 561 |
monitor_interval_secs: data.monitor_interval_secs, |
| 562 |
alerts_configured: data.alerts_configured, |
| 563 |
uptime_24h: data.uptime_24h.map(|v| format!("{:.1}", v)), |
| 564 |
uptime_7d: data.uptime_7d.map(|v| format!("{:.1}", v)), |
| 565 |
last_incident: data.last_incident, |
| 566 |
recent_snapshots: data.recent_snapshots.into_iter().map(|s| { |
| 567 |
let status_class = match s.status.as_str() { |
| 568 |
"operational" => "status-ok".to_string(), |
| 569 |
"degraded" => "status-warn".to_string(), |
| 570 |
_ => "status-error".to_string(), |
| 571 |
}; |
| 572 |
HealthSnapshotDisplay { |
| 573 |
checked_at: s.checked_at.format("%H:%M:%S UTC").to_string(), |
| 574 |
status: s.status, |
| 575 |
status_class, |
| 576 |
duration_ms: s.check_duration_ms, |
| 577 |
} |
| 578 |
}).collect(), |
| 579 |
environment: data.environment.to_string(), |
| 580 |
host: data.host, |
| 581 |
started_at: data.started_at, |
| 582 |
public_tests: data.public_tests, |
| 583 |
db_tests: data.db_tests, |
| 584 |
generated_at: now.format("%Y-%m-%d %H:%M:%S UTC").to_string(), |
| 585 |
pom_available: data.pom_available, |
| 586 |
pom_status: data.pom_status, |
| 587 |
pom_status_class: data.pom_status_class, |
| 588 |
pom_response_time_ms: data.pom_response_time_ms, |
| 589 |
pom_checked_at: data.pom_checked_at, |
| 590 |
pom_uptime_24h: data.pom_uptime_24h.map(|v| format!("{:.1}", v)), |
| 591 |
pom_uptime_7d: data.pom_uptime_7d.map(|v| format!("{:.1}", v)), |
| 592 |
pom_recent: data.pom_recent.into_iter().map(|s| { |
| 593 |
let status_class = match s.status.as_str() { |
| 594 |
"operational" => "status-ok".to_string(), |
| 595 |
"degraded" => "status-warn".to_string(), |
| 596 |
_ => "status-error".to_string(), |
| 597 |
}; |
| 598 |
PomSnapshotDisplay { |
| 599 |
checked_at: format_pom_timestamp(&s.checked_at), |
| 600 |
status: s.status, |
| 601 |
status_class, |
| 602 |
response_time_ms: s.response_time_ms, |
| 603 |
} |
| 604 |
}).collect(), |
| 605 |
pom_avg_latency: data.pom_avg_latency, |
| 606 |
pom_p95_latency: data.pom_p95_latency, |
| 607 |
pom_incident_active: data.pom_incident.is_some(), |
| 608 |
pom_incident_status: data.pom_incident.as_ref().map(|i| i.to_status.clone()), |
| 609 |
pom_incident_since: data.pom_incident.as_ref().map(|i| format_pom_timestamp(&i.started_at)), |
| 610 |
pom_recent_incidents: data.pom_recent_incidents.into_iter().map(|i| PomIncidentDisplay { |
| 611 |
started_at: format_pom_timestamp(&i.started_at), |
| 612 |
duration: i.duration_secs.map(format_incident_duration).unwrap_or_else(|| "-".to_string()), |
| 613 |
to_status: i.to_status, |
| 614 |
}).collect(), |
| 615 |
pom_routes_total: data.pom_routes_total, |
| 616 |
pom_routes_ok: data.pom_routes_ok, |
| 617 |
pom_routes_failed: data.pom_routes_failed, |
| 618 |
privacy_jobs: format_privacy_jobs(&data.privacy_jobs, now), |
| 619 |
}) |
| 620 |
} |
| 621 |
|
| 622 |
|
| 623 |
|
| 624 |
|
| 625 |
|
| 626 |
|
| 627 |
|
| 628 |
|
| 629 |
|
| 630 |
|
| 631 |
|
| 632 |
|
| 633 |
|
| 634 |
#[tracing::instrument(skip_all, name = "health::health_json")] |
| 635 |
pub(super) async fn health_json( |
| 636 |
State(state): State<AppState>, |
| 637 |
) -> impl IntoResponse { |
| 638 |
|
| 639 |
let latest = db::monitor::get_recent_health_history(&state.db, 1) |
| 640 |
.await |
| 641 |
.unwrap_or_default(); |
| 642 |
|
| 643 |
|
| 644 |
|
| 645 |
|
| 646 |
let (overall, db_ok) = if let Some(snap) = latest.first() { |
| 647 |
let status = match snap.status.as_str() { |
| 648 |
"operational" => OverallStatus::Operational, |
| 649 |
"degraded" => OverallStatus::Degraded, |
| 650 |
_ => OverallStatus::Error, |
| 651 |
}; |
| 652 |
let db_healthy = status != OverallStatus::Error; |
| 653 |
(status, db_healthy) |
| 654 |
} else { |
| 655 |
|
| 656 |
let db_ok = sqlx::query_scalar::<_, i32>("SELECT 1") |
| 657 |
.fetch_one(&state.db) |
| 658 |
.await |
| 659 |
.is_ok(); |
| 660 |
let status = if db_ok { OverallStatus::Operational } else { OverallStatus::Error }; |
| 661 |
(status, db_ok) |
| 662 |
}; |
| 663 |
|
| 664 |
let http_status = if overall == OverallStatus::Error { |
| 665 |
StatusCode::SERVICE_UNAVAILABLE |
| 666 |
} else { |
| 667 |
StatusCode::OK |
| 668 |
}; |
| 669 |
|
| 670 |
(http_status, Json(health_json_body(overall, db_ok))) |
| 671 |
} |
| 672 |
|
| 673 |
|
| 674 |
|
| 675 |
|
| 676 |
|
| 677 |
|
| 678 |
|
| 679 |
fn health_json_body(overall: OverallStatus, db_ok: bool) -> serde_json::Value { |
| 680 |
serde_json::json!({ |
| 681 |
"status": overall.api_label(), |
| 682 |
"version": env!("CARGO_PKG_VERSION"), |
| 683 |
"checks": { |
| 684 |
"database": db_ok, |
| 685 |
}, |
| 686 |
}) |
| 687 |
} |
| 688 |
|
| 689 |
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `format_uptime` against each `(seconds, expected text)` pair.
    fn assert_uptimes(cases: &[(u64, &str)]) {
        for &(secs, expected) in cases {
            let rendered = format_uptime(std::time::Duration::from_secs(secs));
            assert_eq!(rendered, expected, "uptime for {}s", secs);
        }
    }

    /// The three variants, in declaration order, for table-driven checks.
    const ALL_STATUSES: [OverallStatus; 3] = [
        OverallStatus::Operational,
        OverallStatus::Degraded,
        OverallStatus::Error,
    ];

    #[test]
    fn format_uptime_minutes_only() {
        assert_uptimes(&[(0, "0m"), (59, "0m"), (60, "1m"), (300, "5m")]);
    }

    #[test]
    fn format_uptime_hours_and_minutes() {
        assert_uptimes(&[(3600, "1h 0m"), (3660, "1h 1m"), (7200, "2h 0m")]);
    }

    #[test]
    fn format_uptime_days() {
        assert_uptimes(&[
            (86400, "1d 0h 0m"),
            (90061, "1d 1h 1m"),
            (259200, "3d 0h 0m"),
        ]);
    }

    #[test]
    fn overall_status_labels() {
        let expected = [
            "All systems operational",
            "Degraded performance",
            "Issues detected",
        ];
        for (status, want) in ALL_STATUSES.iter().zip(expected.iter()) {
            assert_eq!(status.label(), *want);
        }
    }

    #[test]
    fn overall_status_css_classes() {
        let expected = ["status-ok", "status-warn", "status-error"];
        for (status, want) in ALL_STATUSES.iter().zip(expected.iter()) {
            assert_eq!(status.css_class(), *want);
        }
    }

    #[test]
    fn overall_status_api_labels() {
        let expected = ["operational", "degraded", "error"];
        for (status, want) in ALL_STATUSES.iter().zip(expected.iter()) {
            assert_eq!(status.api_label(), *want);
        }
    }

    /// Passes the JSON health body to the pom contract helper together with
    /// the deployment config path and the `"mnw"` key.
    #[test]
    fn pom_hetzner_health_expectations_resolve() {
        let body = health_json_body(OverallStatus::Operational, true);
        pom_contract::assert_health_expectations_resolve(
            "../pom/deploy/pom-hetzner.toml",
            "mnw",
            &body,
        );
    }
}
| 747 |
|