//! System health dashboard and JSON monitoring endpoint. //! //! Two layers: //! - `GET /health` (HTML); runs full live checks (DB queries, S3 probe, endpoint self-tests) //! - `GET /api/health` (JSON); reads cached results from the background monitor's database. //! Fast (<10ms), no live probes. This is what PoM and other external services should poll. mod pom; use pom::*; use std::sync::Arc; use axum::extract::State; use axum::http::StatusCode; use axum::response::IntoResponse; use axum::Json; use tower_sessions::Session; use crate::{ db, error::Result, helpers::get_csrf_token, templates::*, AppState, }; /// Format a [`std::time::Duration`] as a human-readable uptime string. fn format_uptime(d: std::time::Duration) -> String { let total_secs = d.as_secs(); let days = total_secs / 86400; let hours = (total_secs % 86400) / 3600; let minutes = (total_secs % 3600) / 60; if days > 0 { format!("{}d {}h {}m", days, hours, minutes) } else if hours > 0 { format!("{}h {}m", hours, minutes) } else { format!("{}m", minutes) } } /// Tri-state overall status derived from service checks. #[derive(Debug, Clone, Copy, PartialEq, Eq)] enum OverallStatus { Operational, Degraded, Error, } impl OverallStatus { fn label(self) -> &'static str { match self { Self::Operational => "All systems operational", Self::Degraded => "Degraded performance", Self::Error => "Issues detected", } } fn css_class(self) -> &'static str { match self { Self::Operational => "status-ok", Self::Degraded => "status-warn", Self::Error => "status-error", } } fn api_label(self) -> &'static str { match self { Self::Operational => "operational", Self::Degraded => "degraded", Self::Error => "error", } } } /// All computed health data, used by the HTML dashboard handler. 
// Plain data carrier: `collect_health` fills every field and the `health`
// handler moves them into `HealthTemplate`.
//
// NOTE(review): several field types below read as bare `Option`, `Vec` and
// `Arc` with no generic arguments. Presumably the arguments were lost from
// this listing — confirm against the original file before editing.
#[allow(dead_code)] // fields read transitively via status strings
struct HealthData {
    // Overall
    overall: OverallStatus,
    uptime: String,
    version: String,
    // Measured in `collect_health` before the PoM fetch and the monitor
    // history queries, so it does not include their latency.
    check_duration_ms: u64,

    // Database
    db_ok: bool,
    db_status: &'static str,
    db_status_class: &'static str,
    // `db_pool_max` is filled from `state.db.size()`; `db_pool_active` is
    // that value minus `state.db.num_idle()`.
    db_pool_active: u32,
    db_pool_max: u32,
    stats: db::health::DbHealthStats,

    // Sessions
    // True when `stats.active_session_count > 0`.
    session_ok: bool,
    session_status: &'static str,
    session_status_class: &'static str,

    // Storage
    storage_configured: bool,
    s3_reachable: bool,
    storage_status: &'static str,
    storage_status_class: &'static str,
    storage_bucket: String,
    storage_region: String,

    // Stripe
    stripe_configured: bool,
    stripe_status: &'static str,
    stripe_status_class: &'static str,
    // "Live", "Test", or "-" when Stripe is not configured.
    stripe_mode: &'static str,

    // Email
    #[allow(dead_code)] // used only by the HTML template via other fields
    email_configured: bool,
    email_status: &'static str,
    email_status_class: &'static str,
    email_provider: &'static str,

    // SyncKit
    synckit_configured: bool,
    synckit_status: &'static str,
    synckit_status_class: &'static str,

    // Security & Monitoring
    admin_configured: bool,

    // Background monitor
    // `collect_health` always sets `monitor_enabled` to `true`.
    monitor_enabled: bool,
    monitor_interval_secs: u64,
    alerts_configured: bool,
    uptime_24h: Option,
    uptime_7d: Option,
    // Already formatted as "%Y-%m-%d %H:%M UTC" by `collect_health`.
    last_incident: Option,
    recent_snapshots: Vec,

    // Server
    environment: &'static str,
    host: Arc,
    started_at: String,

    // Tests
    // `public_tests` is currently always empty; `db_tests` holds the four
    // timed COUNT(*) probes.
    public_tests: Vec,
    db_tests: Vec,

    // External monitoring (PoM)
    // `pom_available` is false when `fetch_pom_status` returned `None`; the
    // remaining `pom_*` fields are then `None`, empty or zero.
    pom_available: bool,
    pom_status: Option,
    pom_status_class: Option,
    pom_response_time_ms: Option,
    pom_checked_at: Option,
    pom_uptime_24h: Option,
    pom_uptime_7d: Option,
    pom_recent: Vec,
    pom_incident: Option,
    // Only incidents whose `ended_at` is set.
    pom_recent_incidents: Vec,
    pom_avg_latency: Option,
    pom_p95_latency: Option,
    pom_routes_total: usize,
    pom_routes_ok: usize,
    pom_routes_failed: Vec,

    // Privacy & Compliance
    // Raw scheduler job runs; formatted for display by `format_privacy_jobs`.
    privacy_jobs: Vec,
}

/// Format scheduler job runs into display structs for the health template.
fn format_privacy_jobs( jobs: &[db::scheduler_jobs::SchedulerJobRun], now: chrono::DateTime, ) -> Vec { // Jobs we want to display, with human names and expected frequency in hours let job_meta: &[(&str, &str, i64)] = &[ ("ip_scrub", "Session IP scrub (30-day)", 26), ("session_prune", "Session prune (90-day)", 26), ("terminated_account_cleanup", "Terminated account cleanup (30-day)", 26), ("content_removal_cleanup", "Content removal cleanup (90-day)", 26), ]; job_meta .iter() .map(|(key, description, max_hours)| { let run = jobs.iter().find(|j| j.job_name == *key); match run { Some(r) => { let age = now.signed_duration_since(r.last_ran_at); let last_ran = if age.num_hours() < 1 { format!("{}m ago", age.num_minutes().max(0)) } else if age.num_hours() < 48 { format!("{}h ago", age.num_hours()) } else { format!("{}d ago", age.num_days()) }; let status_class = if age.num_hours() <= *max_hours { "status-ok" } else { "status-warn" }; PrivacyJobDisplay { name: key.to_string(), description: description.to_string(), last_ran, rows_affected: r.rows_affected.to_string(), status_class: status_class.to_string(), } } None => PrivacyJobDisplay { name: key.to_string(), description: description.to_string(), last_ran: "never".to_string(), rows_affected: "-".to_string(), status_class: "status-unknown".to_string(), }, } }) .collect() } /// Run all health checks and return computed data. /// Used by the HTML dashboard; runs live probes (DB, S3, HTTP self-tests). 
async fn collect_health(state: &AppState) -> HealthData { use std::time::Instant; let check_start = Instant::now(); // Helper to run a timed test async fn run_test(name: &str, f: F) -> HealthTest where F: FnOnce() -> Fut, Fut: std::future::Future, { let start = Instant::now(); let passed = f().await; HealthTest { name: name.to_string(), passed, latency_ms: start.elapsed().as_millis() as u64, } } // Database health checks let db_test_users = run_test("Count users", || async { sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM users") .fetch_one(&state.db) .await .is_ok() }).await; let db_test_projects = run_test("Count projects", || async { sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM projects") .fetch_one(&state.db) .await .is_ok() }).await; let db_test_items = run_test("Count items", || async { sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM items") .fetch_one(&state.db) .await .is_ok() }).await; let db_test_transactions = run_test("Count transactions", || async { sqlx::query_scalar::<_, i64>("SELECT COUNT(*) FROM transactions") .fetch_one(&state.db) .await .is_ok() }).await; // Get actual counts let stats = db::health::get_health_stats(&state.db).await.unwrap_or(db::health::DbHealthStats { user_count: 0, project_count: 0, item_count: 0, active_session_count: 0, active_creator_count: 0, transaction_count: 0, blog_post_count: 0, sync_app_count: 0, sync_device_count: 0, sync_log_entries: 0, }); // Database status let db_ok = db_test_users.passed && db_test_projects.passed; let db_status = if db_ok { "Connected" } else { "Error" }; let db_status_class = if db_ok { "status-ok" } else { "status-error" }; // Pool info let pool_max = state.db.size(); let pool_idle = state.db.num_idle(); let pool_active = pool_max.saturating_sub(pool_idle as u32); // S3 storage status with connectivity check let storage_configured = state.s3.is_some(); let s3_reachable = if let Some(ref s3) = state.s3 { s3.check_connectivity().await.is_ok() } else { false }; let (storage_status, 
storage_status_class) = if storage_configured && s3_reachable { ("Connected", "status-ok") } else if storage_configured { ("Configured (unreachable)", "status-warn") } else { ("Not configured", "status-warn") }; let (storage_bucket, storage_region) = if let Some(ref storage) = state.config.storage { (storage.bucket.clone(), storage.region.clone()) } else { (String::new(), String::new()) }; // Stripe status let stripe_configured = state.stripe.is_some(); let stripe_status = if stripe_configured { "Configured" } else { "Not configured" }; let stripe_status_class = if stripe_configured { "status-ok" } else { "status-warn" }; let stripe_mode = if stripe_configured { if state.config.stripe.as_ref().map(|s| s.secret_key.starts_with("sk_live")).unwrap_or(false) { "Live" } else { "Test" } } else { "-" }; // Email status let email_configured = std::env::var("POSTMARK_TOKEN").is_ok(); let email_status = if email_configured { "Configured" } else { "Dev mode (logging)" }; let email_status_class = if email_configured { "status-ok" } else { "status-warn" }; let email_provider = if email_configured { "Postmark" } else { "Console" }; // Session status let session_ok = stats.active_session_count > 0; let session_status = if session_ok { "Active" } else { "Error" }; let session_status_class = if session_ok { "status-ok" } else { "status-error" }; // SyncKit status let synckit_configured = state.config.synckit_jwt_secret.is_some(); let synckit_status = if synckit_configured { "Configured" } else { "Not configured" }; let synckit_status_class = if synckit_configured { "status-ok" } else { "status-warn" }; // Security & Monitoring let admin_configured = state.config.admin_user_id.is_some(); // Overall tri-state status let overall = if !db_ok || !session_ok { OverallStatus::Error } else if storage_configured && !s3_reachable { OverallStatus::Degraded } else { OverallStatus::Operational }; // Environment let environment = if cfg!(debug_assertions) { "Development" } else { "Production" }; 
let host = state.config.host_url.clone(); // Real uptime from AppState let uptime = format_uptime(state.start_instant.elapsed()); let started_at = state.started_at.format("%Y-%m-%d %H:%M:%S UTC").to_string(); // Version with git hash let version = match option_env!("GIT_HASH") { Some(hash) if !hash.is_empty() => format!("{} ({})", env!("CARGO_PKG_VERSION"), hash), _ => env!("CARGO_PKG_VERSION").to_string(), }; // Internal route-existence checks (no HTTP self-call — avoids pool contention) let public_tests: Vec = vec![]; let db_tests = vec![ db_test_users, db_test_projects, db_test_items, db_test_transactions, ]; let check_duration_ms = check_start.elapsed().as_millis() as u64; // External monitoring (PoM) — best-effort, don't delay the page let pom = fetch_pom_status().await; let (pom_available, pom_status, pom_status_class, pom_response_time_ms, pom_checked_at, pom_uptime_24h, pom_uptime_7d, pom_recent, pom_incident, pom_recent_incidents, pom_avg_latency, pom_p95_latency, pom_routes_total, pom_routes_ok, pom_routes_failed) = if let Some(ref pom) = pom { let latest = pom.latest.as_ref(); let status = latest.map(|s| s.status.clone()); let status_class = status.as_deref().map(|s| match s { "operational" => "status-ok".to_string(), "degraded" => "status-warn".to_string(), _ => "status-error".to_string(), }); let avg_latency = pom.latency_24h.as_ref().map(|l| format!("{:.0}ms", l.avg_ms)); let p95_latency = pom.latency_24h.as_ref().map(|l| format!("{}ms", l.p95_ms)); let routes_total = pom.route_status.len(); let routes_ok = pom.route_status.iter().filter(|r| r.ok).count(); let routes_failed: Vec = pom.route_status.iter().filter(|r| !r.ok).map(|r| r.path.clone()).collect(); ( true, status, status_class, latest.map(|s| s.response_time_ms), latest.map(|s| format_pom_timestamp(&s.checked_at)), pom.uptime_24h, pom.uptime_7d, pom.recent.clone(), pom.current_incident.clone(), pom.incidents.iter().filter(|i| i.ended_at.is_some()).cloned().collect(), avg_latency, p95_latency, 
routes_total, routes_ok, routes_failed, ) } else { (false, None, None, None, None, None, None, Vec::new(), None, Vec::new(), None, None, 0, 0, Vec::new()) }; // Background monitor data (best-effort — don't fail the page if queries fail) let monitor_interval_secs = std::env::var("HEALTH_CHECK_INTERVAL_SECS") .ok() .and_then(|v| v.parse::().ok()) .unwrap_or(crate::constants::HEALTH_CHECK_INTERVAL_SECS); let alerts_configured = std::env::var("ALERT_EMAIL").is_ok(); let uptime_24h = db::monitor::get_health_uptime_percent(&state.db, 24) .await .unwrap_or(None); let uptime_7d = db::monitor::get_health_uptime_percent(&state.db, 168) .await .unwrap_or(None); let last_incident = db::monitor::get_last_incident(&state.db) .await .unwrap_or(None) .map(|dt| dt.format("%Y-%m-%d %H:%M UTC").to_string()); let recent_snapshots = db::monitor::get_recent_health_history(&state.db, 10) .await .unwrap_or_default(); HealthData { overall, uptime, version, check_duration_ms, db_ok, db_status, db_status_class, db_pool_active: pool_active, db_pool_max: pool_max, stats, session_ok, session_status, session_status_class, storage_configured, s3_reachable, storage_status, storage_status_class, storage_bucket, storage_region, stripe_configured, stripe_status, stripe_status_class, stripe_mode, email_configured, email_status, email_status_class, email_provider, synckit_configured, synckit_status, synckit_status_class, admin_configured, monitor_enabled: true, monitor_interval_secs, alerts_configured, uptime_24h, uptime_7d, last_incident, recent_snapshots, environment, host, started_at, public_tests, db_tests, pom_available, pom_status, pom_status_class, pom_response_time_ms, pom_checked_at, pom_uptime_24h, pom_uptime_7d, pom_recent, pom_incident, pom_recent_incidents, pom_avg_latency, pom_p95_latency, pom_routes_total, pom_routes_ok, pom_routes_failed, privacy_jobs: db::scheduler_jobs::get_job_runs(&state.db).await.unwrap_or_default(), } } /// Render the system health dashboard with database, 
/// storage, and service status.
///
/// Runs `collect_health` (live probes) on every request, then converts the
/// result into a `HealthTemplate`, stringifying counters and mapping status
/// strings to CSS classes along the way.
// NOTE(review): `State` and `Result` appear below without generic arguments;
// presumably they were lost from this listing — confirm against the original file.
#[tracing::instrument(skip_all, name = "health::health")]
pub(super) async fn health(
    State(state): State,
    session: Session,
) -> Result {
    let data = collect_health(&state).await;
    // Single timestamp shared by `generated_at` and the privacy-job ages.
    let now = chrono::Utc::now();

    // Integer percentage of pool connections in use; guarded against a zero-sized pool.
    let pool_utilization = if data.db_pool_max > 0 {
        format!("{}%", (data.db_pool_active as f64 / data.db_pool_max as f64 * 100.0) as u32)
    } else {
        "0%".to_string()
    };

    Ok(HealthTemplate {
        csrf_token: get_csrf_token(&session).await,
        session_user: None,
        overall_status: data.overall.label().to_string(),
        overall_status_class: data.overall.css_class().to_string(),
        uptime: data.uptime,
        version: data.version,
        check_duration_ms: data.check_duration_ms,
        db_status: data.db_status.to_string(),
        db_status_class: data.db_status_class.to_string(),
        // NOTE(review): `db_pool_size` and `db_pool_max` are both filled from
        // `data.db_pool_max` — confirm whether a separate configured maximum
        // was intended.
        db_pool_size: data.db_pool_max.to_string(),
        db_pool_max: data.db_pool_max.to_string(),
        db_pool_utilization: pool_utilization,
        db_active_connections: data.db_pool_active.to_string(),
        user_count: data.stats.user_count.to_string(),
        project_count: data.stats.project_count.to_string(),
        item_count: data.stats.item_count.to_string(),
        transaction_count: data.stats.transaction_count.to_string(),
        blog_post_count: data.stats.blog_post_count.to_string(),
        session_status: data.session_status.to_string(),
        session_status_class: data.session_status_class.to_string(),
        active_sessions: data.stats.active_session_count.to_string(),
        storage_status: data.storage_status.to_string(),
        storage_status_class: data.storage_status_class.to_string(),
        storage_configured: data.storage_configured,
        storage_bucket: data.storage_bucket,
        storage_region: data.storage_region,
        stripe_status: data.stripe_status.to_string(),
        stripe_status_class: data.stripe_status_class.to_string(),
        stripe_configured: data.stripe_configured,
        stripe_mode: data.stripe_mode.to_string(),
        connected_creators: data.stats.active_creator_count.to_string(),
        email_status: data.email_status.to_string(),
        email_status_class: data.email_status_class.to_string(),
        email_provider: data.email_provider.to_string(),
        synckit_status: data.synckit_status.to_string(),
        synckit_status_class: data.synckit_status_class.to_string(),
        synckit_configured: data.synckit_configured,
        synckit_app_count: data.stats.sync_app_count.to_string(),
        synckit_device_count: data.stats.sync_device_count.to_string(),
        synckit_log_entries: data.stats.sync_log_entries.to_string(),
        admin_status: if data.admin_configured { "Configured".to_string() } else { "Not configured".to_string() },
        monitor_enabled: data.monitor_enabled,
        monitor_interval_secs: data.monitor_interval_secs,
        alerts_configured: data.alerts_configured,
        // Uptime percentages are rendered with one decimal place.
        uptime_24h: data.uptime_24h.map(|v| format!("{:.1}", v)),
        uptime_7d: data.uptime_7d.map(|v| format!("{:.1}", v)),
        last_incident: data.last_incident,
        recent_snapshots: data.recent_snapshots.into_iter().map(|s| {
            // Any status other than "operational"/"degraded" is styled as an error.
            let status_class = match s.status.as_str() {
                "operational" => "status-ok".to_string(),
                "degraded" => "status-warn".to_string(),
                _ => "status-error".to_string(),
            };
            HealthSnapshotDisplay {
                checked_at: s.checked_at.format("%H:%M:%S UTC").to_string(),
                status: s.status,
                status_class,
                duration_ms: s.check_duration_ms,
            }
        }).collect(),
        environment: data.environment.to_string(),
        host: data.host,
        started_at: data.started_at,
        public_tests: data.public_tests,
        db_tests: data.db_tests,
        generated_at: now.format("%Y-%m-%d %H:%M:%S UTC").to_string(),
        pom_available: data.pom_available,
        pom_status: data.pom_status,
        pom_status_class: data.pom_status_class,
        pom_response_time_ms: data.pom_response_time_ms,
        pom_checked_at: data.pom_checked_at,
        pom_uptime_24h: data.pom_uptime_24h.map(|v| format!("{:.1}", v)),
        pom_uptime_7d: data.pom_uptime_7d.map(|v| format!("{:.1}", v)),
        pom_recent: data.pom_recent.into_iter().map(|s| {
            // Same status-to-class mapping as for the local monitor snapshots above.
            let status_class = match s.status.as_str() {
                "operational" => "status-ok".to_string(),
                "degraded" => "status-warn".to_string(),
                _ => "status-error".to_string(),
            };
            PomSnapshotDisplay {
                checked_at: format_pom_timestamp(&s.checked_at),
                status: s.status,
                status_class,
                response_time_ms: s.response_time_ms,
            }
        }).collect(),
        pom_avg_latency: data.pom_avg_latency,
        pom_p95_latency: data.pom_p95_latency,
        // The three `pom_incident_*` fields all derive from the optional current incident.
        pom_incident_active: data.pom_incident.is_some(),
        pom_incident_status: data.pom_incident.as_ref().map(|i| i.to_status.clone()),
        pom_incident_since: data.pom_incident.as_ref().map(|i| format_pom_timestamp(&i.started_at)),
        pom_recent_incidents: data.pom_recent_incidents.into_iter().map(|i| PomIncidentDisplay {
            started_at: format_pom_timestamp(&i.started_at),
            // An incident without a recorded duration is shown as "-".
            duration: i.duration_secs.map(format_incident_duration).unwrap_or_else(|| "-".to_string()),
            to_status: i.to_status,
        }).collect(),
        pom_routes_total: data.pom_routes_total,
        pom_routes_ok: data.pom_routes_ok,
        pom_routes_failed: data.pom_routes_failed,
        privacy_jobs: format_privacy_jobs(&data.privacy_jobs, now),
    })
}

// ============================================================================
// JSON Health Endpoint (fast — reads from background monitor cache)
// ============================================================================

// Note: the HTML /health page has full diagnostics. The JSON endpoint is
// intentionally minimal (status only) to avoid leaking version, uptime,
// git hash, and service configuration to unauthenticated callers.
// NOTE(review): `health_json_body` currently also emits the crate `version`
// (without the git hash), which contradicts "status only" above — confirm
// which of the two is intended.

/// `GET /api/health`: fast JSON health endpoint.
///
/// Reads the latest snapshot from the background monitor's database instead of
/// running live probes. Returns 200 if operational or degraded, 503 if error.
#[tracing::instrument(skip_all, name = "health::health_json")]
pub(super) async fn health_json(
    State(state): State,
) -> impl IntoResponse {
    // Read the latest snapshot from the background monitor (single DB row).
    // A failed query is treated the same as "no snapshots yet".
    let latest = db::monitor::get_recent_health_history(&state.db, 1)
        .await
        .unwrap_or_default();

    // Use cached monitor data only — no live probes (fast <10ms as documented).
    // Falls back to a single DB probe only when no monitor snapshots exist yet
    // (fresh startup before the first monitor tick) or the history query failed.
    let (overall, db_ok) = if let Some(snap) = latest.first() {
        // Unrecognised status strings are treated as an error.
        let status = match snap.status.as_str() {
            "operational" => OverallStatus::Operational,
            "degraded" => OverallStatus::Degraded,
            _ => OverallStatus::Error,
        };
        // The snapshot carries no separate database flag here, so the database
        // check is derived from the overall status: healthy unless `Error`.
        let db_healthy = status != OverallStatus::Error;
        (status, db_healthy)
    } else {
        // No monitor data yet — single minimal probe
        let db_ok = sqlx::query_scalar::<_, i32>("SELECT 1")
            .fetch_one(&state.db)
            .await
            .is_ok();
        let status = if db_ok { OverallStatus::Operational } else { OverallStatus::Error };
        (status, db_ok)
    };

    // Degraded still answers 200 so pollers only see 503 on a hard error.
    let http_status = if overall == OverallStatus::Error {
        StatusCode::SERVICE_UNAVAILABLE
    } else {
        StatusCode::OK
    };

    (http_status, Json(health_json_body(overall, db_ok)))
}

/// Build the JSON body for the `/api/health` response.
///
/// Kept as a pure function (no AppState, no DB) so the schema-drift guard
/// test in this module can exercise it directly. PoM polls this endpoint
/// and runs key-by-key assertions from `pom/deploy/pom-hetzner.toml`; the
/// guard test validates that every asserted path still resolves here.
fn health_json_body(overall: OverallStatus, db_ok: bool) -> serde_json::Value { serde_json::json!({ "status": overall.api_label(), "version": env!("CARGO_PKG_VERSION"), "checks": { "database": db_ok, }, }) } #[cfg(test)] mod tests { use super::*; #[test] fn format_uptime_minutes_only() { assert_eq!(format_uptime(std::time::Duration::from_secs(0)), "0m"); assert_eq!(format_uptime(std::time::Duration::from_secs(59)), "0m"); assert_eq!(format_uptime(std::time::Duration::from_secs(60)), "1m"); assert_eq!(format_uptime(std::time::Duration::from_secs(300)), "5m"); } #[test] fn format_uptime_hours_and_minutes() { assert_eq!(format_uptime(std::time::Duration::from_secs(3600)), "1h 0m"); assert_eq!(format_uptime(std::time::Duration::from_secs(3660)), "1h 1m"); assert_eq!(format_uptime(std::time::Duration::from_secs(7200)), "2h 0m"); } #[test] fn format_uptime_days() { assert_eq!(format_uptime(std::time::Duration::from_secs(86400)), "1d 0h 0m"); assert_eq!(format_uptime(std::time::Duration::from_secs(90061)), "1d 1h 1m"); assert_eq!(format_uptime(std::time::Duration::from_secs(259200)), "3d 0h 0m"); } #[test] fn overall_status_labels() { assert_eq!(OverallStatus::Operational.label(), "All systems operational"); assert_eq!(OverallStatus::Degraded.label(), "Degraded performance"); assert_eq!(OverallStatus::Error.label(), "Issues detected"); } #[test] fn overall_status_css_classes() { assert_eq!(OverallStatus::Operational.css_class(), "status-ok"); assert_eq!(OverallStatus::Degraded.css_class(), "status-warn"); assert_eq!(OverallStatus::Error.css_class(), "status-error"); } #[test] fn overall_status_api_labels() { assert_eq!(OverallStatus::Operational.api_label(), "operational"); assert_eq!(OverallStatus::Degraded.api_label(), "degraded"); assert_eq!(OverallStatus::Error.api_label(), "error"); } /// Schema-drift guard for the `mnw` target. See `shared/pom-contract/`. 
#[test] fn pom_hetzner_health_expectations_resolve() { let body = health_json_body(OverallStatus::Operational, true); pom_contract::assert_health_expectations_resolve( "../pom/deploy/pom-hetzner.toml", "mnw", &body, ); } }