Skip to main content

max / makenotwork

10.7 KB · 289 lines History Blame Raw
1 use schemars::JsonSchema;
2 use serde::Deserialize;
3 use tracing::instrument;
4
5 use crate::checks::http;
6 use crate::db;
7 use crate::types::{LatencyStats, TargetInfo};
8
9 use super::PomServer;
10
11 #[derive(Debug, Deserialize, JsonSchema)]
12 pub struct CheckHealthParams {
13 /// Target name to check (omit to check all targets)
14 pub target: Option<String>,
15 }
16
17 #[derive(Debug, Deserialize, JsonSchema)]
18 pub struct HealthHistoryParams {
19 /// Filter by target name
20 pub target: Option<String>,
21 /// Number of results to return (default 10)
22 pub limit: Option<i64>,
23 }
24
25 impl PomServer {
26 #[instrument(skip_all)]
27 pub async fn get_status_impl(
28 &self,
29 ) -> crate::error::Result<String> {
30 let mut status_parts = Vec::new();
31
32 for name in self.config.target_names() {
33 let target = self.config.get_target(&name).unwrap();
34 let mut target_status = format!("## {name} ({})\n", target.label);
35
36 // Latest health
37 if let Ok(Some(health)) = db::get_latest_health(&self.pool, &name).await {
38 target_status.push_str(&format!(
39 "Health: {} ({}ms, {})\n",
40 health.status, health.response_time_ms, health.checked_at
41 ));
42 if let Some(details) = &health.details {
43 if let Some(v) = &details.version {
44 target_status.push_str(&format!("Version: {v}\n"));
45 }
46 if let Some(u) = &details.uptime {
47 target_status.push_str(&format!("Uptime: {u}\n"));
48 }
49 }
50 if let Some(err) = &health.error {
51 target_status.push_str(&format!("Error: {err}\n"));
52 }
53 } else {
54 target_status.push_str("Health: no data\n");
55 }
56
57 // 24h latency stats
58 let latency_cutoff = (chrono::Utc::now() - chrono::Duration::hours(24)).to_rfc3339();
59 if let Ok(times) = db::get_response_times(&self.pool, &name, &latency_cutoff).await {
60 let operational_times: Vec<i64> = times.iter()
61 .filter(|(_, ms)| *ms > 0)
62 .map(|(_, ms)| *ms)
63 .collect();
64 if let Some(l) = LatencyStats::from_times(&operational_times) {
65 target_status.push_str(&format!(
66 "Latency (24h): avg {:.0}ms, p95 {}ms, range {}-{}ms ({} samples)\n",
67 l.avg_ms, l.p95_ms, l.min_ms, l.max_ms, l.sample_count
68 ));
69 }
70 }
71
72 // Active incident
73 if let Ok(Some(incident)) = db::get_open_incident(&self.pool, &name).await {
74 target_status.push_str(&format!(
75 "Incident: [ACTIVE] {} since {}\n",
76 incident.to_status, incident.started_at
77 ));
78 }
79
80 // Recent incidents
81 if let Ok(incidents) = db::get_recent_incidents(&self.pool, &name, 5).await {
82 let closed: Vec<_> = incidents.iter().filter(|i| i.ended_at.is_some()).collect();
83 if !closed.is_empty() {
84 target_status.push_str("Recent incidents:\n");
85 for inc in closed {
86 let duration = inc.duration_secs.map(|d| format!(" ({d}s)")).unwrap_or_default();
87 target_status.push_str(&format!(
88 " {} -> {} at {}{}\n",
89 inc.from_status, inc.to_status, inc.started_at, duration
90 ));
91 }
92 }
93 }
94
95 // Latest test run
96 let latest_test = db::get_latest_test_run(&self.pool, &name).await.ok().flatten();
97 if let Some(ref test) = latest_test {
98 let result = if test.passed { "PASSED" } else { "FAILED" };
99 target_status.push_str(&format!("Tests: {result}"));
100 if let Some(d) = test.duration_secs {
101 target_status.push_str(&format!(" ({d}s)"));
102 }
103 target_status.push_str(&format!(" ({})\n", test.started_at));
104 if let (Some(p), Some(f)) = (test.summary.total_passed, test.summary.total_failed) {
105 target_status.push_str(&format!(" {p} passed, {f} failed\n"));
106 }
107 for step in &test.summary.steps {
108 let mark = if step.passed { "PASS" } else { "FAIL" };
109 target_status.push_str(&format!(" {mark} {}\n", step.name));
110 }
111 } else {
112 target_status.push_str("Tests: no data\n");
113 }
114
115 // Test staleness
116 if let Some(tests_config) = &target.tests {
117 let current_version = db::get_latest_health(&self.pool, &name)
118 .await
119 .ok()
120 .flatten()
121 .and_then(|h| h.details)
122 .and_then(|d| d.version);
123
124 let tested_version = if let Some(ref test) = latest_test {
125 db::get_version_at_time(&self.pool, &name, &test.started_at)
126 .await
127 .unwrap_or(None)
128 } else {
129 None
130 };
131
132 let staleness = http::compute_test_staleness(
133 current_version.as_deref(),
134 tested_version.as_deref(),
135 latest_test.as_ref().map(|t| t.started_at.as_str()),
136 tests_config.staleness_days,
137 );
138
139 if staleness.stale
140 && let Some(reason) = &staleness.reason
141 {
142 target_status.push_str(&format!("Tests: STALE \u{2014} {reason}\n"));
143 }
144 }
145
146 status_parts.push(target_status);
147 }
148
149 if status_parts.is_empty() {
150 return Ok("No targets configured.".to_string());
151 }
152
153 Ok(status_parts.join("\n"))
154 }
155
156 #[instrument(skip_all)]
157 pub async fn check_health_impl(
158 &self,
159 params: CheckHealthParams,
160 ) -> crate::error::Result<String> {
161 let targets: Vec<String> = match &params.target {
162 Some(t) => {
163 if self.config.get_target(t).is_none() {
164 return Ok(format!("Unknown target: {t}"));
165 }
166 vec![t.clone()]
167 }
168 None => self.config.target_names(),
169 };
170
171 let mut results = Vec::new();
172
173 for name in &targets {
174 let target = self.config.get_target(name).unwrap();
175 if let Some(health_config) = &target.health {
176 let snapshot = http::check_health(name, health_config, health_config.expect.as_ref()).await;
177 db::insert_health_check(&self.pool, &snapshot).await?;
178 results.push(serde_json::to_string_pretty(&snapshot)?);
179 } else {
180 results.push(format!("{name}: no health endpoint configured"));
181 }
182 }
183
184 Ok(results.join("\n\n"))
185 }
186
187 #[instrument(skip_all)]
188 pub async fn health_history_impl(
189 &self,
190 params: HealthHistoryParams,
191 ) -> crate::error::Result<String> {
192 let limit = params.limit.unwrap_or(10);
193 let history = db::get_health_history(&self.pool, params.target.as_deref(), limit).await?;
194
195 if history.is_empty() {
196 return Ok("No health check history.".to_string());
197 }
198
199 Ok(serde_json::to_string_pretty(&history)?)
200 }
201
202 #[instrument(skip_all)]
203 pub async fn get_mesh_status_impl(
204 &self,
205 ) -> crate::error::Result<String> {
206 let listen = &self.config.serve.listen;
207 let url = format!("http://{listen}/api/mesh");
208
209 let client = reqwest::Client::builder()
210 .timeout(std::time::Duration::from_secs(5))
211 .build()?;
212
213 let response = client.get(&url).send().await.map_err(|e| {
214 crate::error::PomError::Config(format!(
215 "Could not reach local PoM instance at {listen}: {e}"
216 ))
217 })?;
218
219 let data: serde_json::Value = response.json().await?;
220
221 let Some(instances) = data.get("instances").and_then(|v| v.as_object()) else {
222 return Ok("No mesh data available. Is serve mode running?".to_string());
223 };
224
225 let mut output = String::from("# Peer Mesh Status\n\n");
226
227 for (name, instance_data) in instances {
228 let instance = instance_data.get("instance");
229 let version = instance
230 .and_then(|i| i.get("version"))
231 .and_then(|v| v.as_str())
232 .unwrap_or("?");
233
234 output.push_str(&format!("## {name} (v{version})\n"));
235
236 if let Some(targets) = instance_data.get("targets").and_then(|v| v.as_object()) {
237 for (target_name, target_data) in targets {
238 let status = target_data.get("status").and_then(|v| v.as_str()).unwrap_or("?");
239 let ms = target_data.get("response_time_ms").and_then(|v| v.as_i64());
240 let ms_str = ms.map(|m| format!(" ({m}ms)")).unwrap_or_default();
241 output.push_str(&format!("- Target {target_name}: {status}{ms_str}\n"));
242 }
243 }
244
245 if let Some(peers) = instance_data.get("peers").and_then(|v| v.as_object()) {
246 for (peer_name, peer_data) in peers {
247 let status = peer_data.get("status").and_then(|v| v.as_str()).unwrap_or("?");
248 let latency = peer_data
249 .get("latency_ms")
250 .and_then(|v| v.as_u64())
251 .map(|ms| format!(" ({ms}ms)"))
252 .unwrap_or_default();
253 output.push_str(&format!("- Peer {peer_name}: {status}{latency}\n"));
254 }
255 }
256
257 if let Some(err) = instance_data.get("error").and_then(|v| v.as_str()) {
258 output.push_str(&format!("- ({err})\n"));
259 }
260
261 output.push('\n');
262 }
263
264 Ok(output)
265 }
266
267 #[instrument(skip_all)]
268 pub async fn list_targets_impl(
269 &self,
270 ) -> crate::error::Result<String> {
271 let targets: Vec<TargetInfo> = self
272 .config
273 .target_names()
274 .into_iter()
275 .map(|name| {
276 let t = self.config.get_target(&name).unwrap();
277 TargetInfo {
278 name,
279 label: t.label.clone(),
280 has_health: t.health.is_some(),
281 has_tests: t.tests.is_some(),
282 }
283 })
284 .collect();
285
286 Ok(serde_json::to_string_pretty(&targets)?)
287 }
288 }
289