Skip to main content

max / pom

23.4 KB · 684 lines History Blame Raw
1 use std::time::Instant;
2
3 use tracing::instrument;
4
5 use crate::config::{HealthConfig, HealthExpectation};
6 use crate::types::{HealthDetails, HealthSnapshot, HealthStatus};
7
/// Upper bound, in bytes, on a response body we are willing to hold in memory (10 MiB).
const MAX_RESPONSE_BYTES: u64 = 10 << 20;
10
11 #[instrument(skip_all)]
12 pub async fn check_health(
13 target_name: &str,
14 config: &HealthConfig,
15 expect: Option<&HealthExpectation>,
16 ) -> HealthSnapshot {
17 let client = reqwest::Client::builder()
18 .timeout(std::time::Duration::from_secs(config.timeout_secs))
19 .build()
20 .unwrap_or_else(|_| reqwest::Client::new());
21
22 let start = Instant::now();
23 let checked_at = chrono::Utc::now().to_rfc3339();
24
25 match client.get(&config.url).send().await {
26 Ok(response) => {
27 let response_time_ms = start.elapsed().as_millis() as i64;
28 let status_code = response.status().as_u16();
29
30 // Reject responses that declare a content-length exceeding our limit.
31 if let Some(len) = response.content_length()
32 && len > MAX_RESPONSE_BYTES
33 {
34 return HealthSnapshot {
35 id: None,
36 target: target_name.to_string(),
37 status: HealthStatus::Degraded,
38 checked_at,
39 response_time_ms,
40 details: None,
41 error: Some(format!(
42 "Response body too large: {len} bytes (limit: {MAX_RESPONSE_BYTES} bytes)"
43 )),
44 };
45 }
46
47 // Read body with size cap (handles chunked/streaming responses without content-length).
48 let body_result = match response.bytes().await {
49 Ok(bytes) => {
50 if bytes.len() as u64 > MAX_RESPONSE_BYTES {
51 Err(format!(
52 "Response body too large: {} bytes (limit: {MAX_RESPONSE_BYTES} bytes)",
53 bytes.len()
54 ))
55 } else {
56 String::from_utf8(bytes.to_vec())
57 .map_err(|e| format!("Response body not valid UTF-8: {e}"))
58 }
59 }
60 Err(e) => Err(format!("Failed to read response body: {e}")),
61 };
62
63 match body_result {
64 Ok(body) => {
65 let json: Option<serde_json::Value> = serde_json::from_str(&body).ok();
66
67 let (mut status, details, mut error) = if let Some(ref json) = json {
68 let (s, d) = classify_json_response(status_code, json);
69 (s, Some(d), None)
70 } else {
71 (classify_non_json(status_code), None, Some("Failed to parse response as JSON".to_string()))
72 };
73
74 // Apply expectation validation
75 if let Some(exp) = expect {
76 let failures = validate_expectations(exp, status_code, &body, json.as_ref());
77 if !failures.is_empty() {
78 status = HealthStatus::Degraded;
79 error = Some(failures.join("; "));
80 } else if json.is_none() {
81 // Non-JSON response but all expectations passed — treat as operational
82 status = HealthStatus::Operational;
83 error = None;
84 }
85 }
86
87 HealthSnapshot {
88 id: None,
89 target: target_name.to_string(),
90 status,
91 checked_at,
92 response_time_ms,
93 details,
94 error,
95 }
96 }
97 Err(e) => HealthSnapshot {
98 id: None,
99 target: target_name.to_string(),
100 status: HealthStatus::Degraded,
101 checked_at,
102 response_time_ms,
103 details: None,
104 error: Some(e),
105 },
106 }
107 }
108 Err(e) => {
109 let response_time_ms = start.elapsed().as_millis() as i64;
110 HealthSnapshot {
111 id: None,
112 target: target_name.to_string(),
113 status: HealthStatus::Unreachable,
114 checked_at,
115 response_time_ms,
116 details: None,
117 error: Some(format!("{e}")),
118 }
119 }
120 }
121 }
122
123 /// Walk a dot-separated path through nested JSON objects.
124 pub fn resolve_json_path<'a>(value: &'a serde_json::Value, path: &str) -> Option<&'a serde_json::Value> {
125 let mut current = value;
126 for key in path.split('.') {
127 current = current.get(key)?;
128 }
129 Some(current)
130 }
131
132 /// Validate response against expectations. Returns a list of failure descriptions.
133 pub fn validate_expectations(
134 expect: &HealthExpectation,
135 status_code: u16,
136 body: &str,
137 json: Option<&serde_json::Value>,
138 ) -> Vec<String> {
139 let mut failures = Vec::new();
140
141 if let Some(expected_code) = expect.status_code
142 && status_code != expected_code
143 {
144 failures.push(format!("expected status {expected_code}, got {status_code}"));
145 }
146
147 if let Some(ref substring) = expect.body_contains
148 && !body.contains(substring.as_str())
149 {
150 failures.push(format!("body missing expected substring \"{substring}\""));
151 }
152
153 if !expect.json_fields.is_empty() {
154 if let Some(json) = json {
155 for (path, expected_value) in &expect.json_fields {
156 match resolve_json_path(json, path) {
157 Some(actual) => {
158 let actual_str: std::borrow::Cow<'_, str> = match actual {
159 serde_json::Value::String(s) => std::borrow::Cow::Borrowed(s),
160 other => std::borrow::Cow::Owned(other.to_string()),
161 };
162 if *actual_str != *expected_value {
163 failures.push(format!("json field \"{path}\": expected \"{expected_value}\", got \"{actual_str}\""));
164 }
165 }
166 None => {
167 failures.push(format!("json field \"{path}\" not found"));
168 }
169 }
170 }
171 } else {
172 failures.push("expected JSON response for field validation, got non-JSON".to_string());
173 }
174 }
175
176 failures
177 }
178
179 /// Classify a JSON health response into status + details.
180 pub fn classify_json_response(
181 status_code: u16,
182 json: &serde_json::Value,
183 ) -> (HealthStatus, HealthDetails) {
184 let api_status = json
185 .get("status")
186 .and_then(|s| s.as_str())
187 .unwrap_or("unknown");
188
189 let status = match api_status {
190 "operational" => HealthStatus::Operational,
191 "degraded" => HealthStatus::Degraded,
192 _ if (200..300).contains(&status_code) => HealthStatus::Degraded,
193 _ => HealthStatus::Error,
194 };
195
196 let details = HealthDetails {
197 version: json.get("version").and_then(|v| v.as_str()).map(String::from),
198 uptime: json.get("uptime").and_then(|v| v.as_str()).map(String::from),
199 checks: json.get("checks").cloned(),
200 monitoring: json.get("monitoring").cloned(),
201 };
202
203 (status, details)
204 }
205
206 /// Classify a response that couldn't be parsed as JSON.
207 pub fn classify_non_json(status_code: u16) -> HealthStatus {
208 if (200..300).contains(&status_code) {
209 HealthStatus::Degraded
210 } else {
211 HealthStatus::Error
212 }
213 }
214
215 /// Detect sustained latency drift by checking if all recent response times
216 /// exceed the baseline average by the given threshold multiplier.
217 ///
218 /// Returns a description string if drift is detected, `None` otherwise.
219 /// Requires at least 10 baseline samples to avoid false positives.
220 pub fn detect_latency_drift(
221 recent_times: &[i64],
222 baseline: &crate::types::LatencyStats,
223 threshold: f64,
224 ) -> Option<String> {
225 if baseline.sample_count < 10 || recent_times.is_empty() {
226 return None;
227 }
228 let drift_threshold = baseline.avg_ms * threshold;
229 let all_over = recent_times.iter().all(|&t| t as f64 > drift_threshold);
230 if all_over {
231 let avg_recent: f64 = recent_times.iter().sum::<i64>() as f64 / recent_times.len() as f64;
232 Some(format!(
233 "latency drift: last {} checks avg {:.0}ms (baseline avg {:.0}ms, threshold {:.0}ms)",
234 recent_times.len(),
235 avg_recent,
236 baseline.avg_ms,
237 drift_threshold,
238 ))
239 } else {
240 None
241 }
242 }
243
/// Detect sustained test duration drift by checking if all recent durations
/// exceed the baseline average by the given threshold multiplier.
///
/// `durations` is expected most-recent-first. The first `recent_count` entries
/// form the recent window; *all* remaining entries form the baseline. Only the
/// `i64` of each pair is used.
///
/// Returns a description string if drift is detected, `None` otherwise.
/// `None` is also returned when:
/// * `recent_count` is zero (there is nothing to compare),
/// * fewer than `baseline_count + recent_count` samples are available, or
/// * the baseline window is empty.
pub fn detect_test_duration_drift(
    durations: &[(String, i64)],
    baseline_count: usize,
    recent_count: usize,
    threshold: f64,
) -> Option<String> {
    // An empty recent window would pass the `all` check vacuously and average
    // to NaN, producing a bogus drift report.
    if recent_count == 0 {
        return None;
    }

    // Saturate so absurdly large window sizes cannot overflow the sum.
    let required = baseline_count.saturating_add(recent_count);
    if durations.len() < required {
        return None;
    }

    // `recent_count <= required <= durations.len()`, so the split cannot panic.
    let (recent, baseline) = durations.split_at(recent_count);
    if baseline.is_empty() {
        return None;
    }

    let mean = |window: &[(String, i64)]| -> f64 {
        window.iter().map(|(_, d)| *d).sum::<i64>() as f64 / window.len() as f64
    };

    let baseline_avg = mean(baseline);
    let drift_threshold = baseline_avg * threshold;

    let all_over = recent.iter().all(|(_, d)| *d as f64 > drift_threshold);
    if !all_over {
        return None;
    }

    Some(format!(
        "test duration drift: last {} runs avg {:.0}s (baseline avg {:.0}s, threshold {:.0}s)",
        recent_count,
        mean(recent),
        baseline_avg,
        drift_threshold,
    ))
}
284
285 /// Compute test staleness from version and timing data.
286 ///
287 /// A target's tests are considered stale when:
288 /// 1. No tests have ever been run
289 /// 2. Tests are older than `staleness_days`
290 /// 3. The deployed version has changed since the last test run
291 pub fn compute_test_staleness(
292 current_version: Option<&str>,
293 tested_version: Option<&str>,
294 last_test_at: Option<&str>,
295 staleness_days: u64,
296 ) -> crate::types::TestStaleness {
297 let Some(last_test_at) = last_test_at else {
298 return crate::types::TestStaleness {
299 stale: true,
300 reason: Some("no tests have been run".to_string()),
301 current_version: current_version.map(String::from),
302 tested_version: None,
303 last_test_at: None,
304 days_since_test: None,
305 };
306 };
307
308 let days_since = chrono::DateTime::parse_from_rfc3339(last_test_at)
309 .ok()
310 .map(|dt| {
311 let now = chrono::Utc::now();
312 (now - dt.with_timezone(&chrono::Utc)).num_days()
313 });
314
315 if let Some(days) = days_since
316 && days >= staleness_days as i64
317 {
318 return crate::types::TestStaleness {
319 stale: true,
320 reason: Some(format!("tests are {days} days old (threshold: {staleness_days}d)")),
321 current_version: current_version.map(String::from),
322 tested_version: tested_version.map(String::from),
323 last_test_at: Some(last_test_at.to_string()),
324 days_since_test: Some(days),
325 };
326 }
327
328 if let (Some(current), Some(tested)) = (current_version, tested_version)
329 && current != tested
330 {
331 return crate::types::TestStaleness {
332 stale: true,
333 reason: Some(format!("version changed: {tested} -> {current}")),
334 current_version: Some(current.to_string()),
335 tested_version: Some(tested.to_string()),
336 last_test_at: Some(last_test_at.to_string()),
337 days_since_test: days_since,
338 };
339 }
340
341 crate::types::TestStaleness {
342 stale: false,
343 reason: None,
344 current_version: current_version.map(String::from),
345 tested_version: tested_version.map(String::from),
346 last_test_at: Some(last_test_at.to_string()),
347 days_since_test: days_since,
348 }
349 }
350
/// Unit tests for the pure helpers in this file: response classification,
/// JSON path resolution, expectation validation, drift detection and
/// staleness computation. `check_health` itself is not exercised here.
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    // --- classify_json_response / classify_non_json ---

    #[test]
    fn classify_operational() {
        let json = serde_json::json!({
            "status": "operational",
            "version": "2.1.0",
            "uptime": "3d 12h",
        });
        let (status, details) = classify_json_response(200, &json);
        assert_eq!(status, HealthStatus::Operational);
        assert_eq!(details.version.as_deref(), Some("2.1.0"));
        assert_eq!(details.uptime.as_deref(), Some("3d 12h"));
    }

    #[test]
    fn classify_degraded_explicit() {
        let json = serde_json::json!({ "status": "degraded" });
        let (status, _) = classify_json_response(200, &json);
        assert_eq!(status, HealthStatus::Degraded);
    }

    #[test]
    fn classify_unknown_status_with_success_code() {
        let json = serde_json::json!({ "status": "starting_up" });
        let (status, _) = classify_json_response(200, &json);
        assert_eq!(status, HealthStatus::Degraded);
    }

    #[test]
    fn classify_unknown_status_with_error_code() {
        let json = serde_json::json!({ "status": "starting_up" });
        let (status, _) = classify_json_response(503, &json);
        assert_eq!(status, HealthStatus::Error);
    }

    #[test]
    fn classify_missing_status_field() {
        let json = serde_json::json!({ "version": "1.0.0" });
        let (status, details) = classify_json_response(200, &json);
        assert_eq!(status, HealthStatus::Degraded); // "unknown" falls through
        assert_eq!(details.version.as_deref(), Some("1.0.0"));
    }

    #[test]
    fn classify_extracts_checks_and_monitoring() {
        let json = serde_json::json!({
            "status": "operational",
            "checks": { "db": "ok", "redis": "ok" },
            "monitoring": { "external": true },
        });
        let (_, details) = classify_json_response(200, &json);
        assert!(details.checks.is_some());
        assert!(details.monitoring.is_some());
    }

    #[test]
    fn classify_non_json_success() {
        assert_eq!(classify_non_json(200), HealthStatus::Degraded);
        assert_eq!(classify_non_json(204), HealthStatus::Degraded);
    }

    #[test]
    fn classify_non_json_error() {
        assert_eq!(classify_non_json(500), HealthStatus::Error);
        assert_eq!(classify_non_json(404), HealthStatus::Error);
    }

    // --- resolve_json_path ---

    #[test]
    fn resolve_json_path_top_level() {
        let json = serde_json::json!({"status": "operational"});
        let val = resolve_json_path(&json, "status").unwrap();
        assert_eq!(val, "operational");
    }

    #[test]
    fn resolve_json_path_nested() {
        let json = serde_json::json!({"checks": {"db": "ok", "redis": "warn"}});
        let val = resolve_json_path(&json, "checks.db").unwrap();
        assert_eq!(val, "ok");
    }

    #[test]
    fn resolve_json_path_deeply_nested() {
        let json = serde_json::json!({"a": {"b": {"c": 42}}});
        let val = resolve_json_path(&json, "a.b.c").unwrap();
        assert_eq!(val, 42);
    }

    #[test]
    fn resolve_json_path_missing() {
        let json = serde_json::json!({"status": "operational"});
        assert!(resolve_json_path(&json, "missing").is_none());
    }

    #[test]
    fn resolve_json_path_partial_missing() {
        let json = serde_json::json!({"checks": {"db": "ok"}});
        assert!(resolve_json_path(&json, "checks.redis").is_none());
    }

    // --- validate_expectations ---

    #[test]
    fn validate_status_code_match() {
        let expect = HealthExpectation {
            status_code: Some(200),
            ..Default::default()
        };
        let failures = validate_expectations(&expect, 200, "", None);
        assert!(failures.is_empty());
    }

    #[test]
    fn validate_status_code_mismatch() {
        let expect = HealthExpectation {
            status_code: Some(200),
            ..Default::default()
        };
        let failures = validate_expectations(&expect, 503, "", None);
        assert_eq!(failures.len(), 1);
        assert!(failures[0].contains("expected status 200"));
        assert!(failures[0].contains("got 503"));
    }

    #[test]
    fn validate_body_contains_match() {
        let expect = HealthExpectation {
            body_contains: Some("operational".to_string()),
            ..Default::default()
        };
        let failures = validate_expectations(&expect, 200, r#"{"status":"operational"}"#, None);
        assert!(failures.is_empty());
    }

    #[test]
    fn validate_body_contains_mismatch() {
        let expect = HealthExpectation {
            body_contains: Some("operational".to_string()),
            ..Default::default()
        };
        let failures = validate_expectations(&expect, 200, r#"{"status":"error"}"#, None);
        assert_eq!(failures.len(), 1);
        assert!(failures[0].contains("body missing"));
    }

    #[test]
    fn validate_json_fields_match() {
        let mut fields = HashMap::new();
        fields.insert("status".to_string(), "operational".to_string());
        fields.insert("checks.db".to_string(), "ok".to_string());
        let expect = HealthExpectation {
            json_fields: fields,
            ..Default::default()
        };
        let json = serde_json::json!({"status": "operational", "checks": {"db": "ok"}});
        let failures = validate_expectations(&expect, 200, "", Some(&json));
        assert!(failures.is_empty());
    }

    #[test]
    fn validate_json_fields_mismatch() {
        let mut fields = HashMap::new();
        fields.insert("status".to_string(), "operational".to_string());
        let expect = HealthExpectation {
            json_fields: fields,
            ..Default::default()
        };
        let json = serde_json::json!({"status": "degraded"});
        let failures = validate_expectations(&expect, 200, "", Some(&json));
        assert_eq!(failures.len(), 1);
        assert!(failures[0].contains("expected \"operational\""));
        assert!(failures[0].contains("got \"degraded\""));
    }

    #[test]
    fn validate_json_field_missing() {
        let mut fields = HashMap::new();
        fields.insert("checks.redis".to_string(), "ok".to_string());
        let expect = HealthExpectation {
            json_fields: fields,
            ..Default::default()
        };
        let json = serde_json::json!({"checks": {"db": "ok"}});
        let failures = validate_expectations(&expect, 200, "", Some(&json));
        assert_eq!(failures.len(), 1);
        assert!(failures[0].contains("not found"));
    }

    #[test]
    fn validate_json_fields_on_non_json() {
        let mut fields = HashMap::new();
        fields.insert("status".to_string(), "ok".to_string());
        let expect = HealthExpectation {
            json_fields: fields,
            ..Default::default()
        };
        let failures = validate_expectations(&expect, 200, "not json", None);
        assert_eq!(failures.len(), 1);
        assert!(failures[0].contains("non-JSON"));
    }

    #[test]
    fn validate_mixed_failures() {
        let mut fields = HashMap::new();
        fields.insert("status".to_string(), "operational".to_string());
        let expect = HealthExpectation {
            status_code: Some(200),
            body_contains: Some("healthy".to_string()),
            json_fields: fields,
        };
        let json = serde_json::json!({"status": "degraded"});
        let failures = validate_expectations(&expect, 503, r#"{"status":"degraded"}"#, Some(&json));
        assert_eq!(failures.len(), 3); // status code + body + json field
    }

    #[test]
    fn validate_empty_expectations_always_pass() {
        let expect = HealthExpectation::default();
        let failures = validate_expectations(&expect, 500, "garbage", None);
        assert!(failures.is_empty());
    }

    // --- detect_latency_drift ---

    /// Build a `LatencyStats` whose other fields are derived from `avg`.
    fn baseline(avg: f64, count: i64) -> crate::types::LatencyStats {
        crate::types::LatencyStats {
            min_ms: avg as i64 / 2,
            max_ms: avg as i64 * 2,
            avg_ms: avg,
            p95_ms: (avg * 1.5) as i64,
            sample_count: count,
        }
    }

    #[test]
    fn drift_all_over_threshold() {
        let bl = baseline(100.0, 100);
        let recent = vec![250, 260, 270]; // all > 200 (100 * 2.0)
        let result = detect_latency_drift(&recent, &bl, 2.0);
        assert!(result.is_some());
        assert!(result.unwrap().contains("latency drift"));
    }

    #[test]
    fn drift_one_under_threshold() {
        let bl = baseline(100.0, 100);
        let recent = vec![250, 150, 270]; // 150 < 200
        let result = detect_latency_drift(&recent, &bl, 2.0);
        assert!(result.is_none());
    }

    #[test]
    fn drift_insufficient_baseline() {
        let bl = baseline(100.0, 5); // < 10 samples
        let recent = vec![250, 260, 270];
        let result = detect_latency_drift(&recent, &bl, 2.0);
        assert!(result.is_none());
    }

    #[test]
    fn drift_empty_recent() {
        let bl = baseline(100.0, 100);
        let result = detect_latency_drift(&[], &bl, 2.0);
        assert!(result.is_none());
    }

    #[test]
    fn drift_threshold_edge() {
        let bl = baseline(100.0, 100);
        // Exactly at threshold (200): not strictly over
        let recent = vec![200, 200, 200];
        let result = detect_latency_drift(&recent, &bl, 2.0);
        assert!(result.is_none()); // must be strictly greater
    }

    #[test]
    fn drift_just_over_threshold() {
        let bl = baseline(100.0, 100);
        let recent = vec![201, 201, 201];
        let result = detect_latency_drift(&recent, &bl, 2.0);
        assert!(result.is_some());
    }

    // --- detect_test_duration_drift ---

    /// Build a most-recent-first duration list with placeholder labels.
    fn runs(values: &[i64]) -> Vec<(String, i64)> {
        values
            .iter()
            .enumerate()
            .map(|(i, &d)| (format!("run-{i}"), d))
            .collect()
    }

    #[test]
    fn duration_drift_all_recent_over_threshold() {
        let durations = runs(&[50, 60, 10, 10, 10]); // baseline avg 10, threshold 20
        let result = detect_test_duration_drift(&durations, 3, 2, 2.0);
        assert!(result.is_some());
        assert!(result.unwrap().contains("test duration drift"));
    }

    #[test]
    fn duration_drift_one_recent_under_threshold() {
        let durations = runs(&[50, 15, 10, 10, 10]); // 15 < 20
        let result = detect_test_duration_drift(&durations, 3, 2, 2.0);
        assert!(result.is_none());
    }

    #[test]
    fn duration_drift_insufficient_samples() {
        let durations = runs(&[50, 60, 10, 10]); // needs 3 + 2
        let result = detect_test_duration_drift(&durations, 3, 2, 2.0);
        assert!(result.is_none());
    }

    // --- compute_test_staleness ---

    #[test]
    fn staleness_no_test_run() {
        let result = compute_test_staleness(Some("1.0.0"), None, None, 7);
        assert!(result.stale);
        assert_eq!(result.reason.as_deref(), Some("no tests have been run"));
        assert!(result.last_test_at.is_none());
    }

    #[test]
    fn staleness_stale_by_age() {
        let old = (chrono::Utc::now() - chrono::Duration::days(10)).to_rfc3339();
        let result = compute_test_staleness(Some("1.0.0"), Some("1.0.0"), Some(&old), 7);
        assert!(result.stale);
        let reason = result.reason.unwrap();
        assert!(reason.contains("days old"), "reason was: {reason}");
        assert!(reason.contains("threshold: 7d"), "reason was: {reason}");
    }

    #[test]
    fn staleness_stale_by_version() {
        let recent = chrono::Utc::now().to_rfc3339();
        let result = compute_test_staleness(Some("1.1.0"), Some("1.0.0"), Some(&recent), 7);
        assert!(result.stale);
        let reason = result.reason.unwrap();
        assert!(reason.contains("version changed: 1.0.0 -> 1.1.0"), "reason was: {reason}");
    }

    #[test]
    fn staleness_fresh() {
        let recent = chrono::Utc::now().to_rfc3339();
        let result = compute_test_staleness(Some("1.0.0"), Some("1.0.0"), Some(&recent), 7);
        assert!(!result.stale);
        assert!(result.reason.is_none());
    }

    #[test]
    fn staleness_missing_versions_not_stale() {
        let recent = chrono::Utc::now().to_rfc3339();
        let result = compute_test_staleness(None, None, Some(&recent), 7);
        assert!(!result.stale);
    }
}
684