Skip to main content

max / pom

Version bump to 0.2.2 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-03-12 02:03 UTC
Commit: d7fcbc6bb36c3425ab34e092ee663dcdc8c34f63
Parent: a75999e
25 files changed, +6037 insertions, -466 deletions
M .gitignore +7
@@ -2,3 +2,10 @@
2 2 *.db
3 3 *.db-wal
4 4 *.db-shm
5 +
6 + # OS
7 + .DS_Store
8 +
9 + # IDE
10 + .idea/
11 + .vscode/
M Cargo.lock +312 -4
@@ -77,6 +77,51 @@ dependencies = [
77 77 ]
78 78
79 79 [[package]]
80 + name = "anyhow"
81 + version = "1.0.102"
82 + source = "registry+https://github.com/rust-lang/crates.io-index"
83 + checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
84 +
85 + [[package]]
86 + name = "asn1-rs"
87 + version = "0.6.2"
88 + source = "registry+https://github.com/rust-lang/crates.io-index"
89 + checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048"
90 + dependencies = [
91 + "asn1-rs-derive",
92 + "asn1-rs-impl",
93 + "displaydoc",
94 + "nom",
95 + "num-traits",
96 + "rusticata-macros",
97 + "thiserror 1.0.69",
98 + "time",
99 + ]
100 +
101 + [[package]]
102 + name = "asn1-rs-derive"
103 + version = "0.5.1"
104 + source = "registry+https://github.com/rust-lang/crates.io-index"
105 + checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490"
106 + dependencies = [
107 + "proc-macro2",
108 + "quote",
109 + "syn",
110 + "synstructure",
111 + ]
112 +
113 + [[package]]
114 + name = "asn1-rs-impl"
115 + version = "0.2.0"
116 + source = "registry+https://github.com/rust-lang/crates.io-index"
117 + checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
118 + dependencies = [
119 + "proc-macro2",
120 + "quote",
121 + "syn",
122 + ]
123 +
124 + [[package]]
80 125 name = "atoi"
81 126 version = "2.0.0"
82 127 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -98,6 +143,78 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
98 143 checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
99 144
100 145 [[package]]
146 + name = "aws-lc-rs"
147 + version = "1.16.1"
148 + source = "registry+https://github.com/rust-lang/crates.io-index"
149 + checksum = "94bffc006df10ac2a68c83692d734a465f8ee6c5b384d8545a636f81d858f4bf"
150 + dependencies = [
151 + "aws-lc-sys",
152 + "zeroize",
153 + ]
154 +
155 + [[package]]
156 + name = "aws-lc-sys"
157 + version = "0.38.0"
158 + source = "registry+https://github.com/rust-lang/crates.io-index"
159 + checksum = "4321e568ed89bb5a7d291a7f37997c2c0df89809d7b6d12062c81ddb54aa782e"
160 + dependencies = [
161 + "cc",
162 + "cmake",
163 + "dunce",
164 + "fs_extra",
165 + ]
166 +
167 + [[package]]
168 + name = "axum"
169 + version = "0.8.8"
170 + source = "registry+https://github.com/rust-lang/crates.io-index"
171 + checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8"
172 + dependencies = [
173 + "axum-core",
174 + "bytes",
175 + "form_urlencoded",
176 + "futures-util",
177 + "http",
178 + "http-body",
179 + "http-body-util",
180 + "hyper",
181 + "hyper-util",
182 + "itoa",
183 + "matchit",
184 + "memchr",
185 + "mime",
186 + "percent-encoding",
187 + "pin-project-lite",
188 + "serde_core",
189 + "serde_json",
190 + "serde_path_to_error",
191 + "serde_urlencoded",
192 + "sync_wrapper",
193 + "tokio",
194 + "tower",
195 + "tower-layer",
196 + "tower-service",
197 + ]
198 +
199 + [[package]]
200 + name = "axum-core"
201 + version = "0.5.6"
202 + source = "registry+https://github.com/rust-lang/crates.io-index"
203 + checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1"
204 + dependencies = [
205 + "bytes",
206 + "futures-core",
207 + "http",
208 + "http-body",
209 + "http-body-util",
210 + "mime",
211 + "pin-project-lite",
212 + "sync_wrapper",
213 + "tower-layer",
214 + "tower-service",
215 + ]
216 +
217 + [[package]]
101 218 name = "base64"
102 219 version = "0.21.7"
103 220 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -158,6 +275,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
158 275 checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2"
159 276 dependencies = [
160 277 "find-msvc-tools",
278 + "jobserver",
279 + "libc",
161 280 "shlex",
162 281 ]
163 282
@@ -228,6 +347,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
228 347 checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831"
229 348
230 349 [[package]]
350 + name = "cmake"
351 + version = "0.1.57"
352 + source = "registry+https://github.com/rust-lang/crates.io-index"
353 + checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d"
354 + dependencies = [
355 + "cc",
356 + ]
357 +
358 + [[package]]
231 359 name = "colorchoice"
232 360 version = "1.0.4"
233 361 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -304,6 +432,12 @@ dependencies = [
304 432 ]
305 433
306 434 [[package]]
435 + name = "data-encoding"
436 + version = "2.10.0"
437 + source = "registry+https://github.com/rust-lang/crates.io-index"
438 + checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea"
439 +
440 + [[package]]
307 441 name = "der"
308 442 version = "0.7.10"
309 443 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -315,6 +449,29 @@ dependencies = [
315 449 ]
316 450
317 451 [[package]]
452 + name = "der-parser"
453 + version = "9.0.0"
454 + source = "registry+https://github.com/rust-lang/crates.io-index"
455 + checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553"
456 + dependencies = [
457 + "asn1-rs",
458 + "displaydoc",
459 + "nom",
460 + "num-bigint",
461 + "num-traits",
462 + "rusticata-macros",
463 + ]
464 +
465 + [[package]]
466 + name = "deranged"
467 + version = "0.5.8"
468 + source = "registry+https://github.com/rust-lang/crates.io-index"
469 + checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c"
470 + dependencies = [
471 + "powerfmt",
472 + ]
473 +
474 + [[package]]
318 475 name = "digest"
319 476 version = "0.10.7"
320 477 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -365,6 +522,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
365 522 checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b"
366 523
367 524 [[package]]
525 + name = "dunce"
526 + version = "1.0.5"
527 + source = "registry+https://github.com/rust-lang/crates.io-index"
528 + checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
529 +
530 + [[package]]
368 531 name = "dyn-clone"
369 532 version = "1.0.20"
370 533 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -450,6 +613,12 @@ dependencies = [
450 613 ]
451 614
452 615 [[package]]
616 + name = "fs_extra"
617 + version = "1.3.0"
618 + source = "registry+https://github.com/rust-lang/crates.io-index"
619 + checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
620 +
621 + [[package]]
453 622 name = "futures"
454 623 version = "0.3.32"
455 624 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -580,12 +749,25 @@ dependencies = [
580 749 "cfg-if",
581 750 "js-sys",
582 751 "libc",
583 - "r-efi",
752 + "r-efi 5.3.0",
584 753 "wasip2",
585 754 "wasm-bindgen",
586 755 ]
587 756
588 757 [[package]]
758 + name = "getrandom"
759 + version = "0.4.2"
760 + source = "registry+https://github.com/rust-lang/crates.io-index"
761 + checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
762 + dependencies = [
763 + "cfg-if",
764 + "libc",
765 + "r-efi 6.0.0",
766 + "wasip2",
767 + "wasip3",
768 + ]
769 +
770 + [[package]]
589 771 name = "hashbrown"
590 772 version = "0.15.5"
591 773 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -651,6 +833,17 @@ dependencies = [
651 833 ]
652 834
653 835 [[package]]
836 + name = "hostname"
837 + version = "0.4.2"
838 + source = "registry+https://github.com/rust-lang/crates.io-index"
839 + checksum = "617aaa3557aef3810a6369d0a99fac8a080891b68bd9f9812a1eeda0c0730cbd"
840 + dependencies = [
841 + "cfg-if",
842 + "libc",
843 + "windows-link",
844 + ]
845 +
846 + [[package]]
654 847 name = "http"
655 848 version = "1.4.0"
656 849 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -690,6 +883,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
690 883 checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87"
691 884
692 885 [[package]]
886 + name = "httpdate"
887 + version = "1.0.3"
888 + source = "registry+https://github.com/rust-lang/crates.io-index"
889 + checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
890 +
891 + [[package]]
693 892 name = "hyper"
694 893 version = "1.8.1"
695 894 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -702,6 +901,7 @@ dependencies = [
702 901 "http",
703 902 "http-body",
704 903 "httparse",
904 + "httpdate",
705 905 "itoa",
706 906 "pin-project-lite",
707 907 "pin-utils",
@@ -856,6 +1056,12 @@ dependencies = [
856 1056 ]
857 1057
858 1058 [[package]]
1059 + name = "id-arena"
1060 + version = "2.3.0"
1061 + source = "registry+https://github.com/rust-lang/crates.io-index"
1062 + checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
1063 +
1064 + [[package]]
859 1065 name = "idna"
860 1066 version = "1.1.0"
861 1067 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -884,6 +1090,8 @@ checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017"
884 1090 dependencies = [
885 1091 "equivalent",
886 1092 "hashbrown 0.16.1",
1093 + "serde",
1094 + "serde_core",
887 1095 ]
888 1096
889 1097 [[package]]
@@ -915,6 +1123,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
915 1123 checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2"
916 1124
917 1125 [[package]]
1126 + name = "jobserver"
1127 + version = "0.1.34"
1128 + source = "registry+https://github.com/rust-lang/crates.io-index"
1129 + checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33"
1130 + dependencies = [
1131 + "getrandom 0.3.4",
1132 + "libc",
1133 + ]
1134 +
1135 + [[package]]
918 1136 name = "js-sys"
919 1137 version = "0.3.91"
920 1138 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -934,6 +1152,12 @@ dependencies = [
934 1152 ]
935 1153
936 1154 [[package]]
1155 + name = "leb128fmt"
1156 + version = "0.1.0"
1157 + source = "registry+https://github.com/rust-lang/crates.io-index"
1158 + checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
1159 +
1160 + [[package]]
937 1161 name = "libc"
938 1162 version = "0.2.183"
939 1163 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1005,6 +1229,12 @@ dependencies = [
1005 1229 ]
1006 1230
1007 1231 [[package]]
1232 + name = "matchit"
1233 + version = "0.8.4"
1234 + source = "registry+https://github.com/rust-lang/crates.io-index"
1235 + checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3"
1236 +
1237 + [[package]]
1008 1238 name = "md-5"
1009 1239 version = "0.10.6"
1010 1240 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1021,6 +1251,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1021 1251 checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79"
1022 1252
1023 1253 [[package]]
1254 + name = "mime"
1255 + version = "0.3.17"
1256 + source = "registry+https://github.com/rust-lang/crates.io-index"
1257 + checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
1258 +
1259 + [[package]]
1260 + name = "minimal-lexical"
1261 + version = "0.2.1"
1262 + source = "registry+https://github.com/rust-lang/crates.io-index"
1263 + checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
1264 +
1265 + [[package]]
1024 1266 name = "mio"
1025 1267 version = "1.1.1"
1026 1268 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1032,6 +1274,16 @@ dependencies = [
1032 1274 ]
1033 1275
1034 1276 [[package]]
1277 + name = "nom"
1278 + version = "7.1.3"
1279 + source = "registry+https://github.com/rust-lang/crates.io-index"
1280 + checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
1281 + dependencies = [
1282 + "memchr",
1283 + "minimal-lexical",
1284 + ]
1285 +
1286 + [[package]]
1035 1287 name = "nu-ansi-term"
1036 1288 version = "0.50.3"
1037 1289 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1041,6 +1293,16 @@ dependencies = [
1041 1293 ]
1042 1294
1043 1295 [[package]]
1296 + name = "num-bigint"
1297 + version = "0.4.6"
1298 + source = "registry+https://github.com/rust-lang/crates.io-index"
1299 + checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
1300 + dependencies = [
1301 + "num-integer",
1302 + "num-traits",
1303 + ]
1304 +
1305 + [[package]]
1044 1306 name = "num-bigint-dig"
1045 1307 version = "0.8.6"
1046 1308 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1057,6 +1319,12 @@ dependencies = [
1057 1319 ]
1058 1320
1059 1321 [[package]]
1322 + name = "num-conv"
1323 + version = "0.2.0"
1324 + source = "registry+https://github.com/rust-lang/crates.io-index"
1325 + checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050"
1326 +
1327 + [[package]]
1060 1328 name = "num-integer"
1061 1329 version = "0.1.46"
1062 1330 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1087,6 +1355,15 @@ dependencies = [
1087 1355 ]
1088 1356
1089 1357 [[package]]
1358 + name = "oid-registry"
1359 + version = "0.7.1"
1360 + source = "registry+https://github.com/rust-lang/crates.io-index"
1361 + checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9"
1362 + dependencies = [
1363 + "asn1-rs",
1364 + ]
1365 +
1366 + [[package]]
1090 1367 name = "once_cell"
1091 1368 version = "1.21.3"
1092 1369 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1201,21 +1478,31 @@ checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6"
1201 1478
1202 1479 [[package]]
1203 1480 name = "pom"
1204 - version = "0.1.0"
1481 + version = "0.2.2"
1205 1482 dependencies = [
1483 + "axum",
1206 1484 "chrono",
1207 1485 "clap",
1208 1486 "dirs",
1487 + "hostname",
1488 + "http-body-util",
1209 1489 "reqwest",
1210 1490 "rmcp",
1491 + "rustls-pki-types",
1211 1492 "schemars",
1212 1493 "serde",
1213 1494 "serde_json",
1214 1495 "sqlx",
1496 + "thiserror 2.0.18",
1215 1497 "tokio",
1498 + "tokio-rustls",
1216 1499 "toml",
1500 + "tower",
1217 1501 "tracing",
1218 1502 "tracing-subscriber",
1503 + "uuid",
1504 + "webpki-roots",
1505 + "x509-parser",
1219 1506 ]
1220 1507
1221 1508 [[package]]
@@ -1228,6 +1515,12 @@ dependencies = [
1228 1515 ]
1229 1516
1230 1517 [[package]]
1518 + name = "powerfmt"
1519 + version = "0.2.0"
1520 + source = "registry+https://github.com/rust-lang/crates.io-index"
1521 + checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
1522 +
1523 + [[package]]
1231 1524 name = "ppv-lite86"
1232 1525 version = "0.2.21"
1233 1526 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1237,6 +1530,16 @@ dependencies = [
1237 1530 ]
1238 1531
1239 1532 [[package]]
1533 + name = "prettyplease"
1534 + version = "0.2.37"
1535 + source = "registry+https://github.com/rust-lang/crates.io-index"
1536 + checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b"
1537 + dependencies = [
1538 + "proc-macro2",
1539 + "syn",
1540 + ]
1541 +
1542 + [[package]]
1240 1543 name = "proc-macro2"
1241 1544 version = "1.0.106"
1242 1545 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1259,7 +1562,7 @@ dependencies = [
1259 1562 "rustc-hash",
1260 1563 "rustls",
1261 1564 "socket2",
1262 - "thiserror",
1565 + "thiserror 2.0.18",
1263 1566 "tokio",
1264 1567 "tracing",
1265 1568 "web-time",
@@ -1280,7 +1583,7 @@ dependencies = [
1280 1583 "rustls",
1281 1584 "rustls-pki-types",
1282 1585 "slab",
1283 - "thiserror",
1586 + "thiserror 2.0.18",
1284 1587 "tinyvec",
1285 1588 "tracing",
1286 1589 "web-time",
@@ -1316,6 +1619,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1316 1619 checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
1317 1620
1318 1621 [[package]]
1622 + name = "r-efi"
1623 + version = "6.0.0"
1624 + source = "registry+https://github.com/rust-lang/crates.io-index"
1625 + checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf"
1626 +
Lines truncated
M Cargo.toml +21 -1
@@ -1,6 +1,6 @@
1 1 [package]
2 2 name = "pom"
3 - version = "0.1.0"
3 + version = "0.2.2"
4 4 edition = "2024"
5 5
6 6 [lib]
@@ -24,6 +24,9 @@ tokio = { version = "1", features = ["rt-multi-thread", "macros", "io-std", "io-
24 24 # HTTP client
25 25 reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
26 26
27 + # HTTP server (API in serve mode)
28 + axum = { version = "0.8", default-features = false, features = ["json", "tokio", "http1", "query"] }
29 +
27 30 # Database
28 31 sqlx = { version = "0.8", features = ["runtime-tokio", "sqlite"] }
29 32
@@ -35,12 +38,29 @@ schemars = "0.8"
35 38 # Config
36 39 toml = "0.8"
37 40
41 + # Errors
42 + thiserror = "2"
43 +
38 44 # Time
39 45 chrono = { version = "0.4", features = ["serde"] }
40 46
41 47 # Paths
42 48 dirs = "6"
43 49
50 + # Identity
51 + uuid = { version = "1", features = ["v4"] }
52 + hostname = "0.4"
53 +
54 + # TLS certificate checking
55 + x509-parser = "0.16"
56 + tokio-rustls = "0.26"
57 + rustls-pki-types = "1"
58 + webpki-roots = "1"
59 +
44 60 # Logging
45 61 tracing = "0.1"
46 62 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
63 +
64 + [dev-dependencies]
65 + tower = { version = "0.5", features = ["util"] }
66 + http-body-util = "0.1"
M deploy/deploy.sh +16 -8
@@ -2,19 +2,20 @@
2 2 set -euo pipefail
3 3
4 4 ASTRA_HOST="max@100.106.221.39"
5 - HETZNER_HOST="root@5.78.144.244"
5 + HETZNER_HOST="root@100.120.174.96"
6 6
7 7 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
8 8 PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
9 9
10 10 deploy_target() {
11 11 local name="$1"
12 - local host target
12 + local host target sudo_prefix=""
13 13
14 14 case "$name" in
15 15 astra)
16 16 host="$ASTRA_HOST"
17 17 target="aarch64-unknown-linux-gnu"
18 + sudo_prefix="sudo"
18 19 ;;
19 20 hetzner)
20 21 host="$HETZNER_HOST"
@@ -31,16 +32,23 @@ deploy_target() {
31 32
32 33 local binary="$PROJECT_DIR/target/$target/release/pom"
33 34
35 + local config_file="$SCRIPT_DIR/pom-${name}.toml"
36 + if [ ! -f "$config_file" ]; then
37 + echo "Config not found: $config_file"
38 + exit 1
39 + fi
40 +
34 41 echo "=== Deploying to $name ($host) ==="
35 - ssh "$host" "mkdir -p /etc/pom"
36 - scp "$binary" "$host:/usr/local/bin/pom"
37 - scp "$PROJECT_DIR/pom.toml" "$host:/etc/pom/pom.toml"
38 - scp "$SCRIPT_DIR/pom.service" "$host:/etc/systemd/system/pom.service"
42 + ssh "$host" "$sudo_prefix mkdir -p /etc/pom"
43 + scp "$binary" "$host:/tmp/pom"
44 + scp "$config_file" "$host:/tmp/pom.toml"
45 + scp "$SCRIPT_DIR/pom.service" "$host:/tmp/pom.service"
39 46
40 - ssh "$host" "systemctl daemon-reload && systemctl enable pom && systemctl restart pom"
47 + ssh "$host" "$sudo_prefix mv /tmp/pom /usr/local/bin/pom && $sudo_prefix chmod +x /usr/local/bin/pom && $sudo_prefix mv /tmp/pom.toml /etc/pom/pom.toml && $sudo_prefix mv /tmp/pom.service /etc/systemd/system/pom.service"
48 + ssh "$host" "$sudo_prefix systemctl daemon-reload && $sudo_prefix systemctl enable pom && $sudo_prefix systemctl restart pom"
41 49
42 50 echo "=== $name: deployed ==="
43 - ssh "$host" "systemctl status pom --no-pager"
51 + ssh "$host" "$sudo_prefix systemctl status pom --no-pager"
44 52 }
45 53
46 54 if [ $# -eq 0 ]; then
@@ -0,0 +1,44 @@
1 + [serve]
2 + interval_secs = 300
3 + prune_days = 30
4 + listen = "0.0.0.0:9100"
5 + peer_heartbeat_secs = 60
6 +
7 + [instance]
8 + name = "astra"
9 +
10 + [targets.mnw]
11 + label = "Makenotwork Production"
12 +
13 + [targets.mnw.health]
14 + url = "https://makenot.work/api/health"
15 + timeout_secs = 10
16 +
17 + [targets.mnw.health.expect]
18 + status_code = 200
19 + json_fields = { "status" = "operational" }
20 +
21 + [targets.mnw.health.trending]
22 + baseline_window_hours = 168
23 + spike_threshold = 2.0
24 +
25 + [targets.mnw.tls]
26 + host = "makenot.work"
27 +
28 + [targets.mnw.tests]
29 + ssh = "max@100.106.221.39"
30 + command = "/home/max/staging/run-ci.sh"
31 + timeout_secs = 600
32 + staleness_days = 7
33 +
34 + [peers.hetzner]
35 + address = "100.120.174.96:9100"
36 + on_missing = "alert"
37 +
38 + [peers.macbook]
39 + address = "100.100.246.136:9100"
40 + on_missing = "log"
41 +
42 + [alerts]
43 + # postmark_token = "" # set in production — omit for dev mode (log only)
44 + to = "pom-alerts@makenot.work"
@@ -0,0 +1,44 @@
1 + [serve]
2 + interval_secs = 300
3 + prune_days = 30
4 + listen = "0.0.0.0:9100"
5 + peer_heartbeat_secs = 60
6 +
7 + [instance]
8 + name = "hetzner"
9 +
10 + [targets.mnw]
11 + label = "Makenotwork Production"
12 +
13 + [targets.mnw.health]
14 + url = "https://makenot.work/api/health"
15 + timeout_secs = 10
16 +
17 + [targets.mnw.health.expect]
18 + status_code = 200
19 + json_fields = { "status" = "operational" }
20 +
21 + [targets.mnw.health.trending]
22 + baseline_window_hours = 168
23 + spike_threshold = 2.0
24 +
25 + [targets.mnw.tls]
26 + host = "makenot.work"
27 +
28 + [targets.mnw.tests]
29 + ssh = "max@100.106.221.39"
30 + command = "/home/max/staging/run-ci.sh"
31 + timeout_secs = 600
32 + staleness_days = 7
33 +
34 + [peers.astra]
35 + address = "100.106.221.39:9100"
36 + on_missing = "alert"
37 +
38 + [peers.macbook]
39 + address = "100.100.246.136:9100"
40 + on_missing = "log"
41 +
42 + [alerts]
43 + # postmark_token = "" # set in production — omit for dev mode (log only)
44 + to = "pom-alerts@makenot.work"
M pom.toml +1
@@ -1,6 +1,7 @@
1 1 [serve]
2 2 interval_secs = 300
3 3 prune_days = 30
4 + listen = "127.0.0.1:9100"
4 5
5 6 [targets.mnw]
6 7 label = "Makenotwork Production"
A src/alerts.rs +438
@@ -0,0 +1,438 @@
1 + //! Email alerting via Postmark API.
2 + //!
3 + //! Sends alerts on health status transitions, TLS certificate problems,
4 + //! latency drift, and peer disappearance/recovery.
5 + //! If no `postmark_token` is configured, alerts are logged via `tracing` instead.
6 +
7 + use sqlx::SqlitePool;
8 + use tracing::{info, warn};
9 +
10 + use crate::config::AlertConfig;
11 + use crate::db;
12 +
13 + /// Sends alert notifications and records each one through `db::insert_alert`.
14 + ///
15 + /// Failure-type alerts are rate limited per alert key (`health:<target>`,
16 + /// `tls:<target>`, `peer:<name>`, `latency:<target>`). Recovery notifications
17 + /// skip the cooldown check and are always sent.
18 + #[derive(Clone)]
19 + pub struct Alerter {
20 +     config: AlertConfig,
21 +     client: reqwest::Client,
22 +     pool: SqlitePool,
23 +     instance_name: String,
24 + }
25 +
26 + impl Alerter {
27 +     /// Creates an alerter whose HTTP client uses a 10-second request timeout.
28 +     ///
29 +     /// `instance_name` is included in every alert body.
30 +     pub fn new(config: AlertConfig, pool: SqlitePool, instance_name: String) -> Self {
31 +         let client = reqwest::Client::builder()
32 +             .timeout(std::time::Duration::from_secs(10))
33 +             .build()
34 +             .unwrap_or_else(|e| {
35 +                 // Stay best-effort, but make the lost timeout visible instead of
36 +                 // falling back silently.
37 +                 warn!("failed to build alert HTTP client, using defaults: {e}");
38 +                 reqwest::Client::default()
39 +             });
40 +         Self { config, client, pool, instance_name }
41 +     }
42 +
43 +     /// Alerts on a health status transition of `target`, subject to cooldown.
44 +     ///
45 +     /// The alert is recorded under the same `health:<target>` key that the
46 +     /// cooldown check reads, so repeats inside the cooldown window are skipped.
47 +     pub async fn send_health_alert(
48 +         &self,
49 +         target: &str,
50 +         label: &str,
51 +         from_status: &str,
52 +         to_status: &str,
53 +         error: Option<&str>,
54 +     ) {
55 +         let alert_key = format!("health:{target}");
56 +         if self.is_within_cooldown(&alert_key).await {
57 +             info!("alert cooldown active for {alert_key}, skipping");
58 +             return;
59 +         }
60 +
61 +         let subject = format!("[PoM] {target}: {from_status} -> {to_status}");
62 +         let mut body = format!(
63 +             "Target: {label} ({target})\n\
64 +              Status: {from_status} -> {to_status}\n\
65 +              Instance: {}\n\
66 +              Time: {}\n",
67 +             self.instance_name,
68 +             chrono::Utc::now().to_rfc3339(),
69 +         );
70 +         if let Some(err) = error {
71 +             body.push_str(&format!("Error: {err}\n"));
72 +         }
73 +         body.push_str("\n- PoM");
74 +
75 +         self.send_email(&subject, &body).await;
76 +         // Record under the namespaced key: the bare target name would never
77 +         // match the cooldown lookup above.
78 +         self.record_alert(&alert_key, "health", Some(from_status), Some(to_status), error).await;
79 +     }
80 +
81 +     /// Notifies that `target` is operational again; never rate limited.
82 +     pub async fn send_health_recovery(
83 +         &self,
84 +         target: &str,
85 +         label: &str,
86 +         from_status: &str,
87 +     ) {
88 +         let alert_key = format!("health:{target}");
89 +         // No cooldown on recovery — always send
90 +         let subject = format!("[PoM] {target}: recovered");
91 +         let body = format!(
92 +             "Target: {label} ({target})\n\
93 +              Status: {from_status} -> operational\n\
94 +              Instance: {}\n\
95 +              Time: {}\n\n\
96 +              - PoM",
97 +             self.instance_name,
98 +             chrono::Utc::now().to_rfc3339(),
99 +         );
100 +
101 +         self.send_email(&subject, &body).await;
102 +         self.record_alert(&alert_key, "recovery", Some(from_status), Some("operational"), None).await;
103 +     }
104 +
105 +     /// Warns that the certificate for `host` expires in `days_remaining` days.
106 +     ///
107 +     /// Shares the `tls:<target>` cooldown key with [`Self::send_tls_error_alert`].
108 +     pub async fn send_tls_expiry_alert(
109 +         &self,
110 +         target: &str,
111 +         host: &str,
112 +         days_remaining: i64,
113 +         not_after: &str,
114 +     ) {
115 +         let alert_key = format!("tls:{target}");
116 +         if self.is_within_cooldown(&alert_key).await {
117 +             info!("alert cooldown active for {alert_key}, skipping");
118 +             return;
119 +         }
120 +
121 +         let subject = format!("[PoM] {target}: TLS cert expires in {days_remaining} days");
122 +         let body = format!(
123 +             "Target: {target}\n\
124 +              Host: {host}\n\
125 +              Days remaining: {days_remaining}\n\
126 +              Expires: {not_after}\n\
127 +              Instance: {}\n\
128 +              Time: {}\n\n\
129 +              - PoM",
130 +             self.instance_name,
131 +             chrono::Utc::now().to_rfc3339(),
132 +         );
133 +
134 +         self.send_email(&subject, &body).await;
135 +         self.record_alert(&alert_key, "tls_expiry", None, None, None).await;
136 +     }
137 +
138 +     /// Reports that the TLS check for `host` failed with `error`.
139 +     ///
140 +     /// Shares the `tls:<target>` cooldown key with [`Self::send_tls_expiry_alert`].
141 +     pub async fn send_tls_error_alert(
142 +         &self,
143 +         target: &str,
144 +         host: &str,
145 +         error: &str,
146 +     ) {
147 +         let alert_key = format!("tls:{target}");
148 +         if self.is_within_cooldown(&alert_key).await {
149 +             info!("alert cooldown active for {alert_key}, skipping");
150 +             return;
151 +         }
152 +
153 +         let subject = format!("[PoM] {target}: TLS check failed");
154 +         let body = format!(
155 +             "Target: {target}\n\
156 +              Host: {host}\n\
157 +              Error: {error}\n\
158 +              Instance: {}\n\
159 +              Time: {}\n\n\
160 +              - PoM",
161 +             self.instance_name,
162 +             chrono::Utc::now().to_rfc3339(),
163 +         );
164 +
165 +         self.send_email(&subject, &body).await;
166 +         self.record_alert(&alert_key, "tls_error", None, None, Some(error)).await;
167 +     }
168 +
169 +     /// Notifies that the certificate for `target` was renewed; never rate limited.
170 +     pub async fn send_tls_recovery(
171 +         &self,
172 +         target: &str,
173 +         label: &str,
174 +         days_remaining: i64,
175 +     ) {
176 +         let alert_key = format!("tls:{target}");
177 +         // No cooldown on recovery — always send
178 +         let subject = format!("[PoM] {target}: TLS cert renewed");
179 +         let body = format!(
180 +             "Target: {label} ({target})\n\
181 +              Days remaining: {days_remaining}\n\
182 +              Instance: {}\n\
183 +              Time: {}\n\n\
184 +              - PoM",
185 +             self.instance_name,
186 +             chrono::Utc::now().to_rfc3339(),
187 +         );
188 +
189 +         self.send_email(&subject, &body).await;
190 +         self.record_alert(&alert_key, "tls_recovery", None, None, None).await;
191 +     }
192 +
193 +     /// Reports a peer as missing, subject to the `peer:<name>` cooldown.
194 +     pub async fn send_peer_missing(
195 +         &self,
196 +         peer_name: &str,
197 +         address: &str,
198 +         consecutive_failures: u32,
199 +     ) {
200 +         let alert_key = format!("peer:{peer_name}");
201 +         if self.is_within_cooldown(&alert_key).await {
202 +             info!("alert cooldown active for {alert_key}, skipping");
203 +             return;
204 +         }
205 +
206 +         let subject = format!("[PoM] peer {peer_name}: missing");
207 +         let body = format!(
208 +             "Peer: {peer_name}\n\
209 +              Address: {address}\n\
210 +              Consecutive failures: {consecutive_failures}\n\
211 +              Instance: {}\n\
212 +              Time: {}\n\n\
213 +              - PoM",
214 +             self.instance_name,
215 +             chrono::Utc::now().to_rfc3339(),
216 +         );
217 +
218 +         self.send_email(&subject, &body).await;
219 +         self.record_alert(&alert_key, "peer_missing", None, None, None).await;
220 +     }
221 +
222 +     /// Notifies that a peer is reachable again; never rate limited.
223 +     pub async fn send_peer_recovery(
224 +         &self,
225 +         peer_name: &str,
226 +         address: &str,
227 +     ) {
228 +         let subject = format!("[PoM] peer {peer_name}: recovered");
229 +         let body = format!(
230 +             "Peer: {peer_name}\n\
231 +              Address: {address}\n\
232 +              Instance: {}\n\
233 +              Time: {}\n\n\
234 +              - PoM",
235 +             self.instance_name,
236 +             chrono::Utc::now().to_rfc3339(),
237 +         );
238 +
239 +         let alert_key = format!("peer:{peer_name}");
240 +         self.send_email(&subject, &body).await;
241 +         self.record_alert(&alert_key, "peer_recovery", None, None, None).await;
242 +     }
243 +
244 +     /// Reports latency drift for `target`, subject to the `latency:<target>` cooldown.
245 +     pub async fn send_latency_drift_alert(
246 +         &self,
247 +         target: &str,
248 +         label: &str,
249 +         drift_message: &str,
250 +     ) {
251 +         let alert_key = format!("latency:{target}");
252 +         if self.is_within_cooldown(&alert_key).await {
253 +             info!("alert cooldown active for {alert_key}, skipping");
254 +             return;
255 +         }
256 +
257 +         let subject = format!("[PoM] {target}: latency drift detected");
258 +         let body = format!(
259 +             "Target: {label} ({target})\n\
260 +              {drift_message}\n\
261 +              Instance: {}\n\
262 +              Time: {}\n\n\
263 +              - PoM",
264 +             self.instance_name,
265 +             chrono::Utc::now().to_rfc3339(),
266 +         );
267 +
268 +         self.send_email(&subject, &body).await;
269 +         self.record_alert(&alert_key, "latency_drift", None, None, Some(drift_message)).await;
270 +     }
271 +
272 +     /// Notifies that latency for `target` is back to normal; never rate limited.
273 +     pub async fn send_latency_recovery(
274 +         &self,
275 +         target: &str,
276 +         label: &str,
277 +     ) {
278 +         // No cooldown on recovery — always send
279 +         let alert_key = format!("latency:{target}");
280 +         let subject = format!("[PoM] {target}: latency recovered");
281 +         let body = format!(
282 +             "Target: {label} ({target})\n\
283 +              Latency returned to normal.\n\
284 +              Instance: {}\n\
285 +              Time: {}\n\n\
286 +              - PoM",
287 +             self.instance_name,
288 +             chrono::Utc::now().to_rfc3339(),
289 +         );
290 +
291 +         self.send_email(&subject, &body).await;
292 +         self.record_alert(&alert_key, "latency_recovery", None, None, None).await;
293 +     }
294 +
295 +     /// Returns `true` when the latest alert recorded for `target` is younger
296 +     /// than `cooldown_secs`.
297 +     ///
298 +     /// A failed lookup, a missing row, or an unparseable `sent_at` timestamp all
299 +     /// count as "not in cooldown", so the alert is sent rather than dropped.
300 +     ///
301 +     /// NOTE(review): recovery rows are recorded under the same key, so a recovery
302 +     /// presumably restarts the window — confirm that is intended for flapping targets.
303 +     async fn is_within_cooldown(&self, target: &str) -> bool {
304 +         let latest = match db::get_latest_alert_for_target(&self.pool, target).await {
305 +             Ok(Some(row)) => row,
306 +             _ => return false,
307 +         };
308 +
309 +         let sent_at = match chrono::DateTime::parse_from_rfc3339(&latest.sent_at) {
310 +             Ok(dt) => dt,
311 +             Err(_) => return false,
312 +         };
313 +
314 +         let elapsed = chrono::Utc::now().signed_duration_since(sent_at);
315 +         elapsed.num_seconds() < self.config.cooldown_secs as i64
316 +     }
317 +
318 +     /// Delivers one message through the Postmark email API.
319 +     ///
320 +     /// Without a configured `postmark_token` the subject and body are logged at
321 +     /// info level and no HTTP request is made. Delivery failures are logged as
322 +     /// warnings and never propagated.
323 +     async fn send_email(&self, subject: &str, body: &str) {
324 +         let Some(ref token) = self.config.postmark_token else {
325 +             info!("[dev] alert: {subject}");
326 +             info!("[dev] {body}");
327 +             return;
328 +         };
329 +
330 +         let payload = serde_json::json!({
331 +             "From": self.config.from,
332 +             "To": self.config.to,
333 +             "Subject": subject,
334 +             "TextBody": body,
335 +         });
336 +
337 +         match self.client
338 +             .post("https://api.postmarkapp.com/email")
339 +             .header("X-Postmark-Server-Token", token)
340 +             .header("Content-Type", "application/json")
341 +             .header("Accept", "application/json")
342 +             .json(&payload)
343 +             .send()
344 +             .await
345 +         {
346 +             Ok(resp) if resp.status().is_success() => {
347 +                 info!("alert sent: {subject}");
348 +             }
349 +             Ok(resp) => {
350 +                 let status = resp.status();
351 +                 let text = resp.text().await.unwrap_or_default();
352 +                 warn!("postmark error ({status}): {text}");
353 +             }
354 +             Err(e) => {
355 +                 warn!("failed to send alert: {e}");
356 +             }
357 +         }
358 +     }
359 +
360 +     /// Persists one alert row; a database error is logged and otherwise ignored.
361 +     async fn record_alert(
362 +         &self,
363 +         target: &str,
364 +         alert_type: &str,
365 +         from_status: Option<&str>,
366 +         to_status: Option<&str>,
367 +         error: Option<&str>,
368 +     ) {
369 +         if let Err(e) = db::insert_alert(&self.pool, target, alert_type, from_status, to_status, error).await {
370 +             warn!("failed to record alert: {e}");
371 +         }
372 +     }
373 + }
374 +
375 + #[cfg(test)]
376 + mod tests {
377 +     use super::*;
378 +
379 +     fn test_alerter(pool: SqlitePool) -> Alerter {
380 +         let config = AlertConfig {
381 +             postmark_token: None, // dev mode
382 +             to: "test@example.com".to_string(),
383 +             from: "PoM Alerts <pom-alerts@makenot.work>".to_string(),
384 +             cooldown_secs: 300,
385 +         };
386 +         Alerter::new(config, pool, "test-instance".to_string())
387 +     }
388 +
389 +     #[tokio::test]
390 +     async fn cooldown_prevents_duplicate_alerts() {
391 +         let pool = db::connect_in_memory().await.unwrap();
392 +         let alerter = test_alerter(pool.clone());
393 +
394 +         // First alert — not in cooldown
395 +         assert!(!alerter.is_within_cooldown("health:mnw").await);
396 +
397 +         // Record an alert
398 +         db::insert_alert(&pool, "health:mnw", "health", Some("operational"), Some("error"), None)
399 +             .await
400 +             .unwrap();
401 +
402 +         // Now should be in cooldown
403 +         assert!(alerter.is_within_cooldown("health:mnw").await);
404 +     }
405 +
406 +     #[tokio::test]
407 +     async fn cooldown_does_not_affect_other_targets() {
408 +         let pool = db::connect_in_memory().await.unwrap();
409 +         let alerter = test_alerter(pool.clone());
410 +
411 +         db::insert_alert(&pool, "health:mnw", "health", None, None, None)
412 +             .await
413 +             .unwrap();
414 +
415 +         // Different target should not be in cooldown
416 +         assert!(!alerter.is_within_cooldown("health:other").await);
417 +     }
418 +
419 +     #[tokio::test]
420 +     async fn dev_mode_does_not_send_http() {
421 +         let pool = db::connect_in_memory().await.unwrap();
422 +         let alerter = test_alerter(pool.clone());
423 +
424 +         // This should log instead of making HTTP calls (no panic, no error)
425 +         alerter.send_health_alert("mnw", "MakeNotWork", "operational", "error", None).await;
426 +
427 +         // Verify alert was recorded in DB under the namespaced cooldown key
428 +         let latest = db::get_latest_alert_for_target(&pool, "health:mnw").await.unwrap();
429 +         assert!(latest.is_some());
430 +         let row = latest.unwrap();
431 +         assert_eq!(row.alert_type, "health");
432 +         assert_eq!(row.from_status.as_deref(), Some("operational"));
433 +         assert_eq!(row.to_status.as_deref(), Some("error"));
434 +
435 +         // The recorded alert must arm the cooldown for the next health alert
436 +         assert!(alerter.is_within_cooldown("health:mnw").await);
437 +     }
438 + }
A src/api.rs +411
@@ -0,0 +1,411 @@
1 + //! HTTP API for serve mode — exposes health check data to consumers like MNW.
2 +
3 + use std::collections::HashMap;
4 + use std::sync::Arc;
5 +
6 + use axum::extract::{Path, State as AxumState};
7 + use axum::http::StatusCode;
8 + use axum::response::IntoResponse;
9 + use axum::routing::get;
10 + use axum::{Json, Router};
11 + use serde::Serialize;
12 +
13 + use crate::checks::http::compute_test_staleness;
14 + use crate::config::Config;
15 + use crate::db;
16 + use crate::peer::SharedMeshState;
17 + use crate::types::{HealthSnapshot, LatencyBucket, LatencyStats, TestStaleness};
18 +
19 + /// Shared state for the API server; `router` attaches it and handlers read it through axum's `State` extractor.
20 + #[derive(Clone)]
21 + pub struct ApiState {
22 + pub pool: sqlx::SqlitePool, // SQLite pool the handlers pass to the `db::` query helpers
23 + pub config: Arc<Config>, // target configuration; `router` wraps the owned `Config` in an `Arc`
24 + pub mesh: Option<SharedMeshState>, // when `None`, the peer and mesh endpoints answer 503
25 + }
26 +
27 + /// Assemble the PoM API router.
28 + ///
29 + /// Wraps `config` in an `Arc`, bundles it with `pool` and `mesh` into an
30 + /// `ApiState`, and registers every `GET` endpoint against that state.
31 + pub fn router(pool: sqlx::SqlitePool, config: Config, mesh: Option<SharedMeshState>) -> Router {
32 +     let config = Arc::new(config);
33 +
34 +     // Target status and trend endpoints.
35 +     let app = Router::new();
36 +     let app = app.route("/api/status", get(status_all));
37 +     let app = app.route("/api/status/{target}", get(status_target));
38 +     let app = app.route("/api/trends/{target}", get(trends));
39 +
40 +     // Peer mesh endpoints.
41 +     let app = app.route("/api/peer/info", get(peer_info));
42 +     let app = app.route("/api/peer/status", get(peer_status));
43 +     let app = app.route("/api/mesh", get(mesh_view));
44 +
45 +     app.with_state(ApiState { pool, config, mesh })
46 + }
44 +
45 + // --- Response types ---
46 +
47 + #[derive(Serialize)] // JSON body returned by `status_all` (`GET /api/status`)
48 + struct StatusResponse {
49 + targets: HashMap<String, TargetStatus>, // one entry per configured target, keyed by target name
50 + }
51 +
52 + #[derive(Serialize)] // per-target payload assembled by `build_target_status`
53 + struct TargetStatus {
54 + label: String, // label passed in by the caller (taken from the target's config entry)
55 + latest: Option<SnapshotJson>, // first row of the fetched history, if any
56 + recent: Vec<SnapshotJson>, // history rows fetched with a limit of 10
57 + uptime_24h: Option<f64>, // `db::get_uptime_percent` over 24 hours; `None` on query error
58 + uptime_7d: Option<f64>, // `db::get_uptime_percent` over 168 hours; `None` on query error
59 + #[serde(skip_serializing_if = "Option::is_none")]
60 + latency_24h: Option<LatencyStats>, // stats over last-24h response times greater than 0
61 + #[serde(skip_serializing_if = "Option::is_none")]
62 + tls: Option<db::TlsCheckRow>, // latest TLS check row, omitted from JSON when absent
63 + #[serde(skip_serializing_if = "Option::is_none")]
64 + test_staleness: Option<TestStaleness>, // only computed for targets that have a `tests` config
65 + #[serde(skip_serializing_if = "Option::is_none")]
66 + current_incident: Option<db::IncidentRow>, // result of `db::get_open_incident`
67 + #[serde(skip_serializing_if = "Vec::is_empty")]
68 + incidents: Vec<db::IncidentRow>, // `db::get_recent_incidents` with a limit of 10; omitted when empty
69 + }
70 +
71 + #[derive(Serialize)] // JSON-facing form of a `HealthSnapshot` (see the `From` impl)
72 + struct SnapshotJson {
73 + status: String, // snapshot status rendered with `to_string()`
74 + checked_at: String, // copied unchanged from the snapshot
75 + response_time_ms: i64, // copied unchanged from the snapshot
76 + #[serde(skip_serializing_if = "Option::is_none")]
77 + details: Option<serde_json::Value>, // snapshot details converted with `serde_json::to_value`
78 + #[serde(skip_serializing_if = "Option::is_none")]
79 + error: Option<String>, // copied unchanged from the snapshot; omitted from JSON when `None`
80 + }
81 +
82 + /// Converts a stored `HealthSnapshot` into the shape exposed over the API.
83 + impl From<HealthSnapshot> for SnapshotJson {
84 +     fn from(snapshot: HealthSnapshot) -> Self {
85 +         let status = snapshot.status.to_string();
86 +
87 +         // If the details fail to convert, the field still stays `Some`, holding
88 +         // the default `serde_json::Value`, rather than being dropped.
89 +         let details = match snapshot.details {
90 +             Some(raw) => Some(serde_json::to_value(raw).unwrap_or_default()),
91 +             None => None,
92 +         };
93 +
94 +         SnapshotJson {
95 +             status,
96 +             checked_at: snapshot.checked_at,
97 +             response_time_ms: snapshot.response_time_ms,
98 +             details,
99 +             error: snapshot.error,
100 +         }
101 +     }
102 + }
93 +
94 + /// Assemble the full `TargetStatus` payload for one target.
95 + ///
96 + /// Every database lookup is best-effort: a failed query degrades to an empty
97 + /// list or `None` instead of failing the whole response.
98 + async fn build_target_status(
99 +     pool: &sqlx::SqlitePool,
100 +     name: &str,
101 +     label: &str,
102 +     config: &Config,
103 + ) -> TargetStatus {
104 +     let history = db::get_health_history(pool, Some(name), 10)
105 +         .await
106 +         .unwrap_or_default();
107 +
108 +     // The first history row is treated as the latest snapshot.
109 +     let newest = history.first().cloned();
110 +     let latest = newest.clone().map(SnapshotJson::from);
111 +     let recent: Vec<SnapshotJson> = history.into_iter().map(SnapshotJson::from).collect();
112 +
113 +     // Uptime over 24 hours and over 168 hours (7 days).
114 +     let uptime_24h = db::get_uptime_percent(pool, name, 24).await.ok().flatten();
115 +     let uptime_7d = db::get_uptime_percent(pool, name, 168).await.ok().flatten();
116 +
117 +     // Latency stats over the last 24 hours, using only samples with a positive
118 +     // response time.
119 +     let since = (chrono::Utc::now() - chrono::Duration::hours(24)).to_rfc3339();
120 +     let samples = db::get_response_times(pool, name, &since)
121 +         .await
122 +         .unwrap_or_default();
123 +     let positive_ms: Vec<i64> = samples
124 +         .iter()
125 +         .filter_map(|(_, ms)| (*ms > 0).then_some(*ms))
126 +         .collect();
127 +     let latency_24h = LatencyStats::from_times(&positive_ms);
128 +
129 +     let tls = db::get_latest_tls_check(pool, name).await.ok().flatten();
130 +
131 +     // Staleness is only reported for targets that carry a `tests` config.
132 +     let test_staleness = if let Some(target_config) = config.get_target(name)
133 +         && let Some(tests_config) = &target_config.tests
134 +     {
135 +         let deployed_version = newest
136 +             .as_ref()
137 +             .and_then(|snap| snap.details.as_ref())
138 +             .and_then(|details| details.version.clone());
139 +
140 +         let last_run = db::get_latest_test_run(pool, name).await.ok().flatten();
141 +
142 +         // Version looked up at the time the last test run started.
143 +         let tested_version = match last_run.as_ref() {
144 +             Some(run) => db::get_version_at_time(pool, name, &run.started_at)
145 +                 .await
146 +                 .ok()
147 +                 .flatten(),
148 +             None => None,
149 +         };
150 +
151 +         Some(compute_test_staleness(
152 +             deployed_version.as_deref(),
153 +             tested_version.as_deref(),
154 +             last_run.as_ref().map(|run| run.started_at.as_str()),
155 +             tests_config.staleness_days,
156 +         ))
157 +     } else {
158 +         None
159 +     };
160 +
161 +     let current_incident = db::get_open_incident(pool, name).await.ok().flatten();
162 +
163 +     let incidents = db::get_recent_incidents(pool, name, 10)
164 +         .await
165 +         .unwrap_or_default();
166 +
167 +     TargetStatus {
168 +         label: label.to_string(),
169 +         latest,
170 +         recent,
171 +         uptime_24h,
172 +         uptime_7d,
173 +         latency_24h,
174 +         tls,
175 +         test_staleness,
176 +         current_incident,
177 +         incidents,
178 +     }
179 + }
184 +
185 + /// `GET /api/status` — JSON summary covering every configured target.
186 + ///
187 + /// Targets are processed one after another; a name without a matching config
188 + /// entry is skipped.
189 + async fn status_all(AxumState(state): AxumState<ApiState>) -> impl IntoResponse {
190 +     let config = &state.config;
191 +     let mut targets = HashMap::new();
192 +
193 +     for name in config.target_names() {
194 +         let Some(target_config) = config.get_target(&name) else {
195 +             continue;
196 +         };
197 +         let summary = build_target_status(&state.pool, &name, &target_config.label, config).await;
198 +         targets.insert(name, summary);
199 +     }
200 +
201 +     Json(StatusResponse { targets })
202 + }
200 +
201 + /// `GET /api/status/{target}` — JSON summary for one target.
202 + ///
203 + /// Answers 404 with an `error` body when `target` is not in the config.
204 + async fn status_target(
205 +     AxumState(state): AxumState<ApiState>,
206 +     Path(target): Path<String>,
207 + ) -> impl IntoResponse {
208 +     match state.config.get_target(&target) {
209 +         Some(target_config) => {
210 +             let summary =
211 +                 build_target_status(&state.pool, &target, &target_config.label, &state.config).await;
212 +             Ok(Json(summary))
213 +         }
214 +         None => {
215 +             let body = serde_json::json!({
216 +                 "error": format!("unknown target: {target}")
217 +             });
218 +             Err((StatusCode::NOT_FOUND, Json(body)))
219 +         }
220 +     }
221 + }
215 +
216 + // --- Peer endpoints ---
217 +
218 + /// `GET /api/peer/info` — this instance's identity as JSON.
219 + ///
220 + /// Answers 503 when the peer mesh is not enabled.
221 + async fn peer_info(AxumState(state): AxumState<ApiState>) -> impl IntoResponse {
222 +     match state.mesh.as_ref() {
223 +         Some(mesh) => {
224 +             let guard = mesh.read().await;
225 +             // A serialization failure degrades to the default JSON value.
226 +             let identity = serde_json::to_value(&guard.instance).unwrap_or_default();
227 +             Ok(Json(identity))
228 +         }
229 +         None => {
230 +             let body = serde_json::json!({
231 +                 "error": "peer mesh not enabled"
232 +             });
233 +             Err((StatusCode::SERVICE_UNAVAILABLE, Json(body)))
234 +         }
235 +     }
236 + }
231 +
232 + /// `GET /api/peer/status` — own identity, latest status per target, and a summary of each peer.
233 + ///
234 + /// Answers 503 when the peer mesh is not enabled.
235 + async fn peer_status(AxumState(state): AxumState<ApiState>) -> impl IntoResponse {
236 +     let Some(mesh) = state.mesh.as_ref() else {
237 +         let body = serde_json::json!({
238 +             "error": "peer mesh not enabled"
239 +         });
240 +         return Err((StatusCode::SERVICE_UNAVAILABLE, Json(body)));
241 +     };
242 +
243 +     // Copy what is needed out of the mesh state inside this scope so the read
244 +     // guard is released before any database query runs.
245 +     let (instance, peers) = {
246 +         let guard = mesh.read().await;
247 +         let mut summaries: HashMap<String, serde_json::Value> = HashMap::new();
248 +         for (peer_name, peer) in guard.peers.iter() {
249 +             let summary = serde_json::json!({
250 +                 "status": peer.status,
251 +                 "last_seen": peer.last_seen,
252 +                 "latency_ms": peer.latency_ms,
253 +             });
254 +             summaries.insert(peer_name.clone(), summary);
255 +         }
256 +         (guard.instance.clone(), summaries)
257 +     };
258 +
259 +     // Latest health row per configured target; targets without one are left out.
260 +     let mut targets = HashMap::new();
261 +     for name in state.config.target_names() {
262 +         let Some(target_config) = state.config.get_target(&name) else {
263 +             continue;
264 +         };
265 +         let Ok(Some(latest)) = db::get_latest_health(&state.pool, &name).await else {
266 +             continue;
267 +         };
268 +         let entry = serde_json::json!({
269 +             "label": target_config.label,
270 +             "status": latest.status.to_string(),
271 +             "response_time_ms": latest.response_time_ms,
272 +             "checked_at": latest.checked_at,
273 +         });
274 +         targets.insert(name, entry);
275 +     }
276 +
277 +     let body = serde_json::json!({
278 +         "instance": instance,
279 +         "targets": targets,
280 +         "peers": peers,
281 +     });
282 +     Ok(Json(body))
283 + }
277 +
278 + /// `GET /api/mesh` — aggregated view: this instance plus each peer's cached status.
279 + ///
280 + /// Answers 503 when the peer mesh is not enabled. A peer without cached
281 + /// status data is represented by a fallback object carrying an `error` note.
282 + async fn mesh_view(AxumState(state): AxumState<ApiState>) -> impl IntoResponse {
283 +     let Some(mesh) = state.mesh.as_ref() else {
284 +         let body = serde_json::json!({
285 +             "error": "peer mesh not enabled"
286 +         });
287 +         return Err((StatusCode::SERVICE_UNAVAILABLE, Json(body)));
288 +     };
289 +
290 +     // Copy everything needed out of the mesh state inside this scope so the
291 +     // read guard is released before any database query runs.
292 +     let (instance, own_peers_json, peer_entries) = {
293 +         let guard = mesh.read().await;
294 +         let mut summaries: HashMap<String, serde_json::Value> = HashMap::new();
295 +         let mut entries: Vec<(String, Option<serde_json::Value>, serde_json::Value)> = Vec::new();
296 +         for (peer_name, peer) in guard.peers.iter() {
297 +             let summary = serde_json::json!({
298 +                 "status": peer.status,
299 +                 "last_seen": peer.last_seen,
300 +                 "latency_ms": peer.latency_ms,
301 +             });
302 +             summaries.insert(peer_name.clone(), summary);
303 +
304 +             let fallback = serde_json::json!({
305 +                 "status": peer.status,
306 +                 "last_seen": peer.last_seen,
307 +                 "error": "no status data cached",
308 +             });
309 +             entries.push((peer_name.clone(), peer.status_data.clone(), fallback));
310 +         }
311 +         (guard.instance.clone(), summaries, entries)
312 +     };
313 +
314 +     // Latest health row per configured target; targets without one are left out.
315 +     let mut targets = HashMap::new();
316 +     for name in state.config.target_names() {
317 +         let Some(target_config) = state.config.get_target(&name) else {
318 +             continue;
319 +         };
320 +         let Ok(Some(latest)) = db::get_latest_health(&state.pool, &name).await else {
321 +             continue;
322 +         };
323 +         let entry = serde_json::json!({
324 +             "label": target_config.label,
325 +             "status": latest.status.to_string(),
326 +             "response_time_ms": latest.response_time_ms,
327 +             "checked_at": latest.checked_at,
328 +         });
329 +         targets.insert(name, entry);
330 +     }
331 +
332 +     let self_entry = serde_json::json!({
333 +         "instance": instance,
334 +         "targets": targets,
335 +         "peers": own_peers_json,
336 +     });
337 +
338 +     // Own entry goes in first; peer entries follow and use the cached status
339 +     // when there is one.
340 +     let mut instances = serde_json::Map::new();
341 +     instances.insert(instance.name.clone(), self_entry);
342 +     for (peer_name, cached, fallback) in peer_entries {
343 +         let entry = match cached {
344 +             Some(status_data) => status_data,
345 +             None => fallback,
346 +         };
347 +         instances.insert(peer_name, entry);
348 +     }
349 +
350 +     let body = serde_json::json!({
351 +         "instances": instances,
352 +     });
353 +     Ok(Json(body))
354 + }
342 +
343 + // --- Trends endpoint ---
344 +
345 + #[derive(Serialize)] // JSON body returned by the `trends` handler
346 + struct TrendResponse {
347 + target: String, // target name taken from the request path
348 + window_hours: u64, // window the handler used, in hours (24 when the query omits `hours`)
349 + bucket_minutes: u64, // bucket width the handler used (60 when the query omits it)
350 + buckets: Vec<LatencyBucket>, // result of `LatencyStats::bucket_by_time` over the window
351 + overall: Option<LatencyStats>, // stats over every positive response time in the window
352 + baseline: Option<LatencyStats>, // same stats over the last 168 hours, for reference
353 + }
354 +
355 + /// `GET /api/trends/{target}?hours=24&bucket_minutes=60` — latency trend data.
356 + ///
357 + /// Answers 404 with an `error` body when `target` is not in the config.
358 + ///
359 + /// Both query parameters come straight from the client, so they are bounded
360 + /// before use: `hours` is capped at `MAX_WINDOW_HOURS` and `bucket_minutes`
361 + /// is kept within `1..=MAX_WINDOW_HOURS * 60`. The response echoes the
362 + /// effective values. Only samples with a positive response time are used.
363 + async fn trends(
364 +     AxumState(state): AxumState<ApiState>,
365 +     Path(target): Path<String>,
366 +     axum::extract::Query(params): axum::extract::Query<TrendQueryParams>,
367 + ) -> impl IntoResponse {
368 +     // Upper bound for the requested window (366 days). Without a cap, a huge
369 +     // `hours` value wraps in the `i64` cast or makes the chrono duration or
370 +     // date arithmetic panic.
371 +     const MAX_WINDOW_HOURS: u64 = 24 * 366;
372 +     // Fixed reference window for the baseline stats (7 days).
373 +     const BASELINE_HOURS: i64 = 168;
374 +
375 +     if state.config.get_target(&target).is_none() {
376 +         return Err((StatusCode::NOT_FOUND, Json(serde_json::json!({
377 +             "error": format!("unknown target: {target}")
378 +         }))));
379 +     }
380 +
381 +     let hours = params.hours.unwrap_or(24).min(MAX_WINDOW_HOURS);
382 +     // A zero-width bucket is meaningless, and a width beyond the maximum
383 +     // window adds nothing, so both ends are clamped.
384 +     let bucket_minutes = params
385 +         .bucket_minutes
386 +         .unwrap_or(60)
387 +         .clamp(1, MAX_WINDOW_HOURS * 60);
388 +
389 +     // Lossless cast: `hours` is at most `MAX_WINDOW_HOURS`.
390 +     let cutoff = (chrono::Utc::now() - chrono::Duration::hours(hours as i64)).to_rfc3339();
391 +     let times = db::get_response_times(&state.pool, &target, &cutoff)
392 +         .await
393 +         .unwrap_or_default();
394 +
395 +     // Filter once, then derive the bare millisecond list from the filtered rows.
396 +     let operational_data: Vec<(String, i64)> = times
397 +         .into_iter()
398 +         .filter(|(_, ms)| *ms > 0)
399 +         .collect();
400 +     let operational_times: Vec<i64> = operational_data.iter().map(|(_, ms)| *ms).collect();
401 +     let overall = LatencyStats::from_times(&operational_times);
402 +     let buckets = LatencyStats::bucket_by_time(&operational_data, bucket_minutes);
403 +
404 +     // 7d baseline for reference
405 +     let baseline_cutoff =
406 +         (chrono::Utc::now() - chrono::Duration::hours(BASELINE_HOURS)).to_rfc3339();
407 +     let baseline_times = db::get_response_times(&state.pool, &target, &baseline_cutoff)
408 +         .await
409 +         .unwrap_or_default();
410 +     let baseline_operational: Vec<i64> = baseline_times
411 +         .iter()
412 +         .filter(|(_, ms)| *ms > 0)
413 +         .map(|(_, ms)| *ms)
414 +         .collect();
415 +     let baseline = LatencyStats::from_times(&baseline_operational);
416 +
417 +     Ok(Json(TrendResponse {
418 +         target,
419 +         window_hours: hours,
420 +         bucket_minutes,
421 +         buckets,
422 +         overall,
423 +         baseline,
424 +     }))
425 + }
406 +
407 + #[derive(serde::Deserialize)] // query-string parameters accepted by the `trends` handler
408 + struct TrendQueryParams {
409 + hours: Option<u64>, // trend window in hours; the handler falls back to 24 when absent
410 + bucket_minutes: Option<u64>, // bucket width in minutes; the handler falls back to 60 when absent
411 + }
M src/checks/http.rs +435 -29
@@ -1,11 +1,12 @@
1 1 use std::time::Instant;
2 2
3 - use crate::config::HealthConfig;
3 + use crate::config::{HealthConfig, HealthExpectation};
4 4 use crate::types::{HealthDetails, HealthSnapshot, HealthStatus};
5 5
6 6 pub async fn check_health(
7 7 target_name: &str,
8 8 config: &HealthConfig,
9 + expect: Option<&HealthExpectation>,
9 10 ) -> HealthSnapshot {
10 11 let client = reqwest::Client::builder()
11 12 .timeout(std::time::Duration::from_secs(config.timeout_secs))
@@ -18,51 +19,46 @@ pub async fn check_health(
18 19 match client.get(&config.url).send().await {
19 20 Ok(response) => {
20 21 let response_time_ms = start.elapsed().as_millis() as i64;
21 - let status_code = response.status();
22 -
23 - match response.json::<serde_json::Value>().await {
24 - Ok(json) => {
25 - let api_status = json
26 - .get("status")
27 - .and_then(|s| s.as_str())
28 - .unwrap_or("unknown");
29 -
30 - let status = match api_status {
31 - "operational" => HealthStatus::Operational,
32 - "degraded" => HealthStatus::Degraded,
33 - _ if status_code.is_success() => HealthStatus::Degraded,
34 - _ => HealthStatus::Error,
35 - };
22 + let status_code = response.status().as_u16();
23 +
24 + match response.text().await {
25 + Ok(body) => {
26 + let json: Option<serde_json::Value> = serde_json::from_str(&body).ok();
36 27
37 - let details = HealthDetails {
38 - version: json.get("version").and_then(|v| v.as_str()).map(String::from),
39 - uptime: json.get("uptime").and_then(|v| v.as_str()).map(String::from),
40 - checks: json.get("checks").cloned(),
41 - monitoring: json.get("monitoring").cloned(),
28 + let (mut status, details, mut error) = if let Some(ref json) = json {
29 + let (s, d) = classify_json_response(status_code, json);
30 + (s, Some(d), None)
31 + } else {
32 + (classify_non_json(status_code), None, Some("Failed to parse response as JSON".to_string()))
42 33 };
43 34
35 + // Apply expectation validation
36 + if let Some(exp) = expect {
37 + let failures = validate_expectations(exp, status_code, &body, json.as_ref());
38 + if !failures.is_empty() {
39 + status = HealthStatus::Degraded;
40 + error = Some(failures.join("; "));
41 + }
42 + }
43 +
44 44 HealthSnapshot {
45 45 id: None,
46 46 target: target_name.to_string(),
47 47 status,
48 48 checked_at,
49 49 response_time_ms,
50 - details: Some(details),
51 - error: None,
50 + details,
51 + error,
52 52 }
53 53 }
54 54 Err(e) => HealthSnapshot {
55 55 id: None,
56 56 target: target_name.to_string(),
57 - status: if status_code.is_success() {
58 - HealthStatus::Degraded
59 - } else {
60 - HealthStatus::Error
61 - },
57 + status: classify_non_json(status_code),
62 58 checked_at,
63 59 response_time_ms,
64 60 details: None,
65 - error: Some(format!("Failed to parse response: {e}")),
61 + error: Some(format!("Failed to read response body: {e}")),
66 62 },
67 63 }
68 64 }
@@ -80,3 +76,524 @@ pub async fn check_health(
80 76 }
81 77 }
82 78 }
79 +
80 + /// Walk a dot-separated path through nested JSON values.
81 + ///
82 + /// Each segment is looked up as an object key. When the current value is an
83 + /// array, the segment is parsed as a zero-based index instead, so a path
84 + /// such as `checks.0.name` reaches into arrays.
85 + ///
86 + /// Returns `None` as soon as a segment cannot be resolved: a missing key, an
87 + /// out-of-range or non-numeric index on an array, or a scalar in the middle
88 + /// of the path.
89 + pub fn resolve_json_path<'a>(value: &'a serde_json::Value, path: &str) -> Option<&'a serde_json::Value> {
90 +     path.split('.').try_fold(value, |current, segment| match current {
91 +         serde_json::Value::Array(items) => segment
92 +             .parse::<usize>()
93 +             .ok()
94 +             .and_then(|index| items.get(index)),
95 +         // Objects resolve by key; every other value type yields `None`.
96 +         _ => current.get(segment),
97 +     })
98 + }
88 +
89 + /// Validate a response against expectations and describe every mismatch.
90 + ///
91 + /// Checks, in order: the expected status code, the expected body substring,
92 + /// and each expected JSON field (resolved with `resolve_json_path`; non-string
93 + /// JSON values are compared through their JSON text form).
94 + ///
95 + /// JSON fields are checked in sorted path order so the returned list, and any
96 + /// message built from it, is identical from run to run rather than following
97 + /// hash-map iteration order.
98 + ///
99 + /// Returns an empty vector when every expectation holds.
100 + pub fn validate_expectations(
101 +     expect: &HealthExpectation,
102 +     status_code: u16,
103 +     body: &str,
104 +     json: Option<&serde_json::Value>,
105 + ) -> Vec<String> {
106 +     let mut failures = Vec::new();
107 +
108 +     if let Some(expected_code) = expect.status_code
109 +         && status_code != expected_code
110 +     {
111 +         failures.push(format!("expected status {expected_code}, got {status_code}"));
112 +     }
113 +
114 +     if let Some(ref substring) = expect.body_contains
115 +         && !body.contains(substring.as_str())
116 +     {
117 +         failures.push(format!("body missing expected substring \"{substring}\""));
118 +     }
119 +
120 +     if expect.json_fields.is_empty() {
121 +         return failures;
122 +     }
123 +
124 +     // Field expectations need a parsed body; report once and stop otherwise.
125 +     let Some(json) = json else {
126 +         failures.push("expected JSON response for field validation, got non-JSON".to_string());
127 +         return failures;
128 +     };
129 +
130 +     // Sort by path for a deterministic failure order.
131 +     let mut fields: Vec<_> = expect.json_fields.iter().collect();
132 +     fields.sort_unstable_by(|a, b| a.0.cmp(b.0));
133 +
134 +     for (path, expected_value) in fields {
135 +         match resolve_json_path(json, path) {
136 +             Some(actual) => {
137 +                 // Strings compare by content; other values by their JSON text.
138 +                 let actual_str = match actual {
139 +                     serde_json::Value::String(s) => s.clone(),
140 +                     other => other.to_string(),
141 +                 };
142 +                 if actual_str != *expected_value {
143 +                     failures.push(format!("json field \"{path}\": expected \"{expected_value}\", got \"{actual_str}\""));
144 +                 }
145 +             }
146 +             None => {
147 +                 failures.push(format!("json field \"{path}\" not found"));
148 +             }
149 +         }
150 +     }
151 +
152 +     failures
153 + }
135 +
136 + /// Derive a health status and details from a JSON health response.
137 + ///
138 + /// The body's `status` string decides first: `"operational"` and `"degraded"`
139 + /// map directly. Any other or missing value is `Degraded` for a 2xx status
140 + /// code and `Error` otherwise.
141 + pub fn classify_json_response(
142 +     status_code: u16,
143 +     json: &serde_json::Value,
144 + ) -> (HealthStatus, HealthDetails) {
145 +     let reported = json
146 +         .get("status")
147 +         .and_then(serde_json::Value::as_str)
148 +         .unwrap_or("unknown");
149 +     let http_success = (200..300).contains(&status_code);
150 +
151 +     let status = if reported == "operational" {
152 +         HealthStatus::Operational
153 +     } else if reported == "degraded" || http_success {
154 +         HealthStatus::Degraded
155 +     } else {
156 +         HealthStatus::Error
157 +     };
158 +
159 +     // `version` and `uptime` are only picked up when they are JSON strings.
160 +     let text_field = |key: &str| {
161 +         json.get(key)
162 +             .and_then(serde_json::Value::as_str)
163 +             .map(String::from)
164 +     };
165 +
166 +     let details = HealthDetails {
167 +         version: text_field("version"),
168 +         uptime: text_field("uptime"),
169 +         checks: json.get("checks").cloned(),
170 +         monitoring: json.get("monitoring").cloned(),
171 +     };
172 +
173 +     (status, details)
174 + }
162 +
163 + /// Status for a response whose body is not usable as JSON: a 2xx status code
164 + /// counts as `Degraded`, anything else as `Error`.
165 + pub fn classify_non_json(status_code: u16) -> HealthStatus {
166 +     match status_code {
167 +         200..=299 => HealthStatus::Degraded,
168 +         _ => HealthStatus::Error,
169 +     }
170 + }
171 +
172 + /// Report sustained latency drift: every recent response time must exceed
173 + /// `baseline.avg_ms * threshold`.
174 + ///
175 + /// Returns a human-readable description when drift is detected and `None`
176 + /// otherwise. Nothing is reported when the baseline has fewer than 10 samples
177 + /// (to avoid false positives) or when `recent_times` is empty.
178 + pub fn detect_latency_drift(
179 +     recent_times: &[i64],
180 +     baseline: &crate::types::LatencyStats,
181 +     threshold: f64,
182 + ) -> Option<String> {
183 +     if baseline.sample_count < 10 || recent_times.is_empty() {
184 +         return None;
185 +     }
186 +
187 +     let drift_limit_ms = baseline.avg_ms * threshold;
188 +     let sustained = recent_times.iter().all(|&ms| ms as f64 > drift_limit_ms);
189 +     if !sustained {
190 +         return None;
191 +     }
192 +
193 +     let total_ms: i64 = recent_times.iter().sum();
194 +     let recent_avg_ms = total_ms as f64 / recent_times.len() as f64;
195 +     let description = format!(
196 +         "latency drift: last {} checks avg {:.0}ms (baseline avg {:.0}ms, threshold {:.0}ms)",
197 +         recent_times.len(),
198 +         recent_avg_ms,
199 +         baseline.avg_ms,
200 +         drift_limit_ms,
201 +     );
202 +     Some(description)
203 + }
200 +
201 + /// Compute test staleness from version and timing data.
202 + ///
203 + /// A target's tests are considered stale when, checked in this order:
204 + /// 1. No tests have ever been run (`last_test_at` is `None`)
205 + /// 2. Tests are at least `staleness_days` old
206 + /// 3. The deployed version differs from the version that was tested
207 + ///
208 + /// `last_test_at` is parsed as RFC 3339. If it does not parse, the age is
209 + /// unknown: `days_since_test` is `None` and only the version rule can apply.
210 + /// The version rule needs both versions to be known.
211 + pub fn compute_test_staleness(
212 +     current_version: Option<&str>,
213 +     tested_version: Option<&str>,
214 +     last_test_at: Option<&str>,
215 +     staleness_days: u64,
216 + ) -> crate::types::TestStaleness {
217 +     let Some(last_test_at) = last_test_at else {
218 +         return crate::types::TestStaleness {
219 +             stale: true,
220 +             reason: Some("no tests have been run".to_string()),
221 +             current_version: current_version.map(String::from),
222 +             tested_version: None,
223 +             last_test_at: None,
224 +             days_since_test: None,
225 +         };
226 +     };
227 +
228 +     let days_since = chrono::DateTime::parse_from_rfc3339(last_test_at)
229 +         .ok()
230 +         .map(|dt| (chrono::Utc::now() - dt.with_timezone(&chrono::Utc)).num_days());
231 +
232 +     // Saturate instead of casting: a threshold above `i64::MAX` would wrap to
233 +     // a negative number under `as i64` and mark every test run as stale.
234 +     let threshold_days = i64::try_from(staleness_days).unwrap_or(i64::MAX);
235 +
236 +     // The age rule takes precedence; the version rule is only consulted when
237 +     // the age rule does not fire.
238 +     let reason = days_since
239 +         .filter(|days| *days >= threshold_days)
240 +         .map(|days| format!("tests are {days} days old (threshold: {staleness_days}d)"))
241 +         .or_else(|| match (current_version, tested_version) {
242 +             (Some(current), Some(tested)) if current != tested => {
243 +                 Some(format!("version changed: {tested} -> {current}"))
244 +             }
245 +             _ => None,
246 +         });
247 +
248 +     crate::types::TestStaleness {
249 +         stale: reason.is_some(),
250 +         reason,
251 +         current_version: current_version.map(String::from),
252 +         tested_version: tested_version.map(String::from),
253 +         last_test_at: Some(last_test_at.to_string()),
254 +         days_since_test: days_since,
255 +     }
256 + }
266 +
267 + #[cfg(test)]
268 + mod tests {
269 + use super::*;
270 + use std::collections::HashMap;
271 +
272 +     /// An "operational" body maps to `Operational` and carries version and uptime.
273 +     #[test]
274 +     fn classify_operational() {
275 +         let payload = serde_json::json!({
276 +             "status": "operational",
277 +             "version": "2.1.0",
278 +             "uptime": "3d 12h",
279 +         });
280 +
281 +         let (outcome, info) = classify_json_response(200, &payload);
282 +
283 +         assert_eq!(outcome, HealthStatus::Operational);
284 +         assert_eq!(info.version.as_deref(), Some("2.1.0"));
285 +         assert_eq!(info.uptime.as_deref(), Some("3d 12h"));
286 +     }
284 +
285 +     /// An explicit "degraded" body stays `Degraded` even with a 200 status code.
286 +     #[test]
287 +     fn classify_degraded_explicit() {
288 +         let payload = serde_json::json!({ "status": "degraded" });
289 +         let outcome = classify_json_response(200, &payload).0;
290 +         assert_eq!(outcome, HealthStatus::Degraded);
291 +     }
291 +
292 +     /// An unrecognized status string with a 2xx code is treated as `Degraded`.
293 +     #[test]
294 +     fn classify_unknown_status_with_success_code() {
295 +         let payload = serde_json::json!({ "status": "starting_up" });
296 +         let outcome = classify_json_response(200, &payload).0;
297 +         assert_eq!(outcome, HealthStatus::Degraded);
298 +     }
298 +
299 + #[test]
300 + fn classify_unknown_status_with_error_code() {
301 + let json = serde_json::json!({ "status": "starting_up" });
302 + let (status, _) = classify_json_response(503, &json);
303 + assert_eq!(status, HealthStatus::Error);
304 + }
305 +
306 +     /// A body without `status` falls back to "unknown", which is `Degraded` on 200.
307 +     #[test]
308 +     fn classify_missing_status_field() {
309 +         let payload = serde_json::json!({ "version": "1.0.0" });
310 +
311 +         let (outcome, info) = classify_json_response(200, &payload);
312 +
313 +         assert_eq!(outcome, HealthStatus::Degraded); // "unknown" falls through
314 +         assert_eq!(info.version.as_deref(), Some("1.0.0"));
315 +     }
313 +
314 +     /// `checks` and `monitoring` objects are carried over into the details.
315 +     #[test]
316 +     fn classify_extracts_checks_and_monitoring() {
317 +         let payload = serde_json::json!({
318 +             "status": "operational",
319 +             "checks": { "db": "ok", "redis": "ok" },
320 +             "monitoring": { "external": true },
321 +         });
322 +
323 +         let info = classify_json_response(200, &payload).1;
324 +
325 +         assert!(info.checks.is_some());
326 +         assert!(info.monitoring.is_some());
327 +     }
325 +
326 +     /// 2xx codes without a JSON body classify as `Degraded`.
327 +     #[test]
328 +     fn classify_non_json_success() {
329 +         for code in [200, 204] {
330 +             assert_eq!(classify_non_json(code), HealthStatus::Degraded);
331 +         }
332 +     }
331 +
332 + #[test]
333 + fn classify_non_json_error() {
334 + assert_eq!(classify_non_json(500), HealthStatus::Error);
335 + assert_eq!(classify_non_json(404), HealthStatus::Error);
336 + }
337 +
338 + // --- resolve_json_path ---
339 +
340 +     /// A single-segment path resolves a top-level key.
341 +     #[test]
342 +     fn resolve_json_path_top_level() {
343 +         let doc = serde_json::json!({"status": "operational"});
344 +         let found = resolve_json_path(&doc, "status").unwrap();
345 +         assert_eq!(found, "operational");
346 +     }
346 +
347 +     /// A two-segment path resolves a key inside a nested object.
348 +     #[test]
349 +     fn resolve_json_path_nested() {
350 +         let doc = serde_json::json!({"checks": {"db": "ok", "redis": "warn"}});
351 +         let found = resolve_json_path(&doc, "checks.db").unwrap();
352 +         assert_eq!(found, "ok");
353 +     }
353 +
354 +     /// A three-segment path reaches a value three objects deep.
355 +     #[test]
356 +     fn resolve_json_path_deeply_nested() {
357 +         let doc = serde_json::json!({"a": {"b": {"c": 42}}});
358 +         let found = resolve_json_path(&doc, "a.b.c").unwrap();
359 +         assert_eq!(found, 42);
360 +     }
360 +
361 +     /// An absent top-level key resolves to `None`.
362 +     #[test]
363 +     fn resolve_json_path_missing() {
364 +         let doc = serde_json::json!({"status": "operational"});
365 +         let found = resolve_json_path(&doc, "missing");
366 +         assert!(found.is_none());
367 +     }
366 +
367 +     /// A path whose last segment is absent resolves to `None`.
368 +     #[test]
369 +     fn resolve_json_path_partial_missing() {
370 +         let doc = serde_json::json!({"checks": {"db": "ok"}});
371 +         let found = resolve_json_path(&doc, "checks.redis");
372 +         assert!(found.is_none());
373 +     }
372 +
373 + // --- validate_expectations ---
374 +
375 +     /// A matching status code produces no failures.
376 +     #[test]
377 +     fn validate_status_code_match() {
378 +         let rules = HealthExpectation {
379 +             status_code: Some(200),
380 +             ..Default::default()
381 +         };
382 +         let problems = validate_expectations(&rules, 200, "", None);
383 +         assert!(problems.is_empty());
384 +     }
384 +
385 +     /// A differing status code yields one failure naming both codes.
386 +     #[test]
387 +     fn validate_status_code_mismatch() {
388 +         let rules = HealthExpectation {
389 +             status_code: Some(200),
390 +             ..Default::default()
391 +         };
392 +
393 +         let problems = validate_expectations(&rules, 503, "", None);
394 +
395 +         assert_eq!(problems.len(), 1);
396 +         let message = &problems[0];
397 +         assert!(message.contains("expected status 200"));
398 +         assert!(message.contains("got 503"));
399 +     }
396 +
397 +     /// A body containing the expected substring produces no failures.
398 +     #[test]
399 +     fn validate_body_contains_match() {
400 +         let rules = HealthExpectation {
401 +             body_contains: Some("operational".to_string()),
402 +             ..Default::default()
403 +         };
404 +         let raw_body = r#"{"status":"operational"}"#;
405 +         let problems = validate_expectations(&rules, 200, raw_body, None);
406 +         assert!(problems.is_empty());
407 +     }
406 +
407 +     /// A body lacking the expected substring yields one "body missing" failure.
408 +     #[test]
409 +     fn validate_body_contains_mismatch() {
410 +         let rules = HealthExpectation {
411 +             body_contains: Some("operational".to_string()),
412 +             ..Default::default()
413 +         };
414 +         let raw_body = r#"{"status":"error"}"#;
415 +
416 +         let problems = validate_expectations(&rules, 200, raw_body, None);
417 +
418 +         assert_eq!(problems.len(), 1);
419 +         assert!(problems[0].contains("body missing"));
420 +     }
417 +
418 +     /// Top-level and nested JSON fields that all match produce no failures.
419 +     #[test]
420 +     fn validate_json_fields_match() {
421 +         let wanted = HashMap::from([
422 +             ("status".to_string(), "operational".to_string()),
423 +             ("checks.db".to_string(), "ok".to_string()),
424 +         ]);
425 +         let rules = HealthExpectation {
426 +             json_fields: wanted,
427 +             ..Default::default()
428 +         };
429 +         let doc = serde_json::json!({"status": "operational", "checks": {"db": "ok"}});
430 +
431 +         let problems = validate_expectations(&rules, 200, "", Some(&doc));
432 +
433 +         assert!(problems.is_empty());
434 +     }
431 +
432 +     /// A JSON field with the wrong value yields one failure quoting both values.
433 +     #[test]
434 +     fn validate_json_fields_mismatch() {
435 +         let wanted = HashMap::from([("status".to_string(), "operational".to_string())]);
436 +         let rules = HealthExpectation {
437 +             json_fields: wanted,
438 +             ..Default::default()
439 +         };
440 +         let doc = serde_json::json!({"status": "degraded"});
441 +
442 +         let problems = validate_expectations(&rules, 200, "", Some(&doc));
443 +
444 +         assert_eq!(problems.len(), 1);
445 +         let message = &problems[0];
446 +         assert!(message.contains("expected \"operational\""));
447 +         assert!(message.contains("got \"degraded\""));
448 +     }
446 +
447 + #[test]
448 + fn validate_json_field_missing() {
449 + let mut fields = HashMap::new();
450 + fields.insert("checks.redis".to_string(), "ok".to_string());
451 + let expect = HealthExpectation {
452 + json_fields: fields,
453 + ..Default::default()
454 + };
455 + let json = serde_json::json!({"checks": {"db": "ok"}});
456 + let failures = validate_expectations(&expect, 200, "", Some(&json));
457 + assert_eq!(failures.len(), 1);
458 + assert!(failures[0].contains("not found"));
459 + }
460 +
461 +     /// Field expectations against a non-JSON body yield one "non-JSON" failure.
462 +     #[test]
463 +     fn validate_json_fields_on_non_json() {
464 +         let wanted = HashMap::from([("status".to_string(), "ok".to_string())]);
465 +         let rules = HealthExpectation {
466 +             json_fields: wanted,
467 +             ..Default::default()
468 +         };
469 +
470 +         let problems = validate_expectations(&rules, 200, "not json", None);
471 +
472 +         assert_eq!(problems.len(), 1);
473 +         assert!(problems[0].contains("non-JSON"));
474 +     }
473 +
474 +     /// Status code, body substring, and JSON field each report their own failure.
475 +     #[test]
476 +     fn validate_mixed_failures() {
477 +         let wanted = HashMap::from([("status".to_string(), "operational".to_string())]);
478 +         let rules = HealthExpectation {
479 +             status_code: Some(200),
480 +             body_contains: Some("healthy".to_string()),
481 +             json_fields: wanted,
482 +         };
483 +         let doc = serde_json::json!({"status": "degraded"});
484 +         let raw_body = r#"{"status":"degraded"}"#;
485 +
486 +         let problems = validate_expectations(&rules, 503, raw_body, Some(&doc));
487 +
488 +         assert_eq!(problems.len(), 3); // status code + body + json field
489 +     }
487 +
488 + #[test]
Lines truncated
@@ -1,3 +1,4 @@
1 1 pub mod http;
2 2 pub mod parse;
3 3 pub mod ssh;
4 + pub mod tls;
@@ -0,0 +1,153 @@
1 + //! TLS certificate probing — connect to a host, inspect the leaf cert, track expiry.
2 +
3 + use std::sync::Arc;
4 +
5 + use tokio::net::TcpStream;
6 + use tokio_rustls::rustls;
7 + use tokio_rustls::TlsConnector;
8 +
9 + use crate::config::TlsConfig;
10 + use crate::types::TlsStatus;
11 +
12 + /// Connect to host:port, complete TLS handshake, and extract leaf cert fields.
13 + pub async fn check_tls(target_name: &str, config: &TlsConfig) -> TlsStatus {
14 + let checked_at = chrono::Utc::now().to_rfc3339();
15 + let addr = format!("{}:{}", config.host, config.port);
16 +
17 + // TCP connect with timeout
18 + let tcp = match tokio::time::timeout(
19 + std::time::Duration::from_secs(10),
20 + TcpStream::connect(&addr),
21 + )
22 + .await
23 + {
24 + Ok(Ok(stream)) => stream,
25 + Ok(Err(e)) => return tls_error(target_name, config, &checked_at, &format!("TCP connect failed: {e}")),
26 + Err(_) => return tls_error(target_name, config, &checked_at, "TCP connect timed out"),
27 + };
28 +
29 + // Build rustls config with webpki trust store
30 + let mut root_store = rustls::RootCertStore::empty();
31 + root_store.extend(webpki_roots::TLS_SERVER_ROOTS.iter().cloned());
32 + let tls_config = rustls::ClientConfig::builder()
33 + .with_root_certificates(root_store)
34 + .with_no_client_auth();
35 +
36 + let connector = TlsConnector::from(Arc::new(tls_config));
37 + let server_name = match rustls_pki_types::ServerName::try_from(config.host.clone()) {
38 + Ok(name) => name,
39 + Err(e) => return tls_error(target_name, config, &checked_at, &format!("invalid server name: {e}")),
40 + };
41 +
42 + // TLS handshake with timeout
43 + let tls_stream = match tokio::time::timeout(
44 + std::time::Duration::from_secs(10),
45 + connector.connect(server_name, tcp),
46 + )
47 + .await
48 + {
49 + Ok(Ok(stream)) => stream,
50 + Ok(Err(e)) => return tls_error(target_name, config, &checked_at, &format!("TLS handshake failed: {e}")),
51 + Err(_) => return tls_error(target_name, config, &checked_at, "TLS handshake timed out"),
52 + };
53 +
54 + // Extract peer certificates
55 + let (_io, client_conn) = tls_stream.into_inner();
56 + let certs = match client_conn.peer_certificates() {
57 + Some(certs) if !certs.is_empty() => certs,
58 + _ => return tls_error(target_name, config, &checked_at, "no peer certificates"),
59 + };
60 +
61 + // Parse the leaf (first) certificate
62 + parse_leaf_cert(target_name, config, &checked_at, certs[0].as_ref())
63 + }
64 +
65 + /// Parse DER-encoded leaf cert bytes into a TlsStatus.
66 + pub fn parse_leaf_cert(
67 + target_name: &str,
68 + config: &TlsConfig,
69 + checked_at: &str,
70 + der_bytes: &[u8],
71 + ) -> TlsStatus {
72 + use x509_parser::prelude::FromDer;
73 + let (_, cert) = match x509_parser::prelude::X509Certificate::from_der(der_bytes) {
74 + Ok(result) => result,
75 + Err(e) => return tls_error(target_name, config, checked_at, &format!("cert parse error: {e}")),
76 + };
77 +
78 + let not_before_ts = cert.validity().not_before.timestamp();
79 + let not_after_ts = cert.validity().not_after.timestamp();
80 +
81 + let now = chrono::Utc::now();
82 + let not_after_chrono = chrono::DateTime::from_timestamp(not_after_ts, 0)
83 + .unwrap_or(now);
84 + let not_before_chrono = chrono::DateTime::from_timestamp(not_before_ts, 0)
85 + .unwrap_or(now);
86 + let days_remaining = (not_after_chrono - now).num_days();
87 +
88 + let subject = cert.subject().to_string();
89 + let issuer = cert.issuer().to_string();
90 +
91 + TlsStatus {
92 + target: target_name.to_string(),
93 + host: config.host.clone(),
94 + port: config.port,
95 + valid: days_remaining > 0,
96 + days_remaining,
97 + not_before: not_before_chrono.to_rfc3339(),
98 + not_after: not_after_chrono.to_rfc3339(),
99 + subject,
100 + issuer,
101 + checked_at: checked_at.to_string(),
102 + error: None,
103 + }
104 + }
105 +
106 + fn tls_error(target_name: &str, config: &TlsConfig, checked_at: &str, error: &str) -> TlsStatus {
107 + TlsStatus {
108 + target: target_name.to_string(),
109 + host: config.host.clone(),
110 + port: config.port,
111 + valid: false,
112 + days_remaining: 0,
113 + not_before: String::new(),
114 + not_after: String::new(),
115 + subject: String::new(),
116 + issuer: String::new(),
117 + checked_at: checked_at.to_string(),
118 + error: Some(error.to_string()),
119 + }
120 + }
121 +
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixed timestamp used wherever a `checked_at` value is required.
    const CHECKED_AT: &str = "2026-03-11T00:00:00Z";

    /// Minimal TLS probe configuration shared by the tests below.
    fn test_config() -> TlsConfig {
        TlsConfig {
            host: "example.com".to_string(),
            port: 443,
            warn_days: 14,
        }
    }

    /// Garbage bytes must come back as an invalid status with a parse error.
    #[test]
    fn parse_leaf_cert_with_invalid_der() {
        let status = parse_leaf_cert("test", &test_config(), CHECKED_AT, b"not-a-cert");

        assert!(!status.valid);
        let message = status.error.as_deref().unwrap();
        assert!(message.contains("cert parse error"));
    }

    /// The error constructor echoes identity fields and zeroes the rest.
    #[test]
    fn tls_error_populates_all_fields() {
        let status = tls_error("test", &test_config(), CHECKED_AT, "connection refused");

        assert_eq!(status.target, "test");
        assert_eq!(status.host, "example.com");
        assert_eq!(status.port, 443);
        assert!(!status.valid);
        assert_eq!(status.days_remaining, 0);
        assert_eq!(status.error.as_deref(), Some("connection refused"));
    }
}
A src/cli.rs +500
@@ -0,0 +1,564 @@
1 + //! CLI command handlers for PoM subcommands.
2 +
3 + use clap::Subcommand;
4 + use tracing::info;
5 +
6 + use pom::alerts::Alerter;
7 + use pom::checks::{http, ssh, tls};
8 + use pom::config::Config;
9 + use pom::db;
10 + use pom::display;
11 + use pom::error::{PomError, Result};
12 + use pom::peer;
13 + use pom::types::LatencyStats;
14 + use pom::types::TestStaleness;
15 +
16 + #[derive(Subcommand)]
17 + pub(crate) enum HistoryKind {
18 + /// Health check history
19 + Health {
20 + /// Filter by target
21 + target: Option<String>,
22 + /// Number of results
23 + #[arg(short, default_value = "10")]
24 + n: i64,
25 + /// Output as JSON
26 + #[arg(long)]
27 + json: bool,
28 + },
29 + /// Test run history
30 + Tests {
31 + /// Filter by target
32 + target: Option<String>,
33 + /// Number of results
34 + #[arg(short, default_value = "10")]
35 + n: i64,
36 + /// Output as JSON
37 + #[arg(long)]
38 + json: bool,
39 + },
40 + }
41 +
42 + pub(crate) async fn cmd_health(
43 + pool: &sqlx::SqlitePool,
44 + config: &Config,
45 + target: Option<&str>,
46 + json: bool,
47 + ) -> Result<()> {
48 + let targets: Vec<String> = match target {
49 + Some(t) => {
50 + if config.get_target(t).is_none() {
51 + eprintln!("Unknown target: {t}");
52 + std::process::exit(1);
53 + }
54 + vec![t.to_string()]
55 + }
56 + None => config.target_names(),
57 + };
58 +
59 + let mut snapshots = Vec::new();
60 +
61 + for name in &targets {
62 + let target_config = config.get_target(name).unwrap();
63 + if let Some(health_config) = &target_config.health {
64 + let snapshot = http::check_health(name, health_config, health_config.expect.as_ref()).await;
65 + db::insert_health_check(pool, &snapshot).await?;
66 + snapshots.push(snapshot);
67 + } else {
68 + eprintln!("{name}: no health endpoint configured");
69 + }
70 + }
71 +
72 + if json {
73 + println!("{}", serde_json::to_string_pretty(&snapshots)?);
74 + } else {
75 + print!("{}", display::format_health_snapshots(&snapshots));
76 + }
77 +
78 + Ok(())
79 + }
80 +
81 + pub(crate) async fn cmd_test(
82 + pool: &sqlx::SqlitePool,
83 + config: &Config,
84 + target_name: &str,
85 + filter: Option<&str>,
86 + json: bool,
87 + ) -> Result<()> {
88 + let target = config.get_target(target_name).ok_or_else(|| {
89 + PomError::Config(format!("Unknown target: {target_name}"))
90 + })?;
91 + let tests_config = target.tests.as_ref().ok_or_else(|| {
92 + PomError::Config(format!("Target '{target_name}' has no test configuration"))
93 + })?;
94 +
95 + eprintln!("Running tests on {target_name}...");
96 + let run = ssh::run_tests(target_name, tests_config, filter).await;
97 + db::insert_test_run(pool, &run).await?;
98 +
99 + if json {
100 + let summary = serde_json::json!({
101 + "target": run.target,
102 + "passed": run.passed,
103 + "exit_code": run.exit_code,
104 + "duration_secs": run.duration_secs,
105 + "started_at": run.started_at,
106 + "finished_at": run.finished_at,
107 + "filter": run.filter,
108 + "summary": run.summary,
109 + });
110 + println!("{}", serde_json::to_string_pretty(&summary)?);
111 + } else {
112 + print!("{}", display::format_test_result(target_name, &run));
113 + }
114 +
115 + Ok(())
116 + }
117 +
118 + pub(crate) async fn cmd_status(
119 + pool: &sqlx::SqlitePool,
120 + config: &Config,
121 + json: bool,
122 + ) -> Result<()> {
123 + let mut target_statuses = Vec::new();
124 +
125 + for name in config.target_names() {
126 + let target = config.get_target(&name).unwrap();
127 + let health = db::get_latest_health(pool, &name).await?;
128 + let tls_check = db::get_latest_tls_check(pool, &name).await?;
129 + let test = db::get_latest_test_run(pool, &name).await?;
130 + let incident = db::get_open_incident(pool, &name).await?;
131 +
132 + // Compute 24h latency stats
133 + let latency_24h = {
134 + let cutoff = (chrono::Utc::now() - chrono::Duration::hours(24)).to_rfc3339();
135 + let times = db::get_response_times(pool, &name, &cutoff).await.unwrap_or_default();
136 + let operational_times: Vec<i64> = times.iter()
137 + .filter(|(_, ms)| *ms > 0)
138 + .map(|(_, ms)| *ms)
139 + .collect();
140 + LatencyStats::from_times(&operational_times)
141 + };
142 +
143 + // Compute test staleness
144 + let staleness: Option<TestStaleness> = if let Some(tests_config) = &target.tests {
145 + let current_version = health.as_ref()
146 + .and_then(|h| h.details.as_ref())
147 + .and_then(|d| d.version.clone());
148 +
149 + let tested_version = if let Some(ref t) = test {
150 + db::get_version_at_time(pool, &name, &t.started_at).await.unwrap_or(None)
151 + } else {
152 + None
153 + };
154 +
155 + Some(http::compute_test_staleness(
156 + current_version.as_deref(),
157 + tested_version.as_deref(),
158 + test.as_ref().map(|t| t.started_at.as_str()),
159 + tests_config.staleness_days,
160 + ))
161 + } else {
162 + None
163 + };
164 +
165 + if json {
166 + target_statuses.push(serde_json::json!({
167 + "target": name,
168 + "label": target.label,
169 + "health": health,
170 + "tls": tls_check,
171 + "latency_24h": latency_24h,
172 + "last_test": test.map(|t| serde_json::json!({
173 + "passed": t.passed,
174 + "exit_code": t.exit_code,
175 + "duration_secs": t.duration_secs,
176 + "started_at": t.started_at,
177 + "summary": t.summary,
178 + })),
179 + "test_staleness": staleness,
180 + "incident": incident,
181 + }));
182 + } else {
183 + print!(
184 + "{}",
185 + display::format_status_target(
186 + &name,
187 + &target.label,
188 + health.as_ref(),
189 + latency_24h.as_ref(),
190 + tls_check.as_ref(),
191 + test.as_ref(),
192 + staleness.as_ref(),
193 + incident.as_ref(),
194 + )
195 + );
196 + }
197 + }
198 +
199 + if json {
200 + println!("{}", serde_json::to_string_pretty(&target_statuses)?);
201 + }
202 +
203 + Ok(())
204 + }
205 +
206 + pub(crate) async fn cmd_history(
207 + pool: &sqlx::SqlitePool,
208 + kind: HistoryKind,
209 + ) -> Result<()> {
210 + match kind {
211 + HistoryKind::Health { target, n, json } => {
212 + let history = db::get_health_history(pool, target.as_deref(), n).await?;
213 + if json {
214 + println!("{}", serde_json::to_string_pretty(&history)?);
215 + } else {
216 + print!("{}", display::format_health_history(&history));
217 + }
218 + }
219 + HistoryKind::Tests { target, n, json } => {
220 + let history = db::get_test_history(pool, target.as_deref(), n).await?;
221 + if json {
222 + let summaries: Vec<serde_json::Value> = history
223 + .iter()
224 + .map(|r| serde_json::json!({
225 + "id": r.id,
226 + "target": r.target,
227 + "passed": r.passed,
228 + "exit_code": r.exit_code,
229 + "duration_secs": r.duration_secs,
230 + "started_at": r.started_at,
231 + "summary": r.summary,
232 + }))
233 + .collect();
234 + println!("{}", serde_json::to_string_pretty(&summaries)?);
235 + } else {
236 + print!("{}", display::format_test_history(&history));
237 + }
238 + }
239 + }
240 +
241 + Ok(())
242 + }
243 +
244 + pub(crate) async fn cmd_prune(
245 + pool: &sqlx::SqlitePool,
246 + days: i64,
247 + ) -> Result<()> {
248 + let (health_pruned, test_pruned, heartbeat_pruned, alerts_pruned, tls_pruned, incidents_pruned) = db::prune_old_records(pool, days).await?;
249 + print!(
250 + "{}",
251 + display::format_prune(health_pruned, test_pruned, heartbeat_pruned, alerts_pruned, tls_pruned, incidents_pruned, days),
252 + );
253 + Ok(())
254 + }
255 +
256 + pub(crate) async fn cmd_serve(
257 + pool: &sqlx::SqlitePool,
258 + config: &Config,
259 + ) -> Result<()> {
260 + let default_interval = config.serve.interval_secs;
261 + let prune_days = config.serve.prune_days;
262 + let listen_addr = config.serve.listen.clone();
263 +
264 + // --- Instance identity ---
265 + let instance_id = peer::load_or_create_instance_id(config.instance.id.as_deref())?;
266 + let instance_name = config.instance_name();
267 + let instance_info = peer::InstanceInfo {
268 + id: instance_id.clone(),
269 + name: instance_name.clone(),
270 + version: env!("CARGO_PKG_VERSION").to_string(),
271 + targets: config.target_names(),
272 + started_at: chrono::Utc::now().to_rfc3339(),
273 + };
274 +
275 + // --- Alerter ---
276 + let alerter = config.alerts.as_ref().map(|alert_config| {
277 + info!("Alerts enabled (to: {})", alert_config.to);
278 + Alerter::new(alert_config.clone(), pool.clone(), instance_name.clone())
279 + });
280 +
281 + info!("Instance: {instance_name} (id={instance_id})");
282 + info!("Starting serve mode (default interval: {default_interval}s, prune: {prune_days}d)");
283 +
284 + // --- Mesh state ---
285 + let mesh = peer::new_mesh_state(instance_info, &config.peers);
286 +
287 + // Load known peer identities from DB
288 + {
289 + let mut state = mesh.write().await;
290 + for (name, peer) in state.peers.iter_mut() {
291 + if let Ok(Some(known_id)) = db::get_peer_identity(pool, name).await {
292 + peer.known_id = Some(known_id);
293 + }
294 + }
295 + }
296 +
297 + // Spawn a health check task per target
298 + let mut handles = Vec::new();
299 +
300 + for name in config.target_names() {
301 + let target_config = config.get_target(&name).unwrap().clone();
302 + if let Some(health_config) = target_config.health {
303 + let interval_secs = health_config.interval_secs.unwrap_or(default_interval);
304 + let pool = pool.clone();
305 + let name = name.clone();
306 + let label = target_config.label.clone();
307 + let alerter = alerter.clone();
308 +
309 + info!("{name}: health check every {interval_secs}s");
310 +
311 + let trending_config = health_config.trending.clone();
312 +
313 + handles.push(tokio::spawn(async move {
314 + let mut interval = tokio::time::interval(
315 + std::time::Duration::from_secs(interval_secs),
316 + );
317 + let expect = health_config.expect.as_ref();
318 + let mut in_drift = false;
319 + loop {
320 + interval.tick().await;
321 + let previous = db::get_latest_health(&pool, &name).await.ok().flatten();
322 + let snapshot = http::check_health(&name, &health_config, expect).await;
323 + info!("{}: {} ({}ms)", name, snapshot.status, snapshot.response_time_ms);
324 + if let Err(e) = db::insert_health_check(&pool, &snapshot).await {
325 + tracing::error!("{name}: failed to store health check: {e}");
326 + }
327 +
328 + // Fire alerts on status transitions
329 + if let Some(ref alerter) = alerter
330 + && let Some(ref prev) = previous
331 + && prev.status != snapshot.status
332 + {
333 + let from = prev.status.to_string();
334 + let to = snapshot.status.to_string();
335 + if snapshot.status == pom::types::HealthStatus::Operational {
336 + alerter.send_health_recovery(&name, &label, &from).await;
337 + } else {
338 + alerter.send_health_alert(
339 + &name,
340 + &label,
341 + &from,
342 + &to,
343 + snapshot.error.as_deref(),
344 + ).await;
345 + }
346 + }
347 +
348 + // Track incidents on status transitions
349 + if let Some(ref prev) = previous
350 + && prev.status != snapshot.status
351 + {
352 + let prev_op = prev.status == pom::types::HealthStatus::Operational;
353 + let now_op = snapshot.status == pom::types::HealthStatus::Operational;
354 +
355 + if prev_op && !now_op {
356 + // Was operational, now unhealthy — open incident
357 + if let Err(e) = db::insert_incident(&pool, &name, &prev.status.to_string(), &snapshot.status.to_string()).await {
358 + tracing::error!("{name}: failed to open incident: {e}");
359 + }
360 + } else if !prev_op && now_op {
361 + // Was unhealthy, now operational — close incidents
362 + if let Err(e) = db::close_open_incidents(&pool, &name).await {
363 + tracing::error!("{name}: failed to close incidents: {e}");
364 + }
365 + } else {
366 + // Status changed between non-operational states — close old, open new
367 + if let Err(e) = db::close_open_incidents(&pool, &name).await {
368 + tracing::error!("{name}: failed to close incidents: {e}");
369 + }
370 + if let Err(e) = db::insert_incident(&pool, &name, &prev.status.to_string(), &snapshot.status.to_string()).await {
371 + tracing::error!("{name}: failed to open incident: {e}");
372 + }
373 + }
374 + }
375 +
376 + // Latency drift detection
377 + if let Some(ref trending) = trending_config
378 + && snapshot.status == pom::types::HealthStatus::Operational
379 + {
380 + let baseline_cutoff = (chrono::Utc::now()
381 + - chrono::Duration::hours(trending.baseline_window_hours as i64))
382 + .to_rfc3339();
383 + let baseline_data = db::get_response_times(&pool, &name, &baseline_cutoff)
384 + .await
385 + .unwrap_or_default();
386 + let operational_times: Vec<i64> = baseline_data.iter()
387 + .filter(|(_, ms)| *ms > 0)
388 + .map(|(_, ms)| *ms)
389 + .collect();
390 + let baseline = LatencyStats::from_times(&operational_times);
391 + let recent = db::get_recent_response_times(&pool, &name, 3)
392 + .await
393 + .unwrap_or_default();
394 +
395 + if let Some(ref bl) = baseline {
396 + if let Some(msg) = http::detect_latency_drift(&recent, bl, trending.spike_threshold) {
397 + if !in_drift {
398 + info!("{name}: {msg}");
399 + if let Some(ref alerter) = alerter {
400 + alerter.send_latency_drift_alert(&name, &label, &msg).await;
401 + }
402 + in_drift = true;
403 + }
404 + } else if in_drift {
405 + info!("{name}: latency drift recovered");
406 + if let Some(ref alerter) = alerter {
407 + alerter.send_latency_recovery(&name, &label).await;
408 + }
409 + in_drift = false;
410 + }
411 + }
412 + }
413 + }
414 + }));
415 + }
416 + }
417 +
418 + // Spawn TLS check tasks
419 + let tls_interval_secs = config.serve.tls_check_interval_secs;
420 + for name in config.target_names() {
421 + let target_config = config.get_target(&name).unwrap().clone();
422 + if let Some(tls_config) = target_config.tls {
423 + let pool = pool.clone();
424 + let name = name.clone();
425 + let label = target_config.label.clone();
426 + let alerter = alerter.clone();
427 + let warn_days = tls_config.warn_days;
428 +
429 + info!("{name}: TLS check every {tls_interval_secs}s (host={})", tls_config.host);
430 +
431 + handles.push(tokio::spawn(async move {
432 + let mut interval = tokio::time::interval(
433 + std::time::Duration::from_secs(tls_interval_secs),
434 + );
435 + loop {
436 + interval.tick().await;
437 + let previous = db::get_latest_tls_check(&pool, &name).await.ok().flatten();
438 + let status = tls::check_tls(&name, &tls_config).await;
439 + info!("{}: TLS {} — {}d remaining", name, if status.valid { "valid" } else { "invalid" }, status.days_remaining);
440 + if let Err(e) = db::insert_tls_check(&pool, &status).await {
441 + tracing::error!("{name}: failed to store TLS check: {e}");
442 + }
443 +
444 + // Fire alerts on TLS state transitions
445 + if let Some(ref alerter) = alerter {
446 + let was_ok = previous.as_ref().is_none_or(|p| p.valid && p.error.is_none());
447 + let now_warn = status.valid && status.days_remaining <= warn_days as i64;
448 + let now_error = !status.valid || status.error.is_some();
449 +
450 + if was_ok && now_error {
451 + alerter.send_tls_error_alert(
452 + &name,
453 + &tls_config.host,
454 + status.error.as_deref().unwrap_or("certificate invalid"),
455 + ).await;
456 + } else if was_ok && now_warn {
457 + alerter.send_tls_expiry_alert(
458 + &name,
459 + &tls_config.host,
460 + status.days_remaining,
461 + &status.not_after,
462 + ).await;
463 + } else if let Some(ref prev) = previous {
464 + let was_bad = !prev.valid || prev.error.is_some() || prev.days_remaining <= warn_days as i64;
465 + let now_ok = status.valid && status.error.is_none() && status.days_remaining > warn_days as i64;
466 + if was_bad && now_ok {
467 + alerter.send_tls_recovery(
468 + &name,
469 + &label,
470 + status.days_remaining,
471 + ).await;
472 + }
473 + }
474 + }
475 + }
476 + }));
477 + }
478 + }
479 +
480 + // Spawn daily prune task
481 + let prune_pool = pool.clone();
482 + handles.push(tokio::spawn(async move {
483 + let mut interval = tokio::time::interval(
484 + std::time::Duration::from_secs(86400),
485 + );
486 + loop {
487 + interval.tick().await;
488 + match db::prune_old_records(&prune_pool, prune_days).await {
489 + Ok((h, t, p, a, tl, inc)) => info!("Pruned {h} health checks, {t} test runs, {p} peer heartbeats, {a} alerts, {tl} TLS checks, {inc} incidents"),
490 + Err(e) => tracing::error!("Prune failed: {e}"),
491 + }
492 + }
493 + }));
494 +
495 + // Spawn peer heartbeat tasks
496 + if !config.peers.is_empty() {
497 + let heartbeat_secs = config.serve.peer_heartbeat_secs;
498 + info!("Peer mesh: {} peers, heartbeat every {heartbeat_secs}s", config.peers.len());
499 + let hb_handles = peer::spawn_heartbeat_tasks(
500 + mesh.clone(),
Lines truncated
M src/config.rs +417 -8
@@ -1,13 +1,47 @@
1 + //! TOML configuration loading and types.
2 +
1 3 use serde::Deserialize;
2 4 use std::collections::HashMap;
3 5 use std::path::{Path, PathBuf};
4 6
7 + use crate::error::{PomError, Result};
8 + use crate::peer::OnMissing;
9 +
5 10 #[derive(Debug, Clone, Deserialize)]
6 11 pub struct Config {
7 12 #[serde(default)]
8 13 pub serve: ServeConfig,
9 14 #[serde(default)]
15 + pub instance: InstanceConfig,
16 + #[serde(default)]
10 17 pub targets: HashMap<String, TargetConfig>,
18 + #[serde(default)]
19 + pub peers: HashMap<String, PeerConfig>,
20 + pub alerts: Option<AlertConfig>,
21 + }
22 +
23 + #[derive(Debug, Clone, Deserialize)]
24 + pub struct AlertConfig {
25 + pub postmark_token: Option<String>,
26 + pub to: String,
27 + #[serde(default = "default_alert_from")]
28 + pub from: String,
29 + #[serde(default = "default_cooldown_secs")]
30 + pub cooldown_secs: u64,
31 + }
32 +
33 + #[derive(Debug, Clone, Default, Deserialize)]
34 + pub struct InstanceConfig {
35 + pub name: Option<String>,
36 + pub id: Option<String>,
37 + }
38 +
39 + #[derive(Debug, Clone, Deserialize)]
40 + pub struct PeerConfig {
41 + pub address: String,
42 + #[serde(default)]
43 + pub on_missing: OnMissing,
44 + pub grace_count: Option<u32>,
11 45 }
12 46
13 47 #[derive(Debug, Clone, Deserialize)]
@@ -16,6 +50,12 @@ pub struct ServeConfig {
16 50 pub interval_secs: u64,
17 51 #[serde(default = "default_prune_days")]
18 52 pub prune_days: i64,
53 + #[serde(default = "default_listen")]
54 + pub listen: String,
55 + #[serde(default = "default_peer_heartbeat")]
56 + pub peer_heartbeat_secs: u64,
57 + #[serde(default = "default_tls_check_interval")]
58 + pub tls_check_interval_secs: u64,
19 59 }
20 60
21 61 impl Default for ServeConfig {
@@ -23,10 +63,21 @@ impl Default for ServeConfig {
23 63 Self {
24 64 interval_secs: 300,
25 65 prune_days: 30,
66 + listen: default_listen(),
67 + peer_heartbeat_secs: 60,
68 + tls_check_interval_secs: 3600,
26 69 }
27 70 }
28 71 }
29 72
/// Serde default for `peer_heartbeat_secs`: one minute.
fn default_peer_heartbeat() -> u64 {
    const ONE_MINUTE_SECS: u64 = 60;
    ONE_MINUTE_SECS
}
76 +
/// Serde default for `tls_check_interval_secs`: one hour.
fn default_tls_check_interval() -> u64 {
    const ONE_HOUR_SECS: u64 = 60 * 60;
    ONE_HOUR_SECS
}
80 +
/// Default health-check interval for serve mode: five minutes, in seconds.
fn default_serve_interval() -> u64 {
    const FIVE_MINUTES_SECS: u64 = 5 * 60;
    FIVE_MINUTES_SECS
}
@@ -35,11 +86,33 @@ fn default_prune_days() -> i64 {
35 86 30
36 87 }
37 88
/// Serde default for `listen`: loopback only, port 9100.
fn default_listen() -> String {
    String::from("127.0.0.1:9100")
}
92 +
38 93 #[derive(Debug, Clone, Deserialize)]
39 94 pub struct TargetConfig {
40 95 pub label: String,
41 96 pub health: Option<HealthConfig>,
42 97 pub tests: Option<TestsConfig>,
98 + pub tls: Option<TlsConfig>,
99 + }
100 +
101 + #[derive(Debug, Clone, Deserialize)]
102 + pub struct TlsConfig {
103 + pub host: String,
104 + #[serde(default = "default_tls_port")]
105 + pub port: u16,
106 + #[serde(default = "default_tls_warn_days")]
107 + pub warn_days: u32,
108 + }
109 +
/// Serde default for `TlsConfig::port`: the standard HTTPS port.
fn default_tls_port() -> u16 {
    const HTTPS_PORT: u16 = 443;
    HTTPS_PORT
}
113 +
/// Serde default for `TlsConfig::warn_days`: two weeks.
fn default_tls_warn_days() -> u32 {
    const TWO_WEEKS_DAYS: u32 = 2 * 7;
    TWO_WEEKS_DAYS
}
44 117
45 118 #[derive(Debug, Clone, Deserialize)]
@@ -49,6 +122,34 @@ pub struct HealthConfig {
49 122 pub timeout_secs: u64,
50 123 /// Per-target interval override for serve mode
51 124 pub interval_secs: Option<u64>,
125 + /// Response validation expectations
126 + pub expect: Option<HealthExpectation>,
127 + /// Latency trending and drift detection
128 + pub trending: Option<TrendingConfig>,
129 + }
130 +
131 + #[derive(Debug, Clone, Deserialize)]
132 + pub struct TrendingConfig {
133 + #[serde(default = "default_baseline_window_hours")]
134 + pub baseline_window_hours: u64,
135 + #[serde(default = "default_spike_threshold")]
136 + pub spike_threshold: f64,
137 + }
138 +
/// Serde default for `baseline_window_hours`: one week, in hours.
fn default_baseline_window_hours() -> u64 {
    const ONE_WEEK_HOURS: u64 = 7 * 24;
    ONE_WEEK_HOURS
}
142 +
/// Serde default for `spike_threshold`.
fn default_spike_threshold() -> f64 {
    2.0_f64
}
146 +
147 + #[derive(Debug, Clone, Deserialize, Default)]
148 + pub struct HealthExpectation {
149 + pub status_code: Option<u16>,
150 + #[serde(default)]
151 + pub json_fields: HashMap<String, String>,
152 + pub body_contains: Option<String>,
52 153 }
53 154
54 155 #[derive(Debug, Clone, Deserialize)]
@@ -57,6 +158,12 @@ pub struct TestsConfig {
57 158 pub command: String,
58 159 #[serde(default = "default_test_timeout")]
59 160 pub timeout_secs: u64,
161 + #[serde(default = "default_staleness_days")]
162 + pub staleness_days: u64,
163 + }
164 +
/// Serde default for `staleness_days`: one week.
fn default_staleness_days() -> u64 {
    const ONE_WEEK_DAYS: u64 = 7;
    ONE_WEEK_DAYS
}
61 168
62 169 fn default_health_timeout() -> u64 {
@@ -67,15 +174,26 @@ fn default_test_timeout() -> u64 {
67 174 600
68 175 }
69 176
/// Serde default for `AlertConfig::from`.
fn default_alert_from() -> String {
    String::from("PoM Alerts <pom-alerts@makenot.work>")
}
180 +
/// Serde default for `AlertConfig::cooldown_secs`: five minutes.
fn default_cooldown_secs() -> u64 {
    const FIVE_MINUTES_SECS: u64 = 5 * 60;
    FIVE_MINUTES_SECS
}
184 +
70 185 impl Config {
71 - pub fn load(path: Option<&Path>) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {
186 + pub fn load(path: Option<&Path>) -> Result<Self> {
72 187 let config_path = match path {
73 188 Some(p) => p.to_path_buf(),
74 189 None => default_config_path()?,
75 190 };
76 191
77 192 if !config_path.exists() {
78 - return Err(format!("Config file not found: {}", config_path.display()).into());
193 + return Err(PomError::Config(format!(
194 + "Config file not found: {}",
195 + config_path.display()
196 + )));
79 197 }
80 198
81 199 let contents = std::fs::read_to_string(&config_path)?;
@@ -92,16 +210,318 @@ impl Config {
92 210 names.sort();
93 211 names
94 212 }
213 +
214 + pub fn instance_name(&self) -> String {
215 + self.instance
216 + .name
217 + .clone()
218 + .unwrap_or_else(|| hostname::get().map(|h| h.to_string_lossy().into_owned()).unwrap_or_else(|_| "unknown".to_string()))
219 + }
95 220 }
96 221
97 - pub fn default_config_path() -> Result<PathBuf, Box<dyn std::error::Error + Send + Sync>> {
98 - let config_dir = dirs::config_dir().ok_or("Could not determine config directory")?;
99 - Ok(config_dir.join("pom").join("pom.toml"))
222 + pub fn default_config_path() -> Result<PathBuf> {
223 + let config_dir =
224 + dirs::config_dir().ok_or_else(|| PomError::Config("Could not determine config directory".into()));
225 + Ok(config_dir?.join("pom").join("pom.toml"))
100 226 }
101 227
102 - pub fn db_path() -> Result<PathBuf, Box<dyn std::error::Error + Send + Sync>> {
103 - let data_dir = dirs::data_local_dir().ok_or("Could not determine data directory")?;
104 - let pom_dir = data_dir.join("pom");
228 + pub fn db_path() -> Result<PathBuf> {
229 + let data_dir =
230 + dirs::data_local_dir().ok_or_else(|| PomError::Config("Could not determine data directory".into()));
231 + let pom_dir = data_dir?.join("pom");
105 232 std::fs::create_dir_all(&pom_dir)?;
106 233 Ok(pom_dir.join("pom.db"))
107 234 }
235 +
236 + #[cfg(test)]
237 + mod tests {
238 + use super::*;
239 +
240 + #[test]
241 + fn parse_full_config() {
242 + let toml = r#"
243 + [serve]
244 + interval_secs = 120
245 + listen = "0.0.0.0:9100"
246 + peer_heartbeat_secs = 30
247 +
248 + [instance]
249 + name = "hetzner"
250 +
251 + [targets.mnw]
252 + label = "MakeNotWork"
253 + [targets.mnw.health]
254 + url = "https://makenot.work/health"
255 + timeout_secs = 5
256 + [targets.mnw.tests]
257 + ssh = "hetzner"
258 + command = "cd /srv/mnw && ./ci.sh"
259 +
260 + [peers.astra]
261 + address = "100.0.0.1:9100"
262 + on_missing = "alert"
263 + grace_count = 5
264 + "#;
265 +
266 + let config: Config = toml::from_str(toml).unwrap();
267 + assert_eq!(config.serve.interval_secs, 120);
268 + assert_eq!(config.serve.listen, "0.0.0.0:9100");
269 + assert_eq!(config.serve.peer_heartbeat_secs, 30);
270 + assert_eq!(config.instance.name.as_deref(), Some("hetzner"));
271 + assert_eq!(config.target_names(), vec!["mnw"]);
272 +
273 + let mnw = config.get_target("mnw").unwrap();
274 + assert_eq!(mnw.label, "MakeNotWork");
275 + assert_eq!(mnw.health.as_ref().unwrap().timeout_secs, 5);
276 + assert_eq!(mnw.tests.as_ref().unwrap().ssh, "hetzner");
277 +
278 + let astra = config.peers.get("astra").unwrap();
279 + assert_eq!(astra.address, "100.0.0.1:9100");
280 + assert_eq!(astra.on_missing, OnMissing::Alert);
281 + assert_eq!(astra.grace_count, Some(5));
282 + }
283 +
284 + #[test]
285 + fn empty_config_uses_defaults() {
286 + let config: Config = toml::from_str("").unwrap();
287 + assert_eq!(config.serve.interval_secs, 300);
288 + assert_eq!(config.serve.prune_days, 30);
289 + assert_eq!(config.serve.listen, "127.0.0.1:9100");
290 + assert_eq!(config.serve.peer_heartbeat_secs, 60);
291 + assert!(config.targets.is_empty());
292 + assert!(config.peers.is_empty());
293 + assert!(config.instance.name.is_none());
294 + }
295 +
296 + #[test]
297 + fn peer_on_missing_defaults_to_log() {
298 + let toml = r#"
299 + [peers.test]
300 + address = "10.0.0.1:9100"
301 + "#;
302 + let config: Config = toml::from_str(toml).unwrap();
303 + let peer = config.peers.get("test").unwrap();
304 + assert_eq!(peer.on_missing, OnMissing::Log);
305 + assert_eq!(peer.grace_count, None);
306 + }
307 +
308 + #[test]
309 + fn instance_name_falls_back_to_hostname() {
310 + let config: Config = toml::from_str("").unwrap();
311 + let name = config.instance_name();
312 + assert!(!name.is_empty());
313 + }
314 +
315 + #[test]
316 + fn config_without_alerts_section() {
317 + let config: Config = toml::from_str("").unwrap();
318 + assert!(config.alerts.is_none());
319 + }
320 +
321 + #[test]
322 + fn config_with_alerts_section() {
323 + let toml = r#"
324 + [alerts]
325 + postmark_token = "test-token"
326 + to = "alerts@example.com"
327 + "#;
328 + let config: Config = toml::from_str(toml).unwrap();
329 + let alerts = config.alerts.unwrap();
330 + assert_eq!(alerts.postmark_token.as_deref(), Some("test-token"));
331 + assert_eq!(alerts.to, "alerts@example.com");
332 + assert_eq!(alerts.from, "PoM Alerts <pom-alerts@makenot.work>");
333 + assert_eq!(alerts.cooldown_secs, 300);
334 + }
335 +
336 + #[test]
337 + fn config_with_tls() {
338 + let toml = r#"
339 + [targets.mnw]
340 + label = "MakeNotWork"
341 + [targets.mnw.tls]
342 + host = "makenot.work"
343 + port = 8443
344 + warn_days = 30
345 + "#;
346 + let config: Config = toml::from_str(toml).unwrap();
347 + let mnw = config.get_target("mnw").unwrap();
348 + let tls = mnw.tls.as_ref().unwrap();
349 + assert_eq!(tls.host, "makenot.work");
350 + assert_eq!(tls.port, 8443);
351 + assert_eq!(tls.warn_days, 30);
352 + }
353 +
354 + #[test]
355 + fn config_tls_defaults() {
356 + let toml = r#"
357 + [targets.mnw]
358 + label = "MakeNotWork"
359 + [targets.mnw.tls]
360 + host = "makenot.work"
361 + "#;
362 + let config: Config = toml::from_str(toml).unwrap();
363 + let tls = config.get_target("mnw").unwrap().tls.as_ref().unwrap();
364 + assert_eq!(tls.port, 443);
365 + assert_eq!(tls.warn_days, 14);
366 + }
367 +
368 + #[test]
369 + fn config_without_tls() {
370 + let toml = r#"
371 + [targets.mnw]
372 + label = "MakeNotWork"
373 + "#;
374 + let config: Config = toml::from_str(toml).unwrap();
375 + assert!(config.get_target("mnw").unwrap().tls.is_none());
376 + }
377 +
378 + #[test]
379 + fn config_tls_check_interval_default() {
380 + let config: Config = toml::from_str("").unwrap();
381 + assert_eq!(config.serve.tls_check_interval_secs, 3600);
382 + }
383 +
384 + #[test]
385 + fn config_tls_check_interval_custom() {
386 + let toml = r#"
387 + [serve]
388 + tls_check_interval_secs = 1800
389 + "#;
390 + let config: Config = toml::from_str(toml).unwrap();
391 + assert_eq!(config.serve.tls_check_interval_secs, 1800);
392 + }
393 +
394 + #[test]
395 + fn config_with_health_expect() {
396 + let toml = r#"
397 + [targets.mnw]
398 + label = "MakeNotWork"
399 + [targets.mnw.health]
400 + url = "https://makenot.work/health"
401 + [targets.mnw.health.expect]
402 + status_code = 200
403 + body_contains = "operational"
404 + json_fields = { "status" = "operational", "checks.db" = "ok" }
405 + "#;
406 + let config: Config = toml::from_str(toml).unwrap();
407 + let expect = config.get_target("mnw").unwrap().health.as_ref().unwrap().expect.as_ref().unwrap();
408 + assert_eq!(expect.status_code, Some(200));
409 + assert_eq!(expect.body_contains.as_deref(), Some("operational"));
410 + assert_eq!(expect.json_fields.get("status").unwrap(), "operational");
411 + assert_eq!(expect.json_fields.get("checks.db").unwrap(), "ok");
412 + }
413 +
414 + #[test]
415 + fn config_health_without_expect() {
416 + let toml = r#"
417 + [targets.mnw]
418 + label = "MakeNotWork"
419 + [targets.mnw.health]
420 + url = "https://makenot.work/health"
421 + "#;
422 + let config: Config = toml::from_str(toml).unwrap();
423 + assert!(config.get_target("mnw").unwrap().health.as_ref().unwrap().expect.is_none());
424 + }
425 +
426 + #[test]
427 + fn config_with_trending() {
428 + let toml = r#"
429 + [targets.mnw]
430 + label = "MakeNotWork"
431 + [targets.mnw.health]
432 + url = "https://makenot.work/health"
433 + [targets.mnw.health.trending]
434 + baseline_window_hours = 48
435 + spike_threshold = 1.5
436 + "#;
437 + let config: Config = toml::from_str(toml).unwrap();
438 + let trending = config.get_target("mnw").unwrap().health.as_ref().unwrap().trending.as_ref().unwrap();
439 + assert_eq!(trending.baseline_window_hours, 48);
440 + assert_eq!(trending.spike_threshold, 1.5);
441 + }
442 +
443 + #[test]
444 + fn config_trending_defaults() {
445 + let toml = r#"
446 + [targets.mnw]
447 + label = "MakeNotWork"
448 + [targets.mnw.health]
449 + url = "https://makenot.work/health"
450 + [targets.mnw.health.trending]
451 + "#;
452 + let config: Config = toml::from_str(toml).unwrap();
453 + let trending = config.get_target("mnw").unwrap().health.as_ref().unwrap().trending.as_ref().unwrap();
454 + assert_eq!(trending.baseline_window_hours, 168);
455 + assert_eq!(trending.spike_threshold, 2.0);
456 + }
457 +
458 + #[test]
459 + fn config_without_trending() {
460 + let toml = r#"
461 + [targets.mnw]
462 + label = "MakeNotWork"
463 + [targets.mnw.health]
464 + url = "https://makenot.work/health"
465 + "#;
466 + let config: Config = toml::from_str(toml).unwrap();
467 + assert!(config.get_target("mnw").unwrap().health.as_ref().unwrap().trending.is_none());
468 + }
469 +
470 + #[test]
471 + fn config_health_expect_empty() {
472 + let toml = r#"
473 + [targets.mnw]
474 + label = "MakeNotWork"
475 + [targets.mnw.health]
476 + url = "https://makenot.work/health"
477 + [targets.mnw.health.expect]
478 + "#;
479 + let config: Config = toml::from_str(toml).unwrap();
480 + let expect = config.get_target("mnw").unwrap().health.as_ref().unwrap().expect.as_ref().unwrap();
481 + assert_eq!(expect.status_code, None);
482 + assert!(expect.json_fields.is_empty());
483 + assert_eq!(expect.body_contains, None);
484 + }
485 +
486 + #[test]
487 + fn config_staleness_days_default() {
488 + let toml = r#"
489 + [targets.mnw]
490 + label = "MakeNotWork"
491 + [targets.mnw.tests]
492 + ssh = "host"
493 + command = "./ci.sh"
494 + "#;
495 + let config: Config = toml::from_str(toml).unwrap();
496 + assert_eq!(config.get_target("mnw").unwrap().tests.as_ref().unwrap().staleness_days, 7);
497 + }
498 +
499 + #[test]
500 + fn config_staleness_days_custom() {
501 + let toml = r#"
502 + [targets.mnw]
503 + label = "MakeNotWork"
504 + [targets.mnw.tests]
505 + ssh = "host"
506 + command = "./ci.sh"
507 + staleness_days = 14
508 + "#;
509 + let config: Config = toml::from_str(toml).unwrap();
510 + assert_eq!(config.get_target("mnw").unwrap().tests.as_ref().unwrap().staleness_days, 14);
511 + }
512 +
513 + #[test]
514 + fn config_with_alerts_custom_defaults() {
515 + let toml = r#"
516 + [alerts]
Lines truncated
M src/db.rs +392 -33
@@ -1,10 +1,103 @@
1 + //! SQLite persistence — schema, health checks, test runs, and peer data.
2 + //!
3 + //! Uses a migration versioning system: each schema change is a numbered migration
4 + //! stored in [`MIGRATIONS`]. On startup, [`run_migrations`] checks the current
5 + //! version and runs any pending migrations. Existing databases (pre-migration)
6 + //! are detected by the presence of the `health_checks` table and marked as v1.
7 +
1 8 use sqlx::sqlite::{SqliteConnectOptions, SqlitePool, SqlitePoolOptions};
2 9 use std::path::Path;
3 10 use std::str::FromStr;
11 + use tracing::info;
4 12
5 - use crate::types::{HealthDetails, HealthSnapshot, HealthStatus, TestRun, TestSummary};
13 + use crate::error::Result;
14 + use crate::types::{HealthDetails, HealthSnapshot, HealthStatus, TestRun, TestSummary, TlsStatus};
6 15
7 - pub async fn connect(path: &Path) -> Result<SqlitePool, Box<dyn std::error::Error + Send + Sync>> {
16 + /// Each migration is a (version, description, SQL) tuple. Versions start at 1.
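17 + /// The SQL may contain multiple statements separated by semicolons; it is split naively on `;`, so no statement may contain a semicolon itself (e.g. inside a string literal).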
18 + const MIGRATIONS: &[(i64, &str, &str)] = &[
19 + (1, "initial schema", r#"
20 + CREATE TABLE IF NOT EXISTS health_checks (
21 + id INTEGER PRIMARY KEY AUTOINCREMENT,
22 + target TEXT NOT NULL,
23 + status TEXT NOT NULL,
24 + checked_at TEXT NOT NULL,
25 + response_time_ms INTEGER NOT NULL,
26 + details_json TEXT,
27 + error TEXT
28 + );
29 + CREATE TABLE IF NOT EXISTS test_runs (
30 + id INTEGER PRIMARY KEY AUTOINCREMENT,
31 + target TEXT NOT NULL,
32 + started_at TEXT NOT NULL,
33 + finished_at TEXT,
34 + duration_secs INTEGER,
35 + exit_code INTEGER,
36 + passed INTEGER NOT NULL,
37 + summary_json TEXT NOT NULL,
38 + raw_output TEXT NOT NULL,
39 + filter TEXT
40 + );
41 + CREATE TABLE IF NOT EXISTS peer_identities (
42 + peer_name TEXT PRIMARY KEY,
43 + instance_id TEXT NOT NULL,
44 + first_seen TEXT NOT NULL
45 + );
46 + CREATE TABLE IF NOT EXISTS peer_heartbeats (
47 + id INTEGER PRIMARY KEY AUTOINCREMENT,
48 + peer_name TEXT NOT NULL,
49 + status TEXT NOT NULL,
50 + latency_ms INTEGER NOT NULL,
51 + checked_at TEXT NOT NULL
52 + );
53 + CREATE INDEX IF NOT EXISTS idx_health_checks_target_id ON health_checks(target, id DESC);
54 + CREATE INDEX IF NOT EXISTS idx_health_checks_target_checked ON health_checks(target, checked_at);
55 + CREATE INDEX IF NOT EXISTS idx_test_runs_target_id ON test_runs(target, id DESC);
56 + CREATE INDEX IF NOT EXISTS idx_peer_heartbeats_peer_id ON peer_heartbeats(peer_name, id DESC);
57 + "#),
58 + (2, "add alerts table", r#"
59 + CREATE TABLE IF NOT EXISTS alerts (
60 + id INTEGER PRIMARY KEY AUTOINCREMENT,
61 + target TEXT NOT NULL,
62 + alert_type TEXT NOT NULL,
63 + from_status TEXT,
64 + to_status TEXT,
65 + sent_at TEXT NOT NULL,
66 + error TEXT
67 + );
68 + CREATE INDEX IF NOT EXISTS idx_alerts_target_sent ON alerts(target, sent_at);
69 + "#),
70 + (3, "add tls_checks table", r#"
71 + CREATE TABLE tls_checks (
72 + id INTEGER PRIMARY KEY AUTOINCREMENT,
73 + target TEXT NOT NULL,
74 + host TEXT NOT NULL,
75 + valid INTEGER NOT NULL,
76 + days_remaining INTEGER NOT NULL,
77 + not_before TEXT NOT NULL,
78 + not_after TEXT NOT NULL,
79 + subject TEXT NOT NULL,
80 + issuer TEXT NOT NULL,
81 + checked_at TEXT NOT NULL,
82 + error TEXT
83 + );
84 + CREATE INDEX idx_tls_checks_target_id ON tls_checks(target, id DESC);
85 + "#),
86 + (4, "add incidents table", r#"
87 + CREATE TABLE incidents (
88 + id INTEGER PRIMARY KEY AUTOINCREMENT,
89 + target TEXT NOT NULL,
90 + started_at TEXT NOT NULL,
91 + ended_at TEXT,
92 + duration_secs INTEGER,
93 + from_status TEXT NOT NULL,
94 + to_status TEXT NOT NULL
95 + );
96 + CREATE INDEX idx_incidents_target_id ON incidents(target, id DESC);
97 + "#),
98 + ];
99 +
100 + pub async fn connect(path: &Path) -> Result<SqlitePool> {
8 101 let opts = SqliteConnectOptions::from_str(&format!("sqlite:{}", path.display()))?
9 102 .create_if_missing(true)
10 103 .journal_mode(sqlx::sqlite::SqliteJournalMode::Wal);
@@ -14,53 +107,113 @@ pub async fn connect(path: &Path) -> Result<SqlitePool, Box<dyn std::error::Erro
14 107 .connect_with(opts)
15 108 .await?;
16 109
17 - init_schema(&pool).await?;
110 + run_migrations(&pool).await?;
18 111 Ok(pool)
19 112 }
20 113
21 - pub async fn connect_in_memory() -> Result<SqlitePool, Box<dyn std::error::Error + Send + Sync>> {
114 + pub async fn connect_in_memory() -> Result<SqlitePool> {
22 115 let opts = SqliteConnectOptions::from_str("sqlite::memory:")?;
23 116 let pool = SqlitePoolOptions::new()
24 117 .max_connections(1)
25 118 .connect_with(opts)
26 119 .await?;
27 120
28 - init_schema(&pool).await?;
121 + run_migrations(&pool).await?;
29 122 Ok(pool)
30 123 }
31 124
32 - async fn init_schema(pool: &SqlitePool) -> Result<(), sqlx::Error> {
125 + /// Run pending schema migrations. Detects pre-migration databases by checking
126 + /// for an existing `health_checks` table and stamps them as version 1 without running migration 1; later migrations still run.
127 + pub async fn run_migrations(pool: &SqlitePool) -> Result<()> {
128 + // Ensure the schema_version table exists
33 129 sqlx::query(
34 - "CREATE TABLE IF NOT EXISTS health_checks (
35 - id INTEGER PRIMARY KEY AUTOINCREMENT,
36 - target TEXT NOT NULL,
37 - status TEXT NOT NULL,
38 - checked_at TEXT NOT NULL,
39 - response_time_ms INTEGER NOT NULL,
40 - details_json TEXT,
41 - error TEXT
130 + "CREATE TABLE IF NOT EXISTS schema_version (
131 + version INTEGER NOT NULL,
132 + description TEXT NOT NULL,
133 + applied_at TEXT NOT NULL
42 134 )",
43 135 )
44 136 .execute(pool)
45 137 .await?;
46 138
139 + let current_version = get_schema_version(pool).await?;
140 +
141 + // Detect pre-migration databases: if schema_version is empty but tables exist,
142 + // this is an existing database that predates the migration system.
143 + if current_version == 0 && has_existing_tables(pool).await? {
144 + info!("detected pre-migration database, stamping as version 1");
145 + stamp_version(pool, 1, "initial schema (pre-existing)").await?;
146 + // Run remaining migrations (2+) if any
147 + for &(version, description, sql) in MIGRATIONS {
148 + if version > 1 {
149 + run_one_migration(pool, version, description, sql).await?;
150 + }
151 + }
152 + return Ok(());
153 + }
154 +
155 + // Run all migrations newer than current version
156 + for &(version, description, sql) in MIGRATIONS {
157 + if version > current_version {
158 + run_one_migration(pool, version, description, sql).await?;
159 + }
160 + }
161 +
162 + Ok(())
163 + }
164 +
165 + /// Get the current schema version (0 if no migrations have been applied).
166 + pub async fn get_schema_version(pool: &SqlitePool) -> Result<i64> {
167 + let row = sqlx::query_as::<_, (i64,)>(
168 + "SELECT COALESCE(MAX(version), 0) FROM schema_version",
169 + )
170 + .fetch_one(pool)
171 + .await?;
172 + Ok(row.0)
173 + }
174 +
175 + /// Check whether the database predates the migration system, detected by the presence of the `health_checks` table.
176 + async fn has_existing_tables(pool: &SqlitePool) -> Result<bool> {
177 + let row = sqlx::query_as::<_, (i64,)>(
178 + "SELECT COUNT(*) FROM sqlite_master WHERE type = 'table' AND name = 'health_checks'",
179 + )
180 + .fetch_one(pool)
181 + .await?;
182 + Ok(row.0 > 0)
183 + }
184 +
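185 + /// Execute a single migration's SQL one statement at a time, then record it in schema_version. NOTE(review): not wrapped in a transaction, so a mid-migration failure leaves earlier statements applied and the version unstamped.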
186 + async fn run_one_migration(
187 + pool: &SqlitePool,
188 + version: i64,
189 + description: &str,
190 + sql: &str,
191 + ) -> Result<()> {
192 + info!(version, description, "running migration");
193 +
194 + // Execute each statement in the migration SQL
195 + for statement in sql.split(';') {
196 + let trimmed = statement.trim();
197 + if !trimmed.is_empty() {
198 + sqlx::query(trimmed).execute(pool).await?;
199 + }
200 + }
201 +
202 + stamp_version(pool, version, description).await?;
203 + Ok(())
204 + }
205 +
206 + /// Record a version in the schema_version table.
207 + async fn stamp_version(pool: &SqlitePool, version: i64, description: &str) -> Result<()> {
208 + let now = chrono::Utc::now().to_rfc3339();
47 209 sqlx::query(
48 - "CREATE TABLE IF NOT EXISTS test_runs (
49 - id INTEGER PRIMARY KEY AUTOINCREMENT,
50 - target TEXT NOT NULL,
51 - started_at TEXT NOT NULL,
52 - finished_at TEXT,
53 - duration_secs INTEGER,
54 - exit_code INTEGER,
55 - passed INTEGER NOT NULL,
56 - summary_json TEXT NOT NULL,
57 - raw_output TEXT NOT NULL,
58 - filter TEXT
59 - )",
210 + "INSERT INTO schema_version (version, description, applied_at) VALUES (?, ?, ?)",
60 211 )
212 + .bind(version)
213 + .bind(description)
214 + .bind(&now)
61 215 .execute(pool)
62 216 .await?;
63 -
64 217 Ok(())
65 218 }
66 219
@@ -69,7 +222,7 @@ async fn init_schema(pool: &SqlitePool) -> Result<(), sqlx::Error> {
69 222 pub async fn insert_health_check(
70 223 pool: &SqlitePool,
71 224 snapshot: &HealthSnapshot,
72 - ) -> Result<i64, sqlx::Error> {
225 + ) -> Result<i64> {
73 226 let status = snapshot.status.to_string();
74 227 let details_json = snapshot
75 228 .details
@@ -96,7 +249,7 @@ pub async fn get_health_history(
96 249 pool: &SqlitePool,
97 250 target: Option<&str>,
98 251 limit: i64,
99 - ) -> Result<Vec<HealthSnapshot>, sqlx::Error> {
252 + ) -> Result<Vec<HealthSnapshot>> {
100 253 let rows = match target {
101 254 Some(t) => {
102 255 sqlx::query_as::<_, HealthCheckRow>(
@@ -125,7 +278,7 @@ pub async fn get_health_history(
125 278 pub async fn get_latest_health(
126 279 pool: &SqlitePool,
127 280 target: &str,
128 - ) -> Result<Option<HealthSnapshot>, sqlx::Error> {
281 + ) -> Result<Option<HealthSnapshot>> {
129 282 let row = sqlx::query_as::<_, HealthCheckRow>(
130 283 "SELECT id, target, status, checked_at, response_time_ms, details_json, error
131 284 FROM health_checks WHERE target = ? ORDER BY id DESC LIMIT 1",
@@ -142,7 +295,7 @@ pub async fn get_latest_health(
142 295 pub async fn insert_test_run(
143 296 pool: &SqlitePool,
144 297 run: &TestRun,
145 - ) -> Result<i64, sqlx::Error> {
298 + ) -> Result<i64> {
146 299 let summary_json = serde_json::to_string(&run.summary).unwrap_or_default();
147 300
148 301 let result = sqlx::query(
@@ -168,7 +321,7 @@ pub async fn get_test_history(
168 321 pool: &SqlitePool,
169 322 target: Option<&str>,
170 323 limit: i64,
171 - ) -> Result<Vec<TestRun>, sqlx::Error> {
324 + ) -> Result<Vec<TestRun>> {
172 325 let rows = match target {
173 326 Some(t) => {
174 327 sqlx::query_as::<_, TestRunRow>(
@@ -197,7 +350,7 @@ pub async fn get_test_history(
197 350 pub async fn get_latest_test_run(
198 351 pool: &SqlitePool,
199 352 target: &str,
200 - ) -> Result<Option<TestRun>, sqlx::Error> {
353 + ) -> Result<Option<TestRun>> {
201 354 let row = sqlx::query_as::<_, TestRunRow>(
202 355 "SELECT id, target, started_at, finished_at, duration_secs, exit_code, passed, summary_json, raw_output, filter
203 356 FROM test_runs WHERE target = ? ORDER BY id DESC LIMIT 1",
@@ -209,10 +362,285 @@ pub async fn get_latest_test_run(
209 362 Ok(row.map(|r| r.into_test_run()))
210 363 }
211 364
365 + /// Get the `version` field from the most recent health check at or before a given timestamp.
366 + pub async fn get_version_at_time(
367 + pool: &SqlitePool,
368 + target: &str,
369 + before_rfc3339: &str,
370 + ) -> Result<Option<String>> {
371 + let row = sqlx::query_as::<_, (Option<String>,)>(
372 + "SELECT details_json FROM health_checks
373 + WHERE target = ? AND checked_at <= ?
374 + ORDER BY checked_at DESC LIMIT 1",
375 + )
376 + .bind(target)
377 + .bind(before_rfc3339)
378 + .fetch_optional(pool)
379 + .await?;
380 +
381 + let version = row
382 + .and_then(|r| r.0)
383 + .and_then(|json_str| serde_json::from_str::<serde_json::Value>(&json_str).ok())
384 + .and_then(|json| json.get("version").and_then(|v| v.as_str()).map(String::from));
385 +
386 + Ok(version)
387 + }
388 +
389 + /// Calculate uptime percentage for a target over the given number of hours.
390 + /// Returns the percentage of health checks with "operational" status, or `None` when the window contains no checks.
391 + pub async fn get_uptime_percent(
392 + pool: &SqlitePool,
393 + target: &str,
394 + hours: i64,
395 + ) -> Result<Option<f64>> {
396 + let cutoff = chrono::Utc::now() - chrono::Duration::hours(hours);
397 + let cutoff_str = cutoff.to_rfc3339();
398 +
399 + let row = sqlx::query_as::<_, (i64, i64)>(
400 + "SELECT
401 + COUNT(*) as total,
402 + SUM(CASE WHEN status = 'operational' THEN 1 ELSE 0 END) as operational
403 + FROM health_checks
404 + WHERE target = ? AND checked_at >= ?",
405 + )
406 + .bind(target)
407 + .bind(&cutoff_str)
408 + .fetch_one(pool)
409 + .await?;
410 +
411 + if row.0 == 0 {
412 + Ok(None)
413 + } else {
414 + Ok(Some(row.1 as f64 / row.0 as f64 * 100.0))
415 + }
416 + }
417 +
418 + // --- Latency trending queries ---
419 +
420 + /// Fetch `(checked_at, response_time_ms)` pairs for a target at or after a given timestamp, ordered by `checked_at` ascending.
421 + pub async fn get_response_times(
422 + pool: &SqlitePool,
423 + target: &str,
424 + since_rfc3339: &str,
425 + ) -> Result<Vec<(String, i64)>> {
426 + let rows = sqlx::query_as::<_, (String, i64)>(
427 + "SELECT checked_at, response_time_ms FROM health_checks
428 + WHERE target = ? AND checked_at >= ?
429 + ORDER BY checked_at ASC",
430 + )
431 + .bind(target)
432 + .bind(since_rfc3339)
433 + .fetch_all(pool)
434 + .await?;
435 + Ok(rows)
436 + }
437 +
438 + /// Fetch the last N response times for **operational** checks only (most recent first).
439 + pub async fn get_recent_response_times(
440 + pool: &SqlitePool,
441 + target: &str,
442 + count: i64,
443 + ) -> Result<Vec<i64>> {
444 + let rows = sqlx::query_as::<_, (i64,)>(
445 + "SELECT response_time_ms FROM health_checks
446 + WHERE target = ? AND status = 'operational'
447 + ORDER BY id DESC LIMIT ?",
448 + )
449 + .bind(target)
450 + .bind(count)
451 + .fetch_all(pool)
452 + .await?;
453 + Ok(rows.into_iter().map(|r| r.0).collect())
454 + }
455 +
456 + // --- Alert queries ---
457 +
458 + #[derive(Debug, sqlx::FromRow)]
459 + pub struct AlertRow {
460 + pub id: i64,
461 + pub target: String,
462 + pub alert_type: String,
463 + pub from_status: Option<String>,
464 + pub to_status: Option<String>,
465 + pub sent_at: String,
466 + pub error: Option<String>,
467 + }
468 +
469 + pub async fn insert_alert(
470 + pool: &SqlitePool,
471 + target: &str,
472 + alert_type: &str,
473 + from_status: Option<&str>,
474 + to_status: Option<&str>,
475 + error: Option<&str>,
476 + ) -> Result<i64> {
477 + let now = chrono::Utc::now().to_rfc3339();
478 + let result = sqlx::query(
479 + "INSERT INTO alerts (target, alert_type, from_status, to_status, sent_at, error)
480 + VALUES (?, ?, ?, ?, ?, ?)",
481 + )
482 + .bind(target)
483 + .bind(alert_type)
484 + .bind(from_status)
485 + .bind(to_status)
486 + .bind(&now)
487 + .bind(error)
488 + .execute(pool)
489 + .await?;
490 + Ok(result.last_insert_rowid())
491 + }
492 +
493 + pub async fn get_latest_alert_for_target(
494 + pool: &SqlitePool,
495 + target: &str,
496 + ) -> Result<Option<AlertRow>> {
497 + Ok(sqlx::query_as::<_, AlertRow>(
498 + "SELECT id, target, alert_type, from_status, to_status, sent_at, error
499 + FROM alerts WHERE target = ? ORDER BY id DESC LIMIT 1",
500 + )
501 + .bind(target)
502 + .fetch_optional(pool)
503 + .await?)
504 + }
505 +
506 + // --- TLS check queries ---
507 +
508 + #[derive(Debug, sqlx::FromRow, serde::Serialize)]
509 + pub struct TlsCheckRow {
510 + pub id: i64,
511 + pub target: String,
512 + pub host: String,
513 + pub valid: bool,
514 + pub days_remaining: i64,
515 + pub not_before: String,
516 + pub not_after: String,
517 + pub subject: String,
518 + pub issuer: String,
519 + pub checked_at: String,
520 + pub error: Option<String>,
521 + }
522 +
523 + pub async fn insert_tls_check(
524 + pool: &SqlitePool,
525 + status: &TlsStatus,
526 + ) -> Result<i64> {
527 + let result = sqlx::query(
528 + "INSERT INTO tls_checks (target, host, valid, days_remaining, not_before, not_after, subject, issuer, checked_at, error)
529 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
530 + )
531 + .bind(&status.target)
532 + .bind(&status.host)
533 + .bind(status.valid)
534 + .bind(status.days_remaining)
535 + .bind(&status.not_before)
536 + .bind(&status.not_after)
537 + .bind(&status.subject)
538 + .bind(&status.issuer)
539 + .bind(&status.checked_at)
540 + .bind(&status.error)
541 + .execute(pool)
542 + .await?;
543 +
544 + Ok(result.last_insert_rowid())
545 + }
546 +
547 + pub async fn get_latest_tls_check(
548 + pool: &SqlitePool,
549 + target: &str,
550 + ) -> Result<Option<TlsCheckRow>> {
551 + Ok(sqlx::query_as::<_, TlsCheckRow>(
552 + "SELECT id, target, host, valid, days_remaining, not_before, not_after, subject, issuer, checked_at, error
553 + FROM tls_checks WHERE target = ? ORDER BY id DESC LIMIT 1",
554 + )
555 + .bind(target)
556 + .fetch_optional(pool)
557 + .await?)
558 + }
559 +
560 + // --- Incident queries ---
561 +
562 + #[derive(Debug, Clone, sqlx::FromRow, serde::Serialize)]
563 + pub struct IncidentRow {
564 + pub id: i64,
565 + pub target: String,
566 + pub started_at: String,
567 + pub ended_at: Option<String>,
568 + pub duration_secs: Option<i64>,
569 + pub from_status: String,
570 + pub to_status: String,
Lines truncated
@@ -0,0 +1,955 @@
1 + //! Pure formatting functions for CLI display output.
2 + //!
3 + //! Each function takes data types and writes formatted output to a `String`,
4 + //! keeping display logic separate from async I/O for testability.
5 +
6 + use std::fmt::Write;
7 +
8 + use crate::db::{IncidentRow, TlsCheckRow};
9 + use crate::types::{HealthSnapshot, LatencyStats, TestRun, TestStaleness};
10 +
11 + /// Format a single health snapshot as a human-readable line.
12 + pub fn format_health_snapshot(s: &HealthSnapshot) -> String {
13 + let mut out = String::new();
14 + write!(out, "[{}] {} \u{2014} {}", s.status.icon(), s.target, s.status).unwrap();
15 + write!(out, " ({}ms)", s.response_time_ms).unwrap();
16 + if let Some(details) = &s.details {
17 + if let Some(v) = &details.version {
18 + write!(out, " v{v}").unwrap();
19 + }
20 + if let Some(u) = &details.uptime {
21 + write!(out, " up {u}").unwrap();
22 + }
23 + }
24 + writeln!(out).unwrap();
25 + if let Some(err) = &s.error {
26 + writeln!(out, " {err}").unwrap();
27 + }
28 + out
29 + }
30 +
31 + /// Format a list of health snapshots for CLI display.
32 + pub fn format_health_snapshots(snapshots: &[HealthSnapshot]) -> String {
33 + let mut out = String::new();
34 + for s in snapshots {
35 + out.push_str(&format_health_snapshot(s));
36 + }
37 + out
38 + }
39 +
40 + /// Format a test run result for CLI display.
41 + pub fn format_test_result(target_name: &str, run: &TestRun) -> String {
42 + let mut out = String::new();
43 + let result = if run.passed { "PASSED" } else { "FAILED" };
44 + writeln!(out, "{target_name}: {result}").unwrap();
45 + if let Some(d) = run.duration_secs {
46 + writeln!(out, "Duration: {d}s").unwrap();
47 + }
48 + if let (Some(p), Some(f)) = (run.summary.total_passed, run.summary.total_failed) {
49 + writeln!(out, "Tests: {p} passed, {f} failed").unwrap();
50 + }
51 + for step in &run.summary.steps {
52 + let mark = if step.passed { "PASS" } else { "FAIL" };
53 + writeln!(out, " {mark} {}", step.name).unwrap();
54 + }
55 + if !run.passed {
56 + writeln!(out, "\nRaw output:\n{}", run.raw_output).unwrap();
57 + }
58 + out
59 + }
60 +
61 + /// Format a single target's status block (health + latency + TLS + tests + staleness + incident) for CLI display.
62 + #[allow(clippy::too_many_arguments)]
63 + pub fn format_status_target(
64 + name: &str,
65 + label: &str,
66 + health: Option<&HealthSnapshot>,
67 + latency: Option<&LatencyStats>,
68 + tls: Option<&TlsCheckRow>,
69 + test: Option<&TestRun>,
70 + staleness: Option<&TestStaleness>,
71 + incident: Option<&IncidentRow>,
72 + ) -> String {
73 + let mut out = String::new();
74 + writeln!(out, "=== {name} ({label}) ===").unwrap();
75 +
76 + if let Some(h) = health {
77 + write!(out, " Health: [{}] {}", h.status.icon(), h.status).unwrap();
78 + write!(out, " ({}ms)", h.response_time_ms).unwrap();
79 + if let Some(d) = &h.details
80 + && let Some(v) = &d.version
81 + {
82 + write!(out, " v{v}").unwrap();
83 + }
84 + writeln!(out).unwrap();
85 + } else {
86 + writeln!(out, " Health: no data").unwrap();
87 + }
88 +
89 + if let Some(l) = latency {
90 + writeln!(
91 + out,
92 + " Latency (24h): avg {:.0}ms, p95 {}ms, range {}-{}ms ({} samples)",
93 + l.avg_ms, l.p95_ms, l.min_ms, l.max_ms, l.sample_count
94 + )
95 + .unwrap();
96 + }
97 +
98 + if let Some(t) = tls {
99 + if let Some(ref err) = t.error {
100 + writeln!(out, " TLS: [ERR] {} \u{2014} {err}", t.host).unwrap();
101 + } else if t.days_remaining <= 0 {
102 + writeln!(out, " TLS: [ERR] {} \u{2014} EXPIRED (expired {})", t.host, t.not_after).unwrap();
103 + } else if t.days_remaining <= 14 {
104 + writeln!(out, " TLS: [WARN] {} \u{2014} {}d remaining (expires {})", t.host, t.days_remaining, t.not_after).unwrap();
105 + } else {
106 + writeln!(out, " TLS: [OK] {} \u{2014} {}d remaining (expires {})", t.host, t.days_remaining, t.not_after).unwrap();
107 + }
108 + }
109 +
110 + if let Some(t) = test {
111 + let result = if t.passed { "PASSED" } else { "FAILED" };
112 + write!(out, " Tests: {result}").unwrap();
113 + if let Some(d) = t.duration_secs {
114 + write!(out, " ({d}s)").unwrap();
115 + }
116 + writeln!(out).unwrap();
117 + if let (Some(p), Some(f)) = (t.summary.total_passed, t.summary.total_failed) {
118 + writeln!(out, " {p} passed, {f} failed").unwrap();
119 + }
120 + } else {
121 + writeln!(out, " Tests: no data").unwrap();
122 + }
123 +
124 + if let Some(s) = staleness
125 + && s.stale
126 + && let Some(reason) = &s.reason
127 + {
128 + writeln!(out, " Tests: STALE \u{2014} {reason}").unwrap();
129 + }
130 +
131 + if let Some(inc) = incident {
132 + writeln!(out, " Incident: [ACTIVE] {} since {}", inc.to_status, inc.started_at).unwrap();
133 + }
134 +
135 + writeln!(out).unwrap();
136 + out
137 + }
138 +
139 + /// Format health check history for CLI display.
140 + pub fn format_health_history(history: &[HealthSnapshot]) -> String {
141 + if history.is_empty() {
142 + return "No health check history.\n".to_string();
143 + }
144 + let mut out = String::new();
145 + for h in history {
146 + writeln!(
147 + out,
148 + "[{}] {} \u{2014} {} ({}ms) {}",
149 + h.status.icon(),
150 + h.target,
151 + h.status,
152 + h.response_time_ms,
153 + h.checked_at
154 + )
155 + .unwrap();
156 + }
157 + out
158 + }
159 +
160 + /// Format test run history for CLI display.
161 + pub fn format_test_history(history: &[TestRun]) -> String {
162 + if history.is_empty() {
163 + return "No test run history.\n".to_string();
164 + }
165 + let mut out = String::new();
166 + for r in history {
167 + let result = if r.passed { "PASS" } else { "FAIL" };
168 + write!(out, "[{result}] {}", r.target).unwrap();
169 + if let Some(d) = r.duration_secs {
170 + write!(out, " ({d}s)").unwrap();
171 + }
172 + write!(out, " {}", r.started_at).unwrap();
173 + if let (Some(p), Some(f)) = (r.summary.total_passed, r.summary.total_failed) {
174 + write!(out, " \u{2014} {p} passed, {f} failed").unwrap();
175 + }
176 + writeln!(out).unwrap();
177 + }
178 + out
179 + }
180 +
/// Summarise how many records of each kind a prune removed.
///
/// Produces a single newline-terminated sentence listing the six counters in
/// a fixed order, followed by the retention window in days.
pub fn format_prune(
    health_pruned: u64,
    test_pruned: u64,
    heartbeat_pruned: u64,
    alerts_pruned: u64,
    tls_pruned: u64,
    incidents_pruned: u64,
    days: i64,
) -> String {
    let categories = [
        (health_pruned, "health checks"),
        (test_pruned, "test runs"),
        (heartbeat_pruned, "peer heartbeats"),
        (alerts_pruned, "alerts"),
        (tls_pruned, "TLS checks"),
        (incidents_pruned, "incidents"),
    ];
    let tally = categories
        .iter()
        .map(|(count, label)| format!("{count} {label}"))
        .collect::<Vec<_>>()
        .join(", ");
    format!("Pruned {tally} older than {days} days.\n")
}
185 +
186 + /// Format mesh data (from JSON) for human-readable CLI display.
187 + pub fn format_mesh(data: &serde_json::Value) -> String {
188 + let Some(instances) = data.get("instances").and_then(|v| v.as_object()) else {
189 + return "No mesh data available.\n".to_string();
190 + };
191 +
192 + let mut out = String::new();
193 + for (name, instance_data) in instances {
194 + let instance = instance_data.get("instance");
195 + let id = instance
196 + .and_then(|i| i.get("id"))
197 + .and_then(|v| v.as_str())
198 + .unwrap_or("?");
199 + let version = instance
200 + .and_then(|i| i.get("version"))
201 + .and_then(|v| v.as_str())
202 + .unwrap_or("?");
203 +
204 + writeln!(out, "=== {name} ===").unwrap();
205 + writeln!(out, " ID: {id}").unwrap();
206 + writeln!(out, " Version: {version}").unwrap();
207 +
208 + // Targets
209 + if let Some(targets) = instance_data.get("targets").and_then(|v| v.as_object()) {
210 + for (target_name, target_data) in targets {
211 + let status = target_data
212 + .get("status")
213 + .and_then(|v| v.as_str())
214 + .unwrap_or("?");
215 + let ms = target_data
216 + .get("response_time_ms")
217 + .and_then(|v| v.as_i64());
218 + let ms_str = ms.map(|m| format!(" ({m}ms)")).unwrap_or_default();
219 + writeln!(out, " Target {target_name}: {status}{ms_str}").unwrap();
220 + }
221 + }
222 +
223 + // Peers
224 + if let Some(peers) = instance_data.get("peers").and_then(|v| v.as_object()) {
225 + for (peer_name, peer_data) in peers {
226 + let status = peer_data
227 + .get("status")
228 + .and_then(|v| v.as_str())
229 + .unwrap_or("?");
230 + let latency = peer_data
231 + .get("latency_ms")
232 + .and_then(|v| v.as_u64())
233 + .map(|ms| format!(" ({ms}ms)"))
234 + .unwrap_or_default();
235 + writeln!(out, " Peer {peer_name}: {status}{latency}").unwrap();
236 + }
237 + }
238 +
239 + // Error fallback
240 + if let Some(err) = instance_data.get("error").and_then(|v| v.as_str()) {
241 + writeln!(out, " ({err})").unwrap();
242 + }
243 +
244 + writeln!(out).unwrap();
245 + }
246 + out
247 + }
248 +
249 + #[cfg(test)]
250 + mod tests {
251 + use super::*;
252 + use crate::types::*;
253 +
254 + // --- format_health_snapshot ---
255 +
/// An operational snapshot with details shows icon, target, status, timing,
/// version and uptime.
#[test]
fn health_snapshot_operational_with_details() {
    let details = HealthDetails {
        version: Some(String::from("1.2.0")),
        uptime: Some(String::from("5d 3h")),
        checks: None,
        monitoring: None,
    };
    let snapshot = HealthSnapshot {
        id: None,
        target: String::from("mnw"),
        status: HealthStatus::Operational,
        checked_at: String::from("2026-03-10T00:00:00Z"),
        response_time_ms: 95,
        details: Some(details),
        error: None,
    };
    let rendered = format_health_snapshot(&snapshot);
    for expected in ["[OK]", "mnw", "operational", "(95ms)", "v1.2.0", "up 5d 3h"] {
        assert!(rendered.contains(expected), "expected {expected:?} in {rendered:?}");
    }
}
280 +
/// An unreachable snapshot shows the DOWN icon, the status and the error text.
#[test]
fn health_snapshot_unreachable_with_error() {
    let snapshot = HealthSnapshot {
        id: None,
        target: String::from("api"),
        status: HealthStatus::Unreachable,
        checked_at: String::from("2026-03-10T00:00:00Z"),
        response_time_ms: 0,
        details: None,
        error: Some(String::from("connection refused")),
    };
    let rendered = format_health_snapshot(&snapshot);
    for expected in ["[DOWN]", "unreachable", "connection refused"] {
        assert!(rendered.contains(expected), "expected {expected:?} in {rendered:?}");
    }
}
297 +
/// A degraded snapshot without details prints no version or uptime suffix.
#[test]
fn health_snapshot_degraded_no_details() {
    let snapshot = HealthSnapshot {
        id: None,
        target: String::from("svc"),
        status: HealthStatus::Degraded,
        checked_at: String::from("2026-03-10T00:00:00Z"),
        response_time_ms: 2500,
        details: None,
        error: None,
    };
    let rendered = format_health_snapshot(&snapshot);
    for expected in ["[WARN]", "degraded", "(2500ms)"] {
        assert!(rendered.contains(expected), "expected {expected:?} in {rendered:?}");
    }
    for unexpected in ["up ", " v"] {
        assert!(!rendered.contains(unexpected), "unexpected {unexpected:?} in {rendered:?}");
    }
}
316 +
/// An error-status snapshot shows the ERR icon and the error message.
#[test]
fn health_snapshot_error_status() {
    let snapshot = HealthSnapshot {
        id: None,
        target: String::from("db"),
        status: HealthStatus::Error,
        checked_at: String::from("2026-03-10T00:00:00Z"),
        response_time_ms: 500,
        details: None,
        error: Some(String::from("500 internal server error")),
    };
    let rendered = format_health_snapshot(&snapshot);
    // Note: "error" also occurs inside the message text, so that check alone
    // does not pin down how the status itself is displayed.
    for expected in ["[ERR]", "error", "500 internal server error"] {
        assert!(rendered.contains(expected), "expected {expected:?} in {rendered:?}");
    }
}
333 +
/// Several snapshots are rendered one per line, in input order.
///
/// The target names here are single letters, so a plain `contains("a")` would
/// already be satisfied by the word "operational". The assertions therefore
/// check each rendered line's prefix instead.
#[test]
fn health_snapshots_multiple() {
    let snapshots = vec![
        HealthSnapshot {
            id: None,
            target: "a".to_string(),
            status: HealthStatus::Operational,
            checked_at: "2026-03-10T00:00:00Z".to_string(),
            response_time_ms: 50,
            details: None,
            error: None,
        },
        HealthSnapshot {
            id: None,
            target: "b".to_string(),
            status: HealthStatus::Degraded,
            checked_at: "2026-03-10T00:00:00Z".to_string(),
            response_time_ms: 3000,
            details: None,
            error: None,
        },
    ];
    let out = format_health_snapshots(&snapshots);
    let lines: Vec<&str> = out.lines().collect();

    // No details and no errors: exactly one line per snapshot.
    assert_eq!(lines.len(), 2, "unexpected output: {out:?}");

    assert!(lines[0].starts_with("[OK] a "), "first line: {:?}", lines[0]);
    assert!(lines[0].contains("operational"));
    assert!(lines[0].contains("(50ms)"));

    assert!(lines[1].starts_with("[WARN] b "), "second line: {:?}", lines[1]);
    assert!(lines[1].contains("degraded"));
    assert!(lines[1].contains("(3000ms)"));
}
362 +
363 + // --- format_test_result ---
364 +
/// A passing run lists duration, counts and steps, and omits the raw output.
#[test]
fn test_result_passed() {
    let steps = ["cargo check", "cargo test"]
        .into_iter()
        .map(|name| StepResult { name: name.to_owned(), passed: true })
        .collect();
    let run = TestRun {
        id: None,
        target: String::from("mnw"),
        started_at: String::from("2026-03-10T00:00:00Z"),
        finished_at: Some(String::from("2026-03-10T00:02:00Z")),
        duration_secs: Some(120),
        exit_code: Some(0),
        passed: true,
        summary: TestSummary {
            steps,
            total_passed: Some(759),
            total_failed: Some(0),
        },
        raw_output: String::new(),
        filter: None,
    };
    let rendered = format_test_result("mnw", &run);
    let expected_parts = [
        "mnw: PASSED",
        "Duration: 120s",
        "Tests: 759 passed, 0 failed",
        "PASS cargo check",
        "PASS cargo test",
    ];
    for expected in expected_parts {
        assert!(rendered.contains(expected), "expected {expected:?} in {rendered:?}");
    }
    assert!(!rendered.contains("Raw output"));
}
394 +
/// A failing run marks the failed step and appends the raw output.
#[test]
fn test_result_failed_shows_raw_output() {
    let steps = [("cargo check", true), ("cargo test", false)]
        .into_iter()
        .map(|(name, passed)| StepResult { name: name.to_owned(), passed })
        .collect();
    let run = TestRun {
        id: None,
        target: String::from("mnw"),
        started_at: String::from("2026-03-10T00:00:00Z"),
        finished_at: Some(String::from("2026-03-10T00:01:00Z")),
        duration_secs: Some(60),
        exit_code: Some(1),
        passed: false,
        summary: TestSummary {
            steps,
            total_passed: Some(750),
            total_failed: Some(9),
        },
        raw_output: String::from("thread 'test_foo' panicked at 'assertion failed'"),
        filter: None,
    };
    let rendered = format_test_result("mnw", &run);
    let expected_parts = [
        "mnw: FAILED",
        "PASS cargo check",
        "FAIL cargo test",
        "750 passed, 9 failed",
        "Raw output:",
        "assertion failed",
    ];
    for expected in expected_parts {
        assert!(rendered.contains(expected), "expected {expected:?} in {rendered:?}");
    }
}
424 +
/// Without duration or counts, only the headline is printed.
#[test]
fn test_result_no_duration_or_counts() {
    let empty_summary = TestSummary {
        steps: Vec::new(),
        total_passed: None,
        total_failed: None,
    };
    let run = TestRun {
        id: None,
        target: String::from("svc"),
        started_at: String::from("2026-03-10T00:00:00Z"),
        finished_at: None,
        duration_secs: None,
        exit_code: None,
        passed: true,
        summary: empty_summary,
        raw_output: String::new(),
        filter: None,
    };
    let rendered = format_test_result("svc", &run);
    assert!(rendered.contains("svc: PASSED"));
    for unexpected in ["Duration:", "Tests:"] {
        assert!(!rendered.contains(unexpected), "unexpected {unexpected:?} in {rendered:?}");
    }
}
448 +
449 + // --- format_status_target ---
450 +
/// Health and test data both present: header, health line and test summary
/// all appear.
#[test]
fn status_target_with_health_and_tests() {
    let snapshot = HealthSnapshot {
        id: None,
        target: String::from("mnw"),
        status: HealthStatus::Operational,
        checked_at: String::from("2026-03-10T00:00:00Z"),
        response_time_ms: 95,
        details: Some(HealthDetails {
            version: Some(String::from("2.1.0")),
            uptime: None,
            checks: None,
            monitoring: None,
        }),
        error: None,
    };
    let last_run = TestRun {
        id: None,
        target: String::from("mnw"),
        started_at: String::from("2026-03-10T00:00:00Z"),
        finished_at: Some(String::from("2026-03-10T00:01:00Z")),
        duration_secs: Some(60),
        exit_code: Some(0),
        passed: true,
        summary: TestSummary {
            steps: Vec::new(),
            total_passed: Some(100),
            total_failed: Some(0),
        },
        raw_output: String::new(),
        filter: None,
    };
    let rendered = format_status_target(
        "mnw",
        "MakeNotWork",
        Some(&snapshot),
        None,
        None,
        Some(&last_run),
        None,
        None,
    );
    let expected_parts = [
        "=== mnw (MakeNotWork) ===",
        "Health: [OK] operational (95ms) v2.1.0",
        "Tests: PASSED (60s)",
        "100 passed, 0 failed",
    ];
    for expected in expected_parts {
        assert!(rendered.contains(expected), "expected {expected:?} in {rendered:?}");
    }
}
489 +
/// With every input absent, health and tests both report `no data`.
#[test]
fn status_target_no_data() {
    let rendered =
        format_status_target("mnw", "MakeNotWork", None, None, None, None, None, None);
    let expected_parts = [
        "=== mnw (MakeNotWork) ===",
        "Health: no data",
        "Tests: no data",
    ];
    for expected in expected_parts {
        assert!(rendered.contains(expected), "expected {expected:?} in {rendered:?}");
    }
}
497 +
498 + #[test]
499 + fn status_target_health_only() {
500 + let health = HealthSnapshot {
Lines truncated
A src/error.rs +32
@@ -0,0 +1,32 @@
1 + //! Typed error enum for PoM, replacing `Box<dyn Error>` throughout the crate.
2 +
3 + use thiserror::Error;
4 +
5 + #[derive(Debug, Error)]
6 + pub enum PomError {
7 + #[error(transparent)]
8 + Io(#[from] std::io::Error),
9 +
10 + #[error(transparent)]
11 + Db(#[from] sqlx::Error),
12 +
13 + #[error(transparent)]
14 + TomlParse(#[from] toml::de::Error),
15 +
16 + #[error(transparent)]
17 + Http(#[from] reqwest::Error),
18 +
19 + #[error(transparent)]
20 + Json(#[from] serde_json::Error),
21 +
22 + #[error(transparent)]
23 + LogFilter(#[from] tracing_subscriber::filter::ParseError),
24 +
25 + #[error(transparent)]
26 + Join(#[from] tokio::task::JoinError),
27 +
28 + #[error("{0}")]
29 + Config(String),
30 + }
31 +
32 + pub type Result<T> = std::result::Result<T, PomError>;
M src/lib.rs +9
@@ -1,5 +1,14 @@
1 + //! PoM — Production Operations Monitor.
2 + //!
3 + //! Health checks, test orchestration, and peer mesh for monitoring deployed services.
4 +
5 + pub mod alerts;
6 + pub mod api;
1 7 pub mod checks;
2 8 pub mod config;
3 9 pub mod db;
10 + pub mod display;
11 + pub mod error;
12 + pub mod peer;
4 13 pub mod tools;
5 14 pub mod types;
M src/main.rs +23 -358
@@ -1,14 +1,17 @@
1 + //! PoM CLI entry point — parses subcommands and dispatches to handlers or MCP server.
2 +
1 3 use clap::{Parser, Subcommand};
2 4 use rmcp::ServiceExt;
3 5 use tokio::io::{stdin, stdout};
4 6 use tracing::info;
5 7 use tracing_subscriber::{fmt, prelude::*, EnvFilter};
6 8
7 - use pom::checks::{http, ssh};
8 9 use pom::config::{self, Config};
9 10 use pom::db;
11 + use pom::error::Result;
10 12 use pom::tools::PomServer;
11 - use pom::types::HealthStatus;
13 +
14 + mod cli;
12 15
13 16 #[derive(Parser)]
14 17 #[command(name = "pom", about = "Peace of Mind — health checks and test orchestration")]
@@ -51,7 +54,7 @@ enum Commands {
51 54 /// View history
52 55 History {
53 56 #[command(subcommand)]
54 - kind: HistoryKind,
57 + kind: cli::HistoryKind,
55 58 },
56 59 /// Prune old records
57 60 Prune {
@@ -61,28 +64,8 @@ enum Commands {
61 64 },
62 65 /// Run as a daemon, checking health at intervals
63 66 Serve,
64 - }
65 -
66 - #[derive(Subcommand)]
67 - enum HistoryKind {
68 - /// Health check history
69 - Health {
70 - /// Filter by target
71 - target: Option<String>,
72 - /// Number of results
73 - #[arg(short, default_value = "10")]
74 - n: i64,
75 - /// Output as JSON
76 - #[arg(long)]
77 - json: bool,
78 - },
79 - /// Test run history
80 - Tests {
81 - /// Filter by target
82 - target: Option<String>,
83 - /// Number of results
84 - #[arg(short, default_value = "10")]
85 - n: i64,
67 + /// Show peer mesh status
68 + Mesh {
86 69 /// Output as JSON
87 70 #[arg(long)]
88 71 json: bool,
@@ -90,7 +73,12 @@ enum HistoryKind {
90 73 }
91 74
92 75 #[tokio::main]
93 - async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
76 + async fn main() -> Result<()> {
77 + // Install the default rustls crypto provider before any TLS operations.
78 + // Both aws-lc-rs and ring are in the dependency tree (via reqwest and tokio-rustls),
79 + // so rustls can't auto-detect which to use.
80 + let _ = tokio_rustls::rustls::crypto::ring::default_provider().install_default();
81 +
94 82 let cli = Cli::parse();
95 83
96 84 let config_path = cli.config.as_deref();
@@ -102,7 +90,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
102 90 }
103 91 }
104 92
105 - async fn run_mcp_server(config: Config) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
93 + async fn run_mcp_server(config: Config) -> Result<()> {
106 94 tracing_subscriber::registry()
107 95 .with(fmt::layer().with_writer(std::io::stderr))
108 96 .with(EnvFilter::from_default_env().add_directive("pom=info".parse()?))
@@ -128,7 +116,7 @@ async fn run_mcp_server(config: Config) -> Result<(), Box<dyn std::error::Error
128 116 async fn run_cli(
129 117 cmd: Commands,
130 118 config: Config,
131 - ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
119 + ) -> Result<()> {
132 120 let log_level = if matches!(cmd, Commands::Serve) { "pom=info" } else { "pom=warn" };
133 121 tracing_subscriber::registry()
134 122 .with(fmt::layer().with_writer(std::io::stderr))
@@ -139,335 +127,12 @@ async fn run_cli(
139 127 let pool = db::connect(&db_path).await?;
140 128
141 129 match cmd {
142 - Commands::Health { target, json } => cmd_health(&pool, &config, target.as_deref(), json).await,
143 - Commands::Test { target, filter, json } => cmd_test(&pool, &config, &target, filter.as_deref(), json).await,
144 - Commands::Status { json } => cmd_status(&pool, &config, json).await,
145 - Commands::History { kind } => cmd_history(&pool, kind).await,
146 - Commands::Prune { days } => cmd_prune(&pool, days).await,
147 - Commands::Serve => cmd_serve(&pool, &config).await,
148 - }
149 - }
150 -
151 - async fn cmd_health(
152 - pool: &sqlx::SqlitePool,
153 - config: &Config,
154 - target: Option<&str>,
155 - json: bool,
156 - ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
157 - let targets: Vec<String> = match target {
158 - Some(t) => {
159 - if config.get_target(t).is_none() {
160 - eprintln!("Unknown target: {t}");
161 - std::process::exit(1);
162 - }
163 - vec![t.to_string()]
164 - }
165 - None => config.target_names(),
166 - };
167 -
168 - let mut snapshots = Vec::new();
169 -
170 - for name in &targets {
171 - let target_config = config.get_target(name).unwrap();
172 - if let Some(health_config) = &target_config.health {
173 - let snapshot = http::check_health(name, health_config).await;
174 - db::insert_health_check(pool, &snapshot).await?;
175 - snapshots.push(snapshot);
176 - } else {
177 - eprintln!("{name}: no health endpoint configured");
178 - }
179 - }
180 -
181 - if json {
182 - println!("{}", serde_json::to_string_pretty(&snapshots)?);
183 - } else {
184 - for s in &snapshots {
185 - let icon = match s.status {
186 - HealthStatus::Operational => "OK",
187 - HealthStatus::Degraded => "WARN",
188 - HealthStatus::Error => "ERR",
189 - HealthStatus::Unreachable => "DOWN",
190 - };
191 - print!("[{icon}] {} — {}", s.target, s.status);
192 - print!(" ({}ms)", s.response_time_ms);
193 - if let Some(details) = &s.details {
194 - if let Some(v) = &details.version {
195 - print!(" v{v}");
196 - }
197 - if let Some(u) = &details.uptime {
198 - print!(" up {u}");
199 - }
200 - }
201 - println!();
202 - if let Some(err) = &s.error {
203 - println!(" {err}");
204 - }
205 - }
206 - }
207 -
208 - Ok(())
209 - }
210 -
211 - async fn cmd_test(
212 - pool: &sqlx::SqlitePool,
213 - config: &Config,
214 - target_name: &str,
215 - filter: Option<&str>,
216 - json: bool,
217 - ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
218 - let target = config.get_target(target_name).ok_or_else(|| {
219 - format!("Unknown target: {target_name}")
220 - })?;
221 - let tests_config = target.tests.as_ref().ok_or_else(|| {
222 - format!("Target '{target_name}' has no test configuration")
223 - })?;
224 -
225 - eprintln!("Running tests on {target_name}...");
226 - let run = ssh::run_tests(target_name, tests_config, filter).await;
227 - db::insert_test_run(pool, &run).await?;
228 -
229 - if json {
230 - let summary = serde_json::json!({
231 - "target": run.target,
232 - "passed": run.passed,
233 - "exit_code": run.exit_code,
234 - "duration_secs": run.duration_secs,
235 - "started_at": run.started_at,
236 - "finished_at": run.finished_at,
237 - "filter": run.filter,
238 - "summary": run.summary,
239 - });
240 - println!("{}", serde_json::to_string_pretty(&summary)?);
241 - } else {
242 - let result = if run.passed { "PASSED" } else { "FAILED" };
243 - println!("{target_name}: {result}");
244 - if let Some(d) = run.duration_secs {
245 - println!("Duration: {d}s");
246 - }
247 - if let (Some(p), Some(f)) = (run.summary.total_passed, run.summary.total_failed) {
248 - println!("Tests: {p} passed, {f} failed");
249 - }
250 - for step in &run.summary.steps {
251 - let mark = if step.passed { "PASS" } else { "FAIL" };
252 - println!(" {mark} {}", step.name);
253 - }
254 - if !run.passed {
255 - println!("\nRaw output:\n{}", run.raw_output);
256 - }
130 + Commands::Health { target, json } => cli::cmd_health(&pool, &config, target.as_deref(), json).await,
131 + Commands::Test { target, filter, json } => cli::cmd_test(&pool, &config, &target, filter.as_deref(), json).await,
132 + Commands::Status { json } => cli::cmd_status(&pool, &config, json).await,
133 + Commands::History { kind } => cli::cmd_history(&pool, kind).await,
134 + Commands::Prune { days } => cli::cmd_prune(&pool, days).await,
135 + Commands::Serve => cli::cmd_serve(&pool, &config).await,
136 + Commands::Mesh { json } => cli::cmd_mesh(&config, json).await,
257 137 }
258 -
259 - Ok(())
260 - }
261 -
262 - async fn cmd_status(
263 - pool: &sqlx::SqlitePool,
264 - config: &Config,
265 - json: bool,
266 - ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
267 - let mut target_statuses = Vec::new();
268 -
269 - for name in config.target_names() {
270 - let target = config.get_target(&name).unwrap();
271 - let health = db::get_latest_health(pool, &name).await?;
272 - let test = db::get_latest_test_run(pool, &name).await?;
273 -
274 - if json {
275 - target_statuses.push(serde_json::json!({
276 - "target": name,
277 - "label": target.label,
278 - "health": health,
279 - "last_test": test.map(|t| serde_json::json!({
280 - "passed": t.passed,
281 - "exit_code": t.exit_code,
282 - "duration_secs": t.duration_secs,
283 - "started_at": t.started_at,
284 - "summary": t.summary,
285 - })),
286 - }));
287 - } else {
288 - println!("=== {} ({}) ===", name, target.label);
289 - if let Some(h) = &health {
290 - let icon = match h.status {
291 - HealthStatus::Operational => "OK",
292 - HealthStatus::Degraded => "WARN",
293 - HealthStatus::Error => "ERR",
294 - HealthStatus::Unreachable => "DOWN",
295 - };
296 - print!(" Health: [{icon}] {}", h.status);
297 - print!(" ({}ms)", h.response_time_ms);
298 - if let Some(d) = &h.details {
299 - if let Some(v) = &d.version {
300 - print!(" v{v}");
301 - }
302 - }
303 - println!();
304 - } else {
305 - println!(" Health: no data");
306 - }
307 -
308 - if let Some(t) = &test {
309 - let result = if t.passed { "PASSED" } else { "FAILED" };
310 - print!(" Tests: {result}");
311 - if let Some(d) = t.duration_secs {
312 - print!(" ({d}s)");
313 - }
314 - println!();
315 - if let (Some(p), Some(f)) = (t.summary.total_passed, t.summary.total_failed) {
316 - println!(" {p} passed, {f} failed");
317 - }
318 - } else {
319 - println!(" Tests: no data");
320 - }
321 - println!();
322 - }
323 - }
324 -
325 - if json {
326 - println!("{}", serde_json::to_string_pretty(&target_statuses)?);
327 - }
328 -
329 - Ok(())
330 - }
331 -
332 - async fn cmd_history(
333 - pool: &sqlx::SqlitePool,
334 - kind: HistoryKind,
335 - ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
336 - match kind {
337 - HistoryKind::Health { target, n, json } => {
338 - let history = db::get_health_history(pool, target.as_deref(), n).await?;
339 - if json {
340 - println!("{}", serde_json::to_string_pretty(&history)?);
341 - } else if history.is_empty() {
342 - println!("No health check history.");
343 - } else {
344 - for h in &history {
345 - let icon = match h.status {
346 - HealthStatus::Operational => "OK",
347 - HealthStatus::Degraded => "WARN",
348 - HealthStatus::Error => "ERR",
349 - HealthStatus::Unreachable => "DOWN",
350 - };
351 - println!("[{icon}] {} — {} ({}ms) {}", h.target, h.status, h.response_time_ms, h.checked_at);
352 - }
353 - }
354 - }
355 - HistoryKind::Tests { target, n, json } => {
356 - let history = db::get_test_history(pool, target.as_deref(), n).await?;
357 - if json {
358 - let summaries: Vec<serde_json::Value> = history
359 - .iter()
360 - .map(|r| serde_json::json!({
361 - "id": r.id,
362 - "target": r.target,
363 - "passed": r.passed,
364 - "exit_code": r.exit_code,
365 - "duration_secs": r.duration_secs,
366 - "started_at": r.started_at,
367 - "summary": r.summary,
368 - }))
369 - .collect();
370 - println!("{}", serde_json::to_string_pretty(&summaries)?);
371 - } else if history.is_empty() {
372 - println!("No test run history.");
373 - } else {
374 - for r in &history {
375 - let result = if r.passed { "PASS" } else { "FAIL" };
376 - print!("[{result}] {}", r.target);
377 - if let Some(d) = r.duration_secs {
378 - print!(" ({d}s)");
379 - }
380 - print!(" {}", r.started_at);
381 - if let (Some(p), Some(f)) = (r.summary.total_passed, r.summary.total_failed) {
382 - print!(" — {p} passed, {f} failed");
383 - }
384 - println!();
385 - }
386 - }
387 - }
388 - }
389 -
390 - Ok(())
391 - }
392 -
393 - async fn cmd_prune(
394 - pool: &sqlx::SqlitePool,
395 - days: i64,
396 - ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
397 - let (health_pruned, test_pruned) = db::prune_old_records(pool, days).await?;
398 - println!("Pruned {health_pruned} health checks and {test_pruned} test runs older than {days} days.");
399 - Ok(())
400 - }
401 -
402 - async fn cmd_serve(
403 - pool: &sqlx::SqlitePool,
404 - config: &Config,
405 - ) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
406 - let default_interval = config.serve.interval_secs;
407 - let prune_days = config.serve.prune_days;
408 -
409 - info!("Starting serve mode (default interval: {default_interval}s, prune: {prune_days}d)");
410 -
411 - // Spawn a health check task per target
412 - let mut handles = Vec::new();
413 -
414 - for name in config.target_names() {
415 - let target_config = config.get_target(&name).unwrap().clone();
416 - if let Some(health_config) = target_config.health {
417 - let interval_secs = health_config.interval_secs.unwrap_or(default_interval);
418 - let pool = pool.clone();
419 - let name = name.clone();
420 -
421 - info!("{name}: health check every {interval_secs}s");
422 -
423 - handles.push(tokio::spawn(async move {
424 - let mut interval = tokio::time::interval(
425 - std::time::Duration::from_secs(interval_secs),
426 - );
427 - loop {
428 - interval.tick().await;
429 - let snapshot = http::check_health(&name, &health_config).await;
430 - info!("{}: {} ({}ms)", name, snapshot.status, snapshot.response_time_ms);
431 - if let Err(e) = db::insert_health_check(&pool, &snapshot).await {
432 - tracing::error!("{name}: failed to store health check: {e}");
433 - }
434 - }
435 - }));
436 - }
437 - }
438 -
439 - // Spawn daily prune task
440 - let prune_pool = pool.clone();
441 - handles.push(tokio::spawn(async move {
442 - let mut interval = tokio::time::interval(
443 - std::time::Duration::from_secs(86400),
444 - );
445 - loop {
446 - interval.tick().await;
447 - match db::prune_old_records(&prune_pool, prune_days).await {
448 - Ok((h, t)) => info!("Pruned {h} health checks, {t} test runs"),
449 - Err(e) => tracing::error!("Prune failed: {e}"),
450 - }
451 - }
452 - }));
453 -
454 - // Wait for shutdown signal
455 - let mut sigterm = tokio::signal::unix::signal(
456 - tokio::signal::unix::SignalKind::terminate(),
457 - )?;
458 -
459 - tokio::select! {
460 - _ = tokio::signal::ctrl_c() => {
461 - info!("Received SIGINT, shutting down");
462 - }
463 - _ = sigterm.recv() => {
464 - info!("Received SIGTERM, shutting down");
465 - }
466 - }
467 -
468 - for handle in handles {
469 - handle.abort();
470 - }
471 -
472 - Ok(())
473 138 }
A src/peer.rs +500
@@ -0,0 +1,536 @@
1 + //! Peer mesh — identity, heartbeat monitoring, and mesh state aggregation.
2 +
3 + use std::collections::HashMap;
4 + use std::sync::Arc;
5 + use tokio::sync::RwLock;
6 + use serde::Serialize;
7 +
8 + use crate::alerts::Alerter;
9 + use crate::config::PeerConfig;
10 + use crate::error::{PomError, Result};
11 +
12 + // --- Identity ---
13 +
14 + /// Load or create a persistent instance ID (UUID v4).
15 + ///
16 + /// When `override_id` is `Some`, it is returned verbatim and the filesystem is not touched.
17 + /// Otherwise the ID lives in `<data_local_dir>/pom/instance_id`, where `<data_local_dir>` is
18 + /// whatever `dirs::data_local_dir()` reports (e.g. `~/.local/share` on Linux) — the same
19 + /// directory as `pom.db`. A missing or blank file is replaced with a freshly generated ID.
20 + ///
21 + /// # Errors
22 + ///
23 + /// Returns `PomError::Config` if no data directory can be determined, and propagates I/O
24 + /// errors from creating the directory or from reading or writing the ID file.
25 + pub fn load_or_create_instance_id(
26 +     override_id: Option<&str>,
27 + ) -> Result<String> {
28 +     if let Some(id) = override_id {
29 +         return Ok(id.to_string());
30 +     }
31 +
32 +     let data_dir = dirs::data_local_dir()
33 +         .ok_or_else(|| PomError::Config("Could not determine data directory".into()))?;
34 +     let pom_dir = data_dir.join("pom");
35 +     std::fs::create_dir_all(&pom_dir)?;
36 +     let id_path = pom_dir.join("instance_id");
37 +
38 +     // Read directly rather than probing with `exists()` first, so the file cannot
39 +     // disappear between the check and the read.
40 +     match std::fs::read_to_string(&id_path) {
41 +         Ok(contents) => {
42 +             let id = contents.trim();
43 +             if !id.is_empty() {
44 +                 return Ok(id.to_string());
45 +             }
46 +         }
47 +         // No ID stored yet: fall through and create one.
48 +         Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
49 +         Err(e) => return Err(e.into()),
50 +     }
51 +
52 +     let id = uuid::Uuid::new_v4().to_string();
53 +     std::fs::write(&id_path, &id)?;
54 +     Ok(id)
55 + }
40 +
41 + // --- Types ---
42 +
43 + /// Identity record describing a pom instance.
44 + ///
45 + /// The heartbeat loop decodes one from a peer's `/api/peer/info` response, and
46 + /// `MeshState::instance` stores one (presumably for the local instance — confirm).
47 + #[derive(Debug, Clone, Serialize, serde::Deserialize)]
48 + pub struct InstanceInfo {
49 + /// Instance identifier; a peer's value is compared with its previously seen ID on each heartbeat.
50 + pub id: String,
51 + /// NOTE(review): presumably a human-readable instance name — confirm.
52 + pub name: String,
53 + /// NOTE(review): presumably the pom version the instance runs — confirm.
54 + pub version: String,
55 + /// NOTE(review): presumably the names of the targets this instance monitors — confirm.
56 + pub targets: Vec<String>,
57 + /// Start time as a string; the tests use an RFC 3339 timestamp — TODO confirm format.
58 + pub started_at: String,
59 + }
51 +
52 + /// Liveness of a peer as tracked by the heartbeat handlers.
53 + ///
54 + /// NOTE(review): serde's `lowercase` rule renders `GracePeriod` as `graceperiod`, while the
55 + /// `Display` impl writes `grace_period` — confirm which spelling consumers expect.
56 + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
57 + #[serde(rename_all = "lowercase")]
58 + pub enum PeerStatus {
59 + /// The most recent heartbeat succeeded.
60 + Online,
61 + /// At least one heartbeat has failed, but fewer than the peer's `grace_count` in a row.
62 + GracePeriod,
63 + /// Consecutive failures reached `grace_count`; stays here until a heartbeat succeeds.
64 + Missing,
65 + /// Initial state, before any heartbeat result has been recorded.
66 + Unknown,
67 + }
60 +
61 + /// Writes the status as its snake_case label (`online`, `grace_period`, `missing`, `unknown`).
62 + /// `handle_heartbeat_failure` stores this text as the heartbeat status in the database.
63 + impl std::fmt::Display for PeerStatus {
64 +     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
65 +         // Pick the label first, then emit it with a single write.
66 +         let label = match self {
67 +             Self::Online => "online",
68 +             Self::GracePeriod => "grace_period",
69 +             Self::Missing => "missing",
70 +             Self::Unknown => "unknown",
71 +         };
72 +         f.write_str(label)
73 +     }
74 + }
71 +
72 + /// What to do when a peer transitions to `PeerStatus::Missing`.
73 + ///
74 + /// Deserialized from lowercase strings (`"alert"`, `"log"`, `"ignore"`); defaults to `Log`.
75 + #[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, serde::Deserialize)]
76 + #[serde(rename_all = "lowercase")]
77 + pub enum OnMissing {
78 + /// Log a warning and notify through the `Alerter`, if one is configured; also enables recovery alerts.
79 + Alert,
80 + /// Log the transition at info level only.
81 + #[default]
82 + Log,
83 + /// Neither log nor alert on the transition.
84 + Ignore,
85 + }
80 +
81 + /// Runtime state tracked for one configured peer.
82 + ///
83 + /// Serializable, except for the fields marked `#[serde(skip)]`.
84 + #[derive(Debug, Clone, Serialize)]
85 + pub struct PeerState {
86 + /// Peer address copied from its `PeerConfig`; interpolated into the heartbeat URLs after `http://`.
87 + pub address: String,
88 + /// Action to take when the peer goes missing.
89 + pub on_missing: OnMissing,
90 + /// Number of consecutive failed heartbeats at which the peer is marked `Missing`.
91 + pub grace_count: u32,
92 + /// Current liveness; starts as `Unknown`.
93 + pub status: PeerStatus,
94 + /// Info from the last successful heartbeat (`None` if its body could not be decoded).
95 + pub info: Option<InstanceInfo>,
96 + /// RFC 3339 UTC timestamp of the last successful heartbeat.
97 + pub last_seen: Option<String>,
98 + /// Duration of the last successful `/api/peer/info` request, in milliseconds.
99 + pub latency_ms: Option<u64>,
100 + /// Failed heartbeats since the last success.
101 + pub consecutive_failures: u32,
102 + /// Instance ID recorded on first contact; later responses are checked against it.
103 + #[serde(skip)]
104 + pub known_id: Option<String>,
105 + /// Cached status data from the peer's /api/peer/status endpoint.
106 + #[serde(skip)]
107 + pub status_data: Option<serde_json::Value>,
108 + }
97 +
98 + // --- Mesh State ---
99 +
100 + /// In-memory view of the mesh: one `InstanceInfo` plus the state of every configured peer.
101 + #[derive(Debug)]
102 + pub struct MeshState {
103 + /// Info record supplied when the state is created (presumably the local instance — confirm).
104 + pub instance: InstanceInfo,
105 + /// Per-peer state, keyed by the peer's name from the peer configuration map.
106 + pub peers: HashMap<String, PeerState>,
107 + }
105 +
106 + /// `MeshState` behind an `Arc<RwLock<..>>` (tokio's async `RwLock`), shared by the heartbeat tasks.
107 + pub type SharedMeshState = Arc<RwLock<MeshState>>;
107 +
108 + /// Build the shared mesh state for `instance` and its configured peers.
109 + ///
110 + /// Every entry of `peer_configs` becomes a `PeerState` under the same name. Each peer starts
111 + /// as `PeerStatus::Unknown` with no info, no last-seen time, no latency, zero failures and
112 + /// no known ID. A peer without an explicit `grace_count` gets 3.
113 + pub fn new_mesh_state(
114 +     instance: InstanceInfo,
115 +     peer_configs: &HashMap<String, PeerConfig>,
116 + ) -> SharedMeshState {
117 +     // Grace count applied when the peer's configuration leaves it unset.
118 +     const DEFAULT_GRACE_COUNT: u32 = 3;
119 +
120 +     let peers: HashMap<String, PeerState> = peer_configs
121 +         .iter()
122 +         .map(|(peer_name, peer_cfg)| {
123 +             let initial = PeerState {
124 +                 address: peer_cfg.address.clone(),
125 +                 on_missing: peer_cfg.on_missing,
126 +                 grace_count: peer_cfg.grace_count.unwrap_or(DEFAULT_GRACE_COUNT),
127 +                 status: PeerStatus::Unknown,
128 +                 info: None,
129 +                 last_seen: None,
130 +                 latency_ms: None,
131 +                 consecutive_failures: 0,
132 +                 known_id: None,
133 +                 status_data: None,
134 +             };
135 +             (peer_name.clone(), initial)
136 +         })
137 +         .collect();
138 +
139 +     Arc::new(RwLock::new(MeshState { instance, peers }))
140 + }
132 +
133 + // --- Heartbeat ---
134 +
135 + /// Start one background heartbeat task per peer currently registered in `mesh`.
136 + ///
137 + /// Every task runs `heartbeat_loop` with its own clones of the mesh handle, the database
138 + /// pool and the optional alerter. The join handles are returned in the order the peer
139 + /// names were read from the map.
140 + pub async fn spawn_heartbeat_tasks(
141 +     mesh: SharedMeshState,
142 +     pool: sqlx::SqlitePool,
143 +     interval_secs: u64,
144 +     alerter: Option<Alerter>,
145 + ) -> Vec<tokio::task::JoinHandle<()>> {
146 +     // Snapshot the names first; the read guard is a temporary released when this statement ends.
147 +     let names: Vec<String> = mesh.read().await.peers.keys().cloned().collect();
148 +
149 +     names
150 +         .into_iter()
151 +         .map(|peer_name| {
152 +             let task_mesh = Arc::clone(&mesh);
153 +             let task_pool = pool.clone();
154 +             let task_alerter = alerter.clone();
155 +             tokio::spawn(async move {
156 +                 heartbeat_loop(&peer_name, task_mesh, task_pool, interval_secs, task_alerter).await;
157 +             })
158 +         })
159 +         .collect()
160 + }
157 +
158 + /// Poll one peer forever: heartbeat via `/api/peer/info`, then refresh its cached status.
159 + ///
160 + /// Each round times a GET to `http://{address}/api/peer/info` and hands the outcome to
161 + /// `handle_heartbeat_success` or `handle_heartbeat_failure`, then GETs `/api/peer/status`
162 + /// and stores the decoded JSON in the peer's `status_data`. Returns immediately if
163 + /// `peer_name` is not in `mesh`; otherwise it never returns on its own.
164 + async fn heartbeat_loop(
165 +     peer_name: &str,
166 +     mesh: SharedMeshState,
167 +     pool: sqlx::SqlitePool,
168 +     interval_secs: u64,
169 +     alerter: Option<Alerter>,
170 + ) {
171 +     // `tokio::time::interval` panics on a zero period, so clamp a misconfigured 0 to 1s.
172 +     let period = std::time::Duration::from_secs(interval_secs.max(1));
173 +     let mut interval = tokio::time::interval(period);
174 +     // A slow or timing-out peer can make a round outlast the period. Space the following
175 +     // ticks out again instead of firing all the missed ones back-to-back.
176 +     interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
177 +     // Skip the first immediate tick — give peers time to start up
178 +     interval.tick().await;
179 +
180 +     let address = {
181 +         let state = mesh.read().await;
182 +         match state.peers.get(peer_name) {
183 +             Some(p) => p.address.clone(),
184 +             None => return,
185 +         }
186 +     };
187 +
188 +     // If the builder fails, fall back to a default client (which has no 10s timeout).
189 +     let client = reqwest::Client::builder()
190 +         .timeout(std::time::Duration::from_secs(10))
191 +         .build()
192 +         .unwrap_or_default();
193 +
194 +     loop {
195 +         interval.tick().await;
196 +
197 +         let start = std::time::Instant::now();
198 +         let result = client
199 +             .get(format!("http://{address}/api/peer/info"))
200 +             .send()
201 +             .await;
202 +         let latency_ms = u64::try_from(start.elapsed().as_millis()).unwrap_or(u64::MAX);
203 +
204 +         // Transport errors and non-2xx responses both count as a failed heartbeat.
205 +         match result.and_then(|r| r.error_for_status()) {
206 +             Ok(response) => {
207 +                 // An undecodable body still counts as "alive", just without info.
208 +                 let info: Option<InstanceInfo> = response.json().await.ok();
209 +                 handle_heartbeat_success(peer_name, &mesh, &pool, info, latency_ms, &alerter).await;
210 +             }
211 +             Err(e) => {
212 +                 tracing::debug!("{peer_name}: heartbeat request failed: {e}");
213 +                 handle_heartbeat_failure(peer_name, &mesh, &pool, latency_ms, &alerter).await;
214 +             }
215 +         }
216 +
217 +         // Also fetch /api/peer/status for mesh aggregation. Reject non-2xx responses so an
218 +         // error body is never cached as if it were status data.
219 +         let status_result = client
220 +             .get(format!("http://{address}/api/peer/status"))
221 +             .send()
222 +             .await
223 +             .and_then(|r| r.error_for_status());
224 +         match status_result {
225 +             Ok(resp) => match resp.json::<serde_json::Value>().await {
226 +                 Ok(data) => {
227 +                     let mut state = mesh.write().await;
228 +                     if let Some(peer) = state.peers.get_mut(peer_name) {
229 +                         peer.status_data = Some(data);
230 +                     }
231 +                 }
232 +                 Err(e) => {
233 +                     tracing::debug!("{peer_name}: could not decode /api/peer/status body: {e}");
234 +                 }
235 +             },
236 +             Err(e) => {
237 +                 tracing::debug!("{peer_name}: failed to fetch /api/peer/status: {e}");
238 +             }
239 +         }
240 +     }
241 + }
222 +
223 + /// Record a successful heartbeat for `peer_name`.
224 + ///
225 + /// Under the mesh write lock: remembers the peer's ID on first contact (and warns if a later
226 + /// response reports a different ID), marks the peer `Online`, stores `info`, the current UTC
227 + /// timestamp and `latency_ms`, and resets the failure counter. After the lock is released:
228 + /// persists a first-contact ID and an `"online"` heartbeat row, and sends a recovery alert
229 + /// if the peer had been declared `Missing` and is configured with `OnMissing::Alert`.
230 + ///
231 + /// Does nothing if `peer_name` is not in the mesh. Database failures are logged, not returned.
232 + async fn handle_heartbeat_success(
233 +     peer_name: &str,
234 +     mesh: &SharedMeshState,
235 +     pool: &sqlx::SqlitePool,
236 +     info: Option<InstanceInfo>,
237 +     latency_ms: u64,
238 +     alerter: &Option<Alerter>,
239 + ) {
240 +     let now = chrono::Utc::now().to_rfc3339();
241 +
242 +     // Update in-memory state under lock, collect data for DB writes
243 +     let (first_contact_id, recovery_info) = {
244 +         let mut state = mesh.write().await;
245 +         let Some(peer) = state.peers.get_mut(peer_name) else {
246 +             return;
247 +         };
248 +
249 +         let previous_status = peer.status;
250 +         let was_degraded =
251 +             matches!(previous_status, PeerStatus::Missing | PeerStatus::GracePeriod);
252 +
253 +         // Check UUID consistency
254 +         let mut first_contact = None;
255 +         if let Some(ref info) = info {
256 +             match &peer.known_id {
257 +                 None => {
258 +                     peer.known_id = Some(info.id.clone());
259 +                     first_contact = Some(info.id.clone());
260 +                     tracing::info!("{peer_name}: first contact, id={}", info.id);
261 +                 }
262 +                 Some(known) if known != &info.id => {
263 +                     tracing::warn!(
264 +                         "{peer_name}: UUID mismatch! expected={known}, got={}. Possible impersonation.",
265 +                         info.id
266 +                     );
267 +                 }
268 +                 _ => {}
269 +             }
270 +         }
271 +
272 +         // Pair recovery alerts with an earlier "missing" alert: a peer that only dipped into
273 +         // its grace period never triggered one, so it must not announce a recovery either.
274 +         let recovery = (previous_status == PeerStatus::Missing
275 +             && peer.on_missing == OnMissing::Alert)
276 +             .then(|| peer.address.clone());
277 +
278 +         if was_degraded {
279 +             tracing::info!("{peer_name}: recovered (was {:?})", previous_status);
280 +         }
281 +
282 +         peer.status = PeerStatus::Online;
283 +         peer.info = info;
284 +         peer.last_seen = Some(now);
285 +         peer.latency_ms = Some(latency_ms);
286 +         peer.consecutive_failures = 0;
287 +
288 +         (first_contact, recovery)
289 +     };
290 +     // Lock dropped — DB writes and alerts happen without holding mesh lock
291 +
292 +     // Persistence is best-effort, but failures are logged rather than silently discarded.
293 +     if let Some(id) = first_contact_id {
294 +         if let Err(e) = crate::db::store_peer_identity(pool, peer_name, &id).await {
295 +             tracing::error!("{peer_name}: failed to store peer identity: {e}");
296 +         }
297 +     }
298 +     if let Err(e) =
299 +         crate::db::insert_peer_heartbeat(pool, peer_name, "online", latency_ms as i64).await
300 +     {
301 +         tracing::error!("{peer_name}: failed to store heartbeat: {e}");
302 +     }
303 +
304 +     if let (Some(address), Some(alerter)) = (recovery_info, alerter) {
305 +         alerter.send_peer_recovery(peer_name, &address).await;
306 +     }
307 + }
291 +
292 + /// Record a failed heartbeat for `peer_name`.
293 + ///
294 + /// Under the mesh write lock: bumps the consecutive-failure counter and moves the peer to
295 + /// `GracePeriod`, or to `Missing` once the counter reaches its `grace_count` (a peer that
296 + /// is already `Missing` stays there). On the transition into `Missing` the peer's
297 + /// `on_missing` policy decides whether to warn and alert, log at info level, or stay
298 + /// silent. After the lock is released: stores a heartbeat row carrying the new status and
299 + /// sends the "missing" alert if one is due and an alerter is configured.
300 + ///
301 + /// Does nothing if `peer_name` is not in the mesh. Database failures are logged, not returned.
302 + async fn handle_heartbeat_failure(
303 +     peer_name: &str,
304 +     mesh: &SharedMeshState,
305 +     pool: &sqlx::SqlitePool,
306 +     latency_ms: u64,
307 +     alerter: &Option<Alerter>,
308 + ) {
309 +     // Update in-memory state under lock, collect data for alert after lock drop
310 +     let (new_status, alert_info) = {
311 +         let mut state = mesh.write().await;
312 +         let Some(peer) = state.peers.get_mut(peer_name) else {
313 +             return;
314 +         };
315 +
316 +         // Saturate instead of overflowing if a peer stays down for a very long time.
317 +         peer.consecutive_failures = peer.consecutive_failures.saturating_add(1);
318 +
319 +         let new_status = match peer.status {
320 +             PeerStatus::Online | PeerStatus::Unknown | PeerStatus::GracePeriod => {
321 +                 if peer.consecutive_failures >= peer.grace_count {
322 +                     PeerStatus::Missing
323 +                 } else {
324 +                     PeerStatus::GracePeriod
325 +                 }
326 +             }
327 +             PeerStatus::Missing => PeerStatus::Missing,
328 +         };
329 +
330 +         let transitioned_to_missing =
331 +             new_status == PeerStatus::Missing && peer.status != PeerStatus::Missing;
332 +
333 +         // Collect alert data before mutating state
334 +         let alert_info = if transitioned_to_missing {
335 +             match peer.on_missing {
336 +                 OnMissing::Alert => {
337 +                     tracing::warn!(
338 +                         "{peer_name}: MISSING after {} consecutive failures (action: alert)",
339 +                         peer.consecutive_failures
340 +                     );
341 +                     Some((peer.address.clone(), peer.consecutive_failures))
342 +                 }
343 +                 OnMissing::Log => {
344 +                     tracing::info!(
345 +                         "{peer_name}: missing after {} consecutive failures",
346 +                         peer.consecutive_failures
347 +                     );
348 +                     None
349 +                 }
350 +                 OnMissing::Ignore => None,
351 +             }
352 +         } else {
353 +             None
354 +         };
355 +
356 +         peer.status = new_status;
357 +
358 +         (new_status, alert_info)
359 +     };
360 +     // Lock dropped — DB write and alerts happen without holding mesh lock
361 +
362 +     // Persistence is best-effort, but a failure is logged rather than silently discarded.
363 +     let status_str = new_status.to_string();
364 +     if let Err(e) =
365 +         crate::db::insert_peer_heartbeat(pool, peer_name, &status_str, latency_ms as i64).await
366 +     {
367 +         tracing::error!("{peer_name}: failed to store heartbeat: {e}");
368 +     }
369 +
370 +     if let (Some((address, failures)), Some(alerter)) = (alert_info, alerter) {
371 +         alerter.send_peer_missing(peer_name, &address, failures).await;
372 +     }
373 + }
357 +
358 + #[cfg(test)]
359 + mod tests {
360 + use super::*;
361 +
362 + #[test]
363 + fn override_id_takes_precedence() {
364 + let id = load_or_create_instance_id(Some("override-id")).unwrap();
365 + assert_eq!(id, "override-id");
366 + }
367 +
368 + #[test]
369 + fn auto_id_is_valid_uuid() {
370 + let id = load_or_create_instance_id(None).unwrap();
371 + assert!(uuid::Uuid::parse_str(&id).is_ok());
372 + }
373 +
374 + #[test]
375 + fn on_missing_deserialize() {
376 + #[derive(serde::Deserialize)]
377 + struct Wrapper {
378 + #[serde(default)]
379 + on_missing: OnMissing,
380 + }
381 +
382 + let w: Wrapper = toml::from_str(r#"on_missing = "alert""#).unwrap();
383 + assert_eq!(w.on_missing, OnMissing::Alert);
384 +
385 + let w: Wrapper = toml::from_str(r#"on_missing = "log""#).unwrap();
386 + assert_eq!(w.on_missing, OnMissing::Log);
387 +
388 + let w: Wrapper = toml::from_str(r#"on_missing = "ignore""#).unwrap();
389 + assert_eq!(w.on_missing, OnMissing::Ignore);
390 +
391 + // Default is Log
392 + let w: Wrapper = toml::from_str("").unwrap();
393 + assert_eq!(w.on_missing, OnMissing::Log);
394 + }
395 +
396 + fn test_instance_info() -> InstanceInfo {
397 + InstanceInfo {
398 + id: "test-id".to_string(),
399 + name: "test".to_string(),
400 + version: "0.1.0".to_string(),
401 + targets: vec![],
402 + started_at: "2026-03-10T00:00:00Z".to_string(),
403 + }
404 + }
405 +
406 + fn test_mesh_with_peer(grace_count: u32) -> SharedMeshState {
407 + let mut peer_configs = HashMap::new();
408 + peer_configs.insert(
409 + "peer1".to_string(),
410 + PeerConfig {
411 + address: "10.0.0.1:9100".to_string(),
412 + on_missing: OnMissing::Alert,
413 + grace_count: Some(grace_count),
414 + },
415 + );
416 + new_mesh_state(test_instance_info(), &peer_configs)
417 + }
418 +
419 + #[tokio::test]
420 + async fn heartbeat_failure_transitions_through_grace_to_missing() {
421 + let pool = crate::db::connect_in_memory().await.unwrap();
422 + let mesh = test_mesh_with_peer(3);
423 +
424 + // Start at Unknown
425 + assert_eq!(mesh.read().await.peers["peer1"].status, PeerStatus::Unknown);
426 +
427 + // First failure → GracePeriod
428 + handle_heartbeat_failure("peer1", &mesh, &pool, 0, &None).await;
429 + assert_eq!(mesh.read().await.peers["peer1"].status, PeerStatus::GracePeriod);
430 +
431 + // Second failure → still GracePeriod
432 + handle_heartbeat_failure("peer1", &mesh, &pool, 0, &None).await;
433 + assert_eq!(mesh.read().await.peers["peer1"].status, PeerStatus::GracePeriod);
434 +
435 + // Third failure (= grace_count) → Missing
436 + handle_heartbeat_failure("peer1", &mesh, &pool, 0, &None).await;
437 + assert_eq!(mesh.read().await.peers["peer1"].status, PeerStatus::Missing);
438 +
439 + // Fourth failure → stays Missing
440 + handle_heartbeat_failure("peer1", &mesh, &pool, 0, &None).await;
441 + assert_eq!(mesh.read().await.peers["peer1"].status, PeerStatus::Missing);
442 + }
443 +
444 + #[tokio::test]
445 + async fn heartbeat_success_recovers_from_missing() {
446 + let pool = crate::db::connect_in_memory().await.unwrap();
447 + let mesh = test_mesh_with_peer(1);
448 +
449 + // Drive to Missing
450 + handle_heartbeat_failure("peer1", &mesh, &pool, 0, &None).await;
451 + assert_eq!(mesh.read().await.peers["peer1"].status, PeerStatus::Missing);
452 +
453 + // Success → Online
454 + let info = InstanceInfo {
455 + id: "remote-id".to_string(),
456 + name: "remote".to_string(),
457 + version: "0.1.0".to_string(),
458 + targets: vec![],
459 + started_at: "2026-03-10T00:00:00Z".to_string(),
460 + };
461 + handle_heartbeat_success("peer1", &mesh, &pool, Some(info), 42, &None).await;
462 +
463 + let state = mesh.read().await;
464 + let peer = &state.peers["peer1"];
465 + assert_eq!(peer.status, PeerStatus::Online);
466 + assert_eq!(peer.consecutive_failures, 0);
467 + assert_eq!(peer.latency_ms, Some(42));
468 + assert_eq!(peer.known_id.as_deref(), Some("remote-id"));
469 + }
470 +
471 + #[tokio::test]
472 + async fn heartbeat_success_detects_uuid_stored_on_first_contact() {
473 + let pool = crate::db::connect_in_memory().await.unwrap();
474 + let mesh = test_mesh_with_peer(3);
475 +
476 + let info = InstanceInfo {
477 + id: "uuid-abc".to_string(),
478 + name: "remote".to_string(),
479 + version: "0.1.0".to_string(),
480 + targets: vec![],
481 + started_at: "2026-03-10T00:00:00Z".to_string(),
482 + };
483 + handle_heartbeat_success("peer1", &mesh, &pool, Some(info), 10, &None).await;
484 +
485 + // UUID should be persisted in DB
486 + let stored = crate::db::get_peer_identity(&pool, "peer1").await.unwrap();
487 + assert_eq!(stored, Some("uuid-abc".to_string()));
488 + }
489 +
490 + #[tokio::test]
491 + async fn heartbeat_records_to_db() {
492 + let pool = crate::db::connect_in_memory().await.unwrap();
493 + let mesh = test_mesh_with_peer(3);
494 +
495 + handle_heartbeat_failure("peer1", &mesh, &pool, 0, &None).await;
496 + handle_heartbeat_success("peer1", &mesh, &pool, None, 55, &None).await;
497 +
498 + let history = crate::db::get_peer_heartbeat_history(&pool, "peer1", 10).await.unwrap();
499 + assert_eq!(history.len(), 2);
500 + // Most recent first
Lines truncated
M src/tools/health.rs +145 -11
M src/tools/mod.rs +15 -5
M src/types.rs +255