Skip to main content

max / makenotwork

Fail closed when ClamAV can't fully scan (CHRONIC S1) A reachable clamd that replies with an error (INSTREAM size-limit, scan limits, unparseable reply) was collapsed into one LayerVerdict::Error under the FailOpen `clamav` layer, so sizing a payload past clamd's StreamMaxLength dropped the ClamAV layer → Clean on a healthy daemon (open across runs #18-#21). Structural fix: parse_clamav_response now returns a typed ClamavResponse (Clean | Infected | NotFullyScanned), mapped at one place. NotFullyScanned is emitted under a dedicated `clamav_incomplete` layer whose error policy is FailClosed — distinct from the FailOpen `clamav` layer kept for an unreachable daemon/timeout (the transport-error path). A future reply kind can't silently fall into the FailOpen bucket. Second half of the fix: resolve_pass_status no longer downgrades a pipeline HeldForReview to Clean for trusted uploaders. The ErrorPolicy::FailClosed contract is "hold for admin review" unconditionally; the old code ignored result.status and would have turned the clamav_incomplete hold (and every other FailClosed layer error, incl. oversize-to-spool) back into Clean for trusted users. Tests: typed-parse + clamav_incomplete-routing, final_status holds on clamav_incomplete, resolve_pass_status no-downgrade for trusted users. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Author: Max Johnson <me@maxj.phd> · 2026-06-15 23:10 UTC
Commit: aa2b1fe4ad82fc2e5b0e8879da17ed38d62ad0d4
Parent: deae4c6
3 files changed, +176 insertions, -54 deletions
@@ -153,7 +153,7 @@ async fn scan_instream(socket_path: &str, data: &[u8]) -> Result<LayerResult, St
153 153 let response_str = String::from_utf8_lossy(&response);
154 154 let response_str = response_str.trim_end_matches('\0').trim();
155 155
156 - Ok(parse_clamav_response(response_str))
156 + Ok(clamav_layer_result(response_str))
157 157 }
158 158
159 159 async fn scan_instream_streaming<R>(
@@ -221,23 +221,41 @@ where
221 221 let response_str = String::from_utf8_lossy(&response);
222 222 let response_str = response_str.trim_end_matches('\0').trim();
223 223
224 - Ok(parse_clamav_response(response_str))
224 + Ok(clamav_layer_result(response_str))
225 225 }
226 226
227 - /// Parse a ClamAV INSTREAM response string into a LayerResult.
227 + /// Classified outcome of a clamd INSTREAM reply we *successfully received*.
228 + ///
229 + /// The distinction this enum forces is the whole point of CHRONIC S1's fix:
230 + /// transport failures (unreachable daemon / timeout) are handled in the scan
231 + /// entry points and are legitimately FailOpen — clamd being down must not block
232 + /// every upload. But by the time we are parsing a reply, **clamd is reachable**.
233 + /// An error reply at that point (size/scan-size/recursion limit, or anything we
234 + /// can't parse) means the file was *not cleanly scanned* — a coverage gap that
235 + /// must fail CLOSED, never be conflated into the FailOpen unreachable bucket.
236 + /// Because this is an exhaustive enum mapped at one place
237 + /// ([`clamav_layer_result`]), a future reply kind can't silently fall into the
238 + /// FailOpen path the way the old single `LayerVerdict::Error` return did.
239 + #[derive(Debug, PartialEq, Eq)]
240 + enum ClamavResponse {
241 + Clean,
242 + Infected(String),
243 + /// clamd responded but produced no clean verdict — a limit was hit or the
244 + /// reply was unparseable. The file was not fully scanned ⇒ fail closed.
245 + NotFullyScanned(String),
246 + }
247 +
248 + /// Parse a clamd INSTREAM response string into a [`ClamavResponse`].
228 249 /// Extracted for testability — the socket/IO layer just feeds the string in.
229 250 ///
230 251 /// ClamAV response format:
231 252 /// - `"stream: OK"` — clean
232 253 /// - `"stream: <virus_name> FOUND"` — infected
233 - /// - `"stream: <error> ERROR"` — scan error
234 - fn parse_clamav_response(response: &str) -> LayerResult {
254 + /// - `"stream: <error> ERROR"` (e.g. `"INSTREAM size limit exceeded"`),
255 + /// empty, or anything else — clamd is reachable but did not fully scan.
256 + fn parse_clamav_response(response: &str) -> ClamavResponse {
235 257 if response == "stream: OK" {
236 - LayerResult {
237 - layer: "clamav",
238 - verdict: LayerVerdict::Pass,
239 - detail: None,
240 - }
258 + ClamavResponse::Clean
241 259 } else if response.ends_with("FOUND") {
242 260 // Extract virus name: "stream: Eicar-Signature FOUND" → "Eicar-Signature"
243 261 let virus_name = response
@@ -245,18 +263,36 @@ fn parse_clamav_response(response: &str) -> LayerResult {
245 263 .unwrap_or(response)
246 264 .strip_suffix(" FOUND")
247 265 .unwrap_or(response);
266 + ClamavResponse::Infected(virus_name.to_string())
267 + } else {
268 + ClamavResponse::NotFullyScanned(response.to_string())
269 + }
270 + }
248 271
249 - LayerResult {
272 + /// Map a *received* clamd reply to a [`LayerResult`]. A `NotFullyScanned`
273 + /// outcome is emitted under the dedicated `clamav_incomplete` layer, whose
274 + /// `error_policy_for` entry is **FailClosed**, so an un-scanned file is held for
275 + /// review. A genuinely unreachable daemon / timeout is reported separately in
276 + /// the scan entry points under the `clamav` layer (FailOpen). Routing the two
277 + /// error kinds to different layer identities is the same mechanism the
278 + /// `scan_panic` / `scan_size_limit` pseudo-layers already use to pick a policy.
279 + fn clamav_layer_result(response: &str) -> LayerResult {
280 + match parse_clamav_response(response) {
281 + ClamavResponse::Clean => LayerResult {
250 282 layer: "clamav",
251 - verdict: LayerVerdict::Fail,
252 - detail: Some(format!("ClamAV detection: {}", virus_name)),
253 - }
254 - } else {
255 - LayerResult {
283 + verdict: LayerVerdict::Pass,
284 + detail: None,
285 + },
286 + ClamavResponse::Infected(name) => LayerResult {
256 287 layer: "clamav",
288 + verdict: LayerVerdict::Fail,
289 + detail: Some(format!("ClamAV detection: {name}")),
290 + },
291 + ClamavResponse::NotFullyScanned(resp) => LayerResult {
292 + layer: "clamav_incomplete",
257 293 verdict: LayerVerdict::Error,
258 - detail: Some(format!("Unexpected ClamAV response: {}", response)),
259 - }
294 + detail: Some(format!("ClamAV did not fully scan (held for review): {resp}")),
295 + },
260 296 }
261 297 }
262 298
@@ -268,8 +304,10 @@ mod tests {
268 304
269 305 #[test]
270 306 fn clean_response_passes() {
271 - let result = parse_clamav_response("stream: OK");
307 + assert_eq!(parse_clamav_response("stream: OK"), ClamavResponse::Clean);
308 + let result = clamav_layer_result("stream: OK");
272 309 assert_eq!(result.verdict, LayerVerdict::Pass);
310 + assert_eq!(result.layer, "clamav");
273 311 assert!(result.detail.is_none());
274 312 }
275 313
@@ -277,47 +315,67 @@ mod tests {
277 315
278 316 #[test]
279 317 fn eicar_detection_fails() {
280 - let result = parse_clamav_response("stream: Eicar-Signature FOUND");
318 + assert_eq!(
319 + parse_clamav_response("stream: Eicar-Signature FOUND"),
320 + ClamavResponse::Infected("Eicar-Signature".to_string())
321 + );
322 + let result = clamav_layer_result("stream: Eicar-Signature FOUND");
281 323 assert_eq!(result.verdict, LayerVerdict::Fail);
282 - let detail = result.detail.unwrap();
283 - assert!(detail.contains("Eicar-Signature"));
324 + assert_eq!(result.layer, "clamav");
325 + assert!(result.detail.unwrap().contains("Eicar-Signature"));
284 326 }
285 327
286 328 #[test]
287 329 fn complex_virus_name_extracted() {
288 - let result = parse_clamav_response("stream: Win.Test.EICAR_HDB-1 FOUND");
330 + let result = clamav_layer_result("stream: Win.Test.EICAR_HDB-1 FOUND");
289 331 assert_eq!(result.verdict, LayerVerdict::Fail);
290 - let detail = result.detail.unwrap();
291 - assert!(detail.contains("Win.Test.EICAR_HDB-1"));
332 + assert!(result.detail.unwrap().contains("Win.Test.EICAR_HDB-1"));
292 333 }
293 334
294 335 #[test]
295 336 fn trojan_detection_fails() {
296 - let result = parse_clamav_response("stream: Win.Trojan.Agent-123456 FOUND");
337 + let result = clamav_layer_result("stream: Win.Trojan.Agent-123456 FOUND");
297 338 assert_eq!(result.verdict, LayerVerdict::Fail);
298 - let detail = result.detail.unwrap();
299 - assert!(detail.contains("Win.Trojan.Agent-123456"));
339 + assert!(result.detail.unwrap().contains("Win.Trojan.Agent-123456"));
300 340 }
301 341
302 - // -- Error responses --
342 + // -- Not-fully-scanned responses (clamd reachable, no clean verdict) --
343 + //
344 + // CHRONIC S1: every one of these must FAIL CLOSED, not FailOpen. They are
345 + // emitted under the `clamav_incomplete` layer (FailClosed in
346 + // `error_policy_for`), distinct from the FailOpen `clamav` layer used for an
347 + // unreachable daemon.
303 348
304 349 #[test]
305 - fn empty_response_is_error() {
306 - let result = parse_clamav_response("");
307 - assert_eq!(result.verdict, LayerVerdict::Error);
350 + fn empty_response_is_not_fully_scanned() {
351 + assert_eq!(
352 + parse_clamav_response(""),
353 + ClamavResponse::NotFullyScanned(String::new())
354 + );
355 + assert_eq!(clamav_layer_result("").layer, "clamav_incomplete");
356 + assert_eq!(clamav_layer_result("").verdict, LayerVerdict::Error);
308 357 }
309 358
310 359 #[test]
311 - fn garbage_response_is_error() {
312 - let result = parse_clamav_response("this is not a valid response");
360 + fn garbage_response_is_not_fully_scanned() {
361 + let result = clamav_layer_result("this is not a valid response");
362 + assert_eq!(result.layer, "clamav_incomplete");
313 363 assert_eq!(result.verdict, LayerVerdict::Error);
314 - assert!(result.detail.unwrap().contains("Unexpected ClamAV response"));
315 364 }
316 365
317 366 #[test]
318 - fn error_keyword_in_response_is_error() {
319 - // ClamAV may respond with "stream: <message> ERROR" for scan errors
320 - let result = parse_clamav_response("stream: INSTREAM size limit exceeded ERROR");
367 + fn size_limit_error_fails_closed_not_open() {
368 + // The regression that defined CHRONIC S1: a payload sized past clamd's
369 + // StreamMaxLength yields this reply on a HEALTHY clamd. It must route to
370 + // the FailClosed `clamav_incomplete` layer, never the FailOpen `clamav`
371 + // layer (which would skip → Clean).
372 + let response = "stream: INSTREAM size limit exceeded ERROR";
373 + assert!(matches!(
374 + parse_clamav_response(response),
375 + ClamavResponse::NotFullyScanned(_)
376 + ));
377 + let result = clamav_layer_result(response);
378 + assert_eq!(result.layer, "clamav_incomplete");
321 379 assert_eq!(result.verdict, LayerVerdict::Error);
322 380 }
323 381 }
@@ -79,6 +79,12 @@ fn error_policy_for(layer: &str) -> ErrorPolicy {
79 79 "archive" => archive::ERROR_POLICY,
80 80 "yara" => yara::ERROR_POLICY,
81 81 "clamav" => clamav::ERROR_POLICY,
82 + // A *reachable* clamd that couldn't fully scan (size/scan limit,
83 + // unparseable reply) is a coverage gap, not an outage — fail closed,
84 + // distinct from the FailOpen `clamav` layer used for an unreachable
85 + // daemon. Splitting the policy by layer identity is what keeps CHRONIC
86 + // S1 (size-limit → FailOpen → Clean) from recurring.
87 + "clamav_incomplete" => ErrorPolicy::FailClosed,
82 88 "malwarebazaar" => hash_lookup::ERROR_POLICY,
83 89 "urlhaus" => urlhaus::ERROR_POLICY,
84 90 "signing_macos" => signing_macos::ERROR_POLICY,
@@ -809,6 +815,17 @@ mod tests {
809 815 }
810 816
811 817 #[test]
818 + fn clamav_incomplete_is_fail_closed_and_holds() {
819 + // CHRONIC S1: a reachable-but-incomplete clamav scan is emitted under the
820 + // `clamav_incomplete` layer, which must be FailClosed → HeldForReview,
821 + // distinct from the FailOpen `clamav` layer used for an unreachable daemon.
822 + assert_eq!(error_policy_for("clamav_incomplete"), ErrorPolicy::FailClosed);
823 + assert_eq!(error_policy_for("clamav"), ErrorPolicy::FailOpen);
824 + let layers = vec![pass("content_type"), pass("structural"), pass("archive"), skip("yara"), err("clamav_incomplete")];
825 + assert_eq!(final_status(&layers), FileScanStatus::HeldForReview);
826 + }
827 +
828 + #[test]
812 829 fn final_status_held_on_unknown_layer_error() {
813 830 // Defensive default: an unknown layer name that errors falls through
814 831 // to FailClosed. This is what catches a new layer added without
@@ -52,15 +52,31 @@ pub struct WorkerContext {
52 52
53 53 /// Decide the final status for a non-quarantined scan.
54 54 ///
55 - /// A trusted uploader normally passes (`Clean`); an untrusted one always routes
56 - /// to admin review. The ClamAV degraded-mode overlay adds one exception: if the
57 - /// runtime probe has found clamd DOWN *and* this scan's clamav layer errored
58 - /// (FailOpen would otherwise skip it), even a trusted upload is held for review
59 - /// rather than passed on zero real AV coverage. A single transient scan error
60 - /// while the probe still reports clamd healthy does NOT trip this — only a
61 - /// sustained outage the probe has observed — so transient blips don't flood the
62 - /// review queue.
63 - fn resolve_pass_status(is_trusted: bool, clamav_down: bool, layers: &[LayerResult]) -> FileScanStatus {
55 + /// First, the pipeline's own status is authoritative when it already says
56 + /// `HeldForReview`: that means a `FailClosed` layer errored (a structural/YARA
57 + /// panic, an oversize-to-spool file, or a reachable-but-incomplete clamav scan),
58 + /// i.e. the file was *not fully scanned*. `ErrorPolicy::FailClosed` means "hold
59 + /// for admin review" unconditionally — including for trusted uploaders — so we
60 + /// never downgrade it here. (CHRONIC S1's fix for trusted uploaders depends on
61 + /// this: the `clamav_incomplete` hold would otherwise be silently turned back
62 + /// into `Clean` below.)
63 + ///
64 + /// Otherwise the pipeline returned `Clean`: a trusted uploader normally passes;
65 + /// an untrusted one always routes to admin review. The ClamAV degraded-mode
66 + /// overlay adds one exception — if the runtime probe has found clamd DOWN *and*
67 + /// this scan's clamav (transport) layer errored (FailOpen made `final_status`
68 + /// skip it → Clean), even a trusted upload is held rather than passed on zero AV
69 + /// coverage. A single transient error while the probe still reports clamd
70 + /// healthy does NOT trip this — only a sustained outage the probe has observed.
71 + fn resolve_pass_status(
72 + pipeline_status: FileScanStatus,
73 + is_trusted: bool,
74 + clamav_down: bool,
75 + layers: &[LayerResult],
76 + ) -> FileScanStatus {
77 + if pipeline_status == FileScanStatus::HeldForReview {
78 + return FileScanStatus::HeldForReview;
79 + }
64 80 let clamav_errored = layers
65 81 .iter()
66 82 .any(|l| l.layer == "clamav" && l.verdict == LayerVerdict::Error);
@@ -385,7 +401,7 @@ async fn run_pipeline_and_decide(
385 401 // and this scan's clamav layer errored). See `resolve_pass_status`.
386 402 let is_trusted = db::users::is_upload_trusted(&ctx.db, job.user_id).await?;
387 403 let clamav_down = !ctx.clamav_healthy.load(Ordering::Relaxed);
388 - Ok(resolve_pass_status(is_trusted, clamav_down, &result.layers))
404 + Ok(resolve_pass_status(result.status, is_trusted, clamav_down, &result.layers))
389 405 }
390 406
391 407 /// Update the per-entity `scan_status` column for the kinds that have one.
@@ -430,19 +446,25 @@ mod tests {
430 446 LayerResult { layer: name, verdict, detail: None }
431 447 }
432 448
449 + // The first arg is the pipeline's own status (Clean unless a FailClosed
450 + // layer errored). The cases below exercise a Clean pipeline; the
451 + // *_not_downgraded tests cover a HeldForReview pipeline.
452 + const CLEAN: FileScanStatus = FileScanStatus::Clean;
453 +
433 454 #[test]
434 455 fn trusted_passes_when_clamav_healthy() {
435 456 let layers = [layer("clamav", LayerVerdict::Pass)];
436 - assert_eq!(resolve_pass_status(true, false, &layers), FileScanStatus::Clean);
457 + assert_eq!(resolve_pass_status(CLEAN, true, false, &layers), FileScanStatus::Clean);
437 458 }
438 459
439 460 #[test]
440 461 fn trusted_held_when_clamav_down_and_errored() {
441 462 // The vulnerability window: clamd died post-boot (probe says down) AND
442 - // this scan's clamav layer errored (FailOpen would otherwise skip it).
463 + // this scan's clamav (transport) layer errored (FailOpen would otherwise
464 + // skip it). final_status saw FailOpen → Clean; the overlay holds it.
443 465 let layers = [layer("clamav", LayerVerdict::Error)];
444 466 assert_eq!(
445 - resolve_pass_status(true, true, &layers),
467 + resolve_pass_status(CLEAN, true, true, &layers),
446 468 FileScanStatus::HeldForReview
447 469 );
448 470 }
@@ -452,7 +474,7 @@ mod tests {
452 474 // Probe flag stale/racing but the clamav layer actually passed — not a
453 475 // coverage gap, so don't hold.
454 476 let layers = [layer("clamav", LayerVerdict::Pass)];
455 - assert_eq!(resolve_pass_status(true, true, &layers), FileScanStatus::Clean);
477 + assert_eq!(resolve_pass_status(CLEAN, true, true, &layers), FileScanStatus::Clean);
456 478 }
457 479
458 480 #[test]
@@ -460,14 +482,39 @@ mod tests {
460 482 // Single transient clamav error but the probe still reports healthy —
461 483 // don't flood the review queue on a blip.
462 484 let layers = [layer("clamav", LayerVerdict::Error)];
463 - assert_eq!(resolve_pass_status(true, false, &layers), FileScanStatus::Clean);
485 + assert_eq!(resolve_pass_status(CLEAN, true, false, &layers), FileScanStatus::Clean);
464 486 }
465 487
466 488 #[test]
467 489 fn untrusted_always_held() {
468 490 let layers = [layer("clamav", LayerVerdict::Pass)];
469 491 assert_eq!(
470 - resolve_pass_status(false, false, &layers),
492 + resolve_pass_status(CLEAN, false, false, &layers),
493 + FileScanStatus::HeldForReview
494 + );
495 + }
496 +
497 + #[test]
498 + fn trusted_held_when_pipeline_already_held_not_downgraded() {
499 + // CHRONIC S1: a reachable-but-incomplete clamav scan (or any FailClosed
500 + // layer error) makes final_status = HeldForReview. A trusted uploader on
501 + // a HEALTHY clamd must NOT have that downgraded to Clean — the prior code
502 + // ignored the pipeline status and would have passed it.
503 + let layers = [layer("clamav_incomplete", LayerVerdict::Error)];
504 + assert_eq!(
505 + resolve_pass_status(FileScanStatus::HeldForReview, true, false, &layers),
506 + FileScanStatus::HeldForReview
507 + );
508 + }
509 +
510 + #[test]
511 + fn pipeline_hold_survives_even_with_no_clamav_layer() {
512 + // The no-downgrade guard is layer-agnostic: any FailClosed hold (e.g. an
513 + // oversize-to-spool file or a structural panic) is honored for trusted
514 + // uploaders too.
515 + let layers = [layer("scan_size_limit", LayerVerdict::Error)];
516 + assert_eq!(
517 + resolve_pass_status(FileScanStatus::HeldForReview, true, false, &layers),
471 518 FileScanStatus::HeldForReview
472 519 );
473 520 }