max / makenotwork

Fail closed when ClamAV can't fully scan (CHRONIC S1) A reachable clamd that replies with an error (INSTREAM size-limit, scan limits, unparseable reply) was collapsed into one LayerVerdict::Error under the FailOpen `clamav` layer, so sizing a payload past clamd's StreamMaxLength dropped the ClamAV layer → Clean on a healthy daemon (open across runs #18-#21). Structural fix: parse_clamav_response now returns a typed ClamavResponse (Clean | Infected | NotFullyScanned), mapped at one place. NotFullyScanned is emitted under a dedicated `clamav_incomplete` layer whose error policy is FailClosed — distinct from the FailOpen `clamav` layer kept for an unreachable daemon/timeout (the transport-error path). A future reply kind can't silently fall into the FailOpen bucket. Second half of the fix: resolve_pass_status no longer downgrades a pipeline HeldForReview to Clean for trusted uploaders. The ErrorPolicy::FailClosed contract is "hold for admin review" unconditionally; the old code ignored result.status and would have turned the clamav_incomplete hold (and every other FailClosed layer error, incl. oversize-to-spool) back into Clean for trusted users. Tests: typed-parse + clamav_incomplete-routing, final_status holds on clamav_incomplete, resolve_pass_status no-downgrade for trusted users. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Author: Max Johnson <me@maxj.phd> · 2026-06-15 23:10 UTC

Commit: aa2b1fe4ad82fc2e5b0e8879da17ed38d62ad0d4

Parent: deae4c6

3 files changed, +176 insertions, -54 deletions

M server/src/scanning/clamav.rs +96 -38

			@@ -153,7 +153,7 @@ async fn scan_instream(socket_path: &str, data: &[u8]) -> Result<LayerResult, St
153	153		let response_str = String::from_utf8_lossy(&response);
154	154		let response_str = response_str.trim_end_matches('\0').trim();
155	155
156		-	Ok(parse_clamav_response(response_str))
	156	+	Ok(clamav_layer_result(response_str))
157	157		}
158	158
159	159		async fn scan_instream_streaming<R>(
			@@ -221,23 +221,41 @@ where
221	221		let response_str = String::from_utf8_lossy(&response);
222	222		let response_str = response_str.trim_end_matches('\0').trim();
223	223
224		-	Ok(parse_clamav_response(response_str))
	224	+	Ok(clamav_layer_result(response_str))
225	225		}
226	226
227		-	/// Parse a ClamAV INSTREAM response string into a LayerResult.
	227	+	/// Classified outcome of a clamd INSTREAM reply we successfully received.
	228	+	///
	229	+	/// The distinction this enum forces is the whole point of CHRONIC S1's fix:
	230	+	/// transport failures (unreachable daemon / timeout) are handled in the scan
	231	+	/// entry points and are legitimately FailOpen — clamd being down must not block
	232	+	/// every upload. But by the time we are parsing a reply, clamd is reachable.
	233	+	/// An error reply at that point (size/scan-size/recursion limit, or anything we
	234	+	/// can't parse) means the file was not cleanly scanned — a coverage gap that
	235	+	/// must fail CLOSED, never be conflated into the FailOpen unreachable bucket.
	236	+	/// Because this is an exhaustive enum mapped at one place
	237	+	/// ([`clamav_layer_result`]), a future reply kind can't silently fall into the
	238	+	/// FailOpen path the way the old single `LayerVerdict::Error` return did.
	239	+	#[derive(Debug, PartialEq, Eq)]
	240	+	enum ClamavResponse {
	241	+	Clean,
	242	+	Infected(String),
	243	+	/// clamd responded but produced no clean verdict — a limit was hit or the
	244	+	/// reply was unparseable. The file was not fully scanned ⇒ fail closed.
	245	+	NotFullyScanned(String),
	246	+	}
	247	+
	248	+	/// Parse a clamd INSTREAM response string into a [`ClamavResponse`].
228	249		/// Extracted for testability — the socket/IO layer just feeds the string in.
229	250		///
230	251		/// ClamAV response format:
231	252		/// - `"stream: OK"` — clean
232	253		/// - `"stream: <virus_name> FOUND"` — infected
233		-	/// - `"stream: <error> ERROR"` — scan error
234		-	fn parse_clamav_response(response: &str) -> LayerResult {
	254	+	/// - `"stream: <error> ERROR"` (e.g. `"INSTREAM size limit exceeded"`),
	255	+	/// empty, or anything else — clamd is reachable but did not fully scan.
	256	+	fn parse_clamav_response(response: &str) -> ClamavResponse {
235	257		if response == "stream: OK" {
236		-	LayerResult {
237		-	layer: "clamav",
238		-	verdict: LayerVerdict::Pass,
239		-	detail: None,
240		-	}
	258	+	ClamavResponse::Clean
241	259		} else if response.ends_with("FOUND") {
242	260		// Extract virus name: "stream: Eicar-Signature FOUND" → "Eicar-Signature"
243	261		let virus_name = response
			@@ -245,18 +263,36 @@ fn parse_clamav_response(response: &str) -> LayerResult {
245	263		.unwrap_or(response)
246	264		.strip_suffix(" FOUND")
247	265		.unwrap_or(response);
	266	+	ClamavResponse::Infected(virus_name.to_string())
	267	+	} else {
	268	+	ClamavResponse::NotFullyScanned(response.to_string())
	269	+	}
	270	+	}
248	271
249		-	LayerResult {
	272	+	/// Map a received clamd reply to a [`LayerResult`]. A `NotFullyScanned`
	273	+	/// outcome is emitted under the dedicated `clamav_incomplete` layer, whose
	274	+	/// `error_policy_for` entry is FailClosed, so an un-scanned file is held for
	275	+	/// review. A genuinely unreachable daemon / timeout is reported separately in
	276	+	/// the scan entry points under the `clamav` layer (FailOpen). Routing the two
	277	+	/// error kinds to different layer identities is the same mechanism the
	278	+	/// `scan_panic` / `scan_size_limit` pseudo-layers already use to pick a policy.
	279	+	fn clamav_layer_result(response: &str) -> LayerResult {
	280	+	match parse_clamav_response(response) {
	281	+	ClamavResponse::Clean => LayerResult {
250	282		layer: "clamav",
251		-	verdict: LayerVerdict::Fail,
252		-	detail: Some(format!("ClamAV detection: {}", virus_name)),
253		-	}
254		-	} else {
255		-	LayerResult {
	283	+	verdict: LayerVerdict::Pass,
	284	+	detail: None,
	285	+	},
	286	+	ClamavResponse::Infected(name) => LayerResult {
256	287		layer: "clamav",
	288	+	verdict: LayerVerdict::Fail,
	289	+	detail: Some(format!("ClamAV detection: {name}")),
	290	+	},
	291	+	ClamavResponse::NotFullyScanned(resp) => LayerResult {
	292	+	layer: "clamav_incomplete",
257	293		verdict: LayerVerdict::Error,
258		-	detail: Some(format!("Unexpected ClamAV response: {}", response)),
259		-	}
	294	+	detail: Some(format!("ClamAV did not fully scan (held for review): {resp}")),
	295	+	},
260	296		}
261	297		}
262	298
			@@ -268,8 +304,10 @@ mod tests {
268	304
269	305		#[test]
270	306		fn clean_response_passes() {
271		-	let result = parse_clamav_response("stream: OK");
	307	+	assert_eq!(parse_clamav_response("stream: OK"), ClamavResponse::Clean);
	308	+	let result = clamav_layer_result("stream: OK");
272	309		assert_eq!(result.verdict, LayerVerdict::Pass);
	310	+	assert_eq!(result.layer, "clamav");
273	311		assert!(result.detail.is_none());
274	312		}
275	313
			@@ -277,47 +315,67 @@ mod tests {
277	315
278	316		#[test]
279	317		fn eicar_detection_fails() {
280		-	let result = parse_clamav_response("stream: Eicar-Signature FOUND");
	318	+	assert_eq!(
	319	+	parse_clamav_response("stream: Eicar-Signature FOUND"),
	320	+	ClamavResponse::Infected("Eicar-Signature".to_string())
	321	+	);
	322	+	let result = clamav_layer_result("stream: Eicar-Signature FOUND");
281	323		assert_eq!(result.verdict, LayerVerdict::Fail);
282		-	let detail = result.detail.unwrap();
283		-	assert!(detail.contains("Eicar-Signature"));
	324	+	assert_eq!(result.layer, "clamav");
	325	+	assert!(result.detail.unwrap().contains("Eicar-Signature"));
284	326		}
285	327
286	328		#[test]
287	329		fn complex_virus_name_extracted() {
288		-	let result = parse_clamav_response("stream: Win.Test.EICAR_HDB-1 FOUND");
	330	+	let result = clamav_layer_result("stream: Win.Test.EICAR_HDB-1 FOUND");
289	331		assert_eq!(result.verdict, LayerVerdict::Fail);
290		-	let detail = result.detail.unwrap();
291		-	assert!(detail.contains("Win.Test.EICAR_HDB-1"));
	332	+	assert!(result.detail.unwrap().contains("Win.Test.EICAR_HDB-1"));
292	333		}
293	334
294	335		#[test]
295	336		fn trojan_detection_fails() {
296		-	let result = parse_clamav_response("stream: Win.Trojan.Agent-123456 FOUND");
	337	+	let result = clamav_layer_result("stream: Win.Trojan.Agent-123456 FOUND");
297	338		assert_eq!(result.verdict, LayerVerdict::Fail);
298		-	let detail = result.detail.unwrap();
299		-	assert!(detail.contains("Win.Trojan.Agent-123456"));
	339	+	assert!(result.detail.unwrap().contains("Win.Trojan.Agent-123456"));
300	340		}
301	341
302		-	// -- Error responses --
	342	+	// -- Not-fully-scanned responses (clamd reachable, no clean verdict) --
	343	+	//
	344	+	// CHRONIC S1: every one of these must FAIL CLOSED, not FailOpen. They are
	345	+	// emitted under the `clamav_incomplete` layer (FailClosed in
	346	+	// `error_policy_for`), distinct from the FailOpen `clamav` layer used for an
	347	+	// unreachable daemon.
303	348
304	349		#[test]
305		-	fn empty_response_is_error() {
306		-	let result = parse_clamav_response("");
307		-	assert_eq!(result.verdict, LayerVerdict::Error);
	350	+	fn empty_response_is_not_fully_scanned() {
	351	+	assert_eq!(
	352	+	parse_clamav_response(""),
	353	+	ClamavResponse::NotFullyScanned(String::new())
	354	+	);
	355	+	assert_eq!(clamav_layer_result("").layer, "clamav_incomplete");
	356	+	assert_eq!(clamav_layer_result("").verdict, LayerVerdict::Error);
308	357		}
309	358
310	359		#[test]
311		-	fn garbage_response_is_error() {
312		-	let result = parse_clamav_response("this is not a valid response");
	360	+	fn garbage_response_is_not_fully_scanned() {
	361	+	let result = clamav_layer_result("this is not a valid response");
	362	+	assert_eq!(result.layer, "clamav_incomplete");
313	363		assert_eq!(result.verdict, LayerVerdict::Error);
314		-	assert!(result.detail.unwrap().contains("Unexpected ClamAV response"));
315	364		}
316	365
317	366		#[test]
318		-	fn error_keyword_in_response_is_error() {
319		-	// ClamAV may respond with "stream: <message> ERROR" for scan errors
320		-	let result = parse_clamav_response("stream: INSTREAM size limit exceeded ERROR");
	367	+	fn size_limit_error_fails_closed_not_open() {
	368	+	// The regression that defined CHRONIC S1: a payload sized past clamd's
	369	+	// StreamMaxLength yields this reply on a HEALTHY clamd. It must route to
	370	+	// the FailClosed `clamav_incomplete` layer, never the FailOpen `clamav`
	371	+	// layer (which would skip → Clean).
	372	+	let response = "stream: INSTREAM size limit exceeded ERROR";
	373	+	assert!(matches!(
	374	+	parse_clamav_response(response),
	375	+	ClamavResponse::NotFullyScanned(_)
	376	+	));
	377	+	let result = clamav_layer_result(response);
	378	+	assert_eq!(result.layer, "clamav_incomplete");
321	379		assert_eq!(result.verdict, LayerVerdict::Error);
322	380		}
323	381		}

M server/src/scanning/mod.rs +17

			@@ -79,6 +79,12 @@ fn error_policy_for(layer: &str) -> ErrorPolicy {
79	79		"archive" => archive::ERROR_POLICY,
80	80		"yara" => yara::ERROR_POLICY,
81	81		"clamav" => clamav::ERROR_POLICY,
	82	+	// A reachable clamd that couldn't fully scan (size/scan limit,
	83	+	// unparseable reply) is a coverage gap, not an outage — fail closed,
	84	+	// distinct from the FailOpen `clamav` layer used for an unreachable
	85	+	// daemon. Splitting the policy by layer identity is what keeps CHRONIC
	86	+	// S1 (size-limit → FailOpen → Clean) from recurring.
	87	+	"clamav_incomplete" => ErrorPolicy::FailClosed,
82	88		"malwarebazaar" => hash_lookup::ERROR_POLICY,
83	89		"urlhaus" => urlhaus::ERROR_POLICY,
84	90		"signing_macos" => signing_macos::ERROR_POLICY,
			@@ -809,6 +815,17 @@ mod tests {
809	815		}
810	816
811	817		#[test]
	818	+	fn clamav_incomplete_is_fail_closed_and_holds() {
	819	+	// CHRONIC S1: a reachable-but-incomplete clamav scan is emitted under the
	820	+	// `clamav_incomplete` layer, which must be FailClosed → HeldForReview,
	821	+	// distinct from the FailOpen `clamav` layer used for an unreachable daemon.
	822	+	assert_eq!(error_policy_for("clamav_incomplete"), ErrorPolicy::FailClosed);
	823	+	assert_eq!(error_policy_for("clamav"), ErrorPolicy::FailOpen);
	824	+	let layers = vec![pass("content_type"), pass("structural"), pass("archive"), skip("yara"), err("clamav_incomplete")];
	825	+	assert_eq!(final_status(&layers), FileScanStatus::HeldForReview);
	826	+	}
	827	+
	828	+	#[test]
812	829		fn final_status_held_on_unknown_layer_error() {
813	830		// Defensive default: an unknown layer name that errors falls through
814	831		// to FailClosed. This is what catches a new layer added without

M server/src/scanning/worker.rs +63 -16

			@@ -52,15 +52,31 @@ pub struct WorkerContext {
52	52
53	53		/// Decide the final status for a non-quarantined scan.
54	54		///
55		-	/// A trusted uploader normally passes (`Clean`); an untrusted one always routes
56		-	/// to admin review. The ClamAV degraded-mode overlay adds one exception: if the
57		-	/// runtime probe has found clamd DOWN and this scan's clamav layer errored
58		-	/// (FailOpen would otherwise skip it), even a trusted upload is held for review
59		-	/// rather than passed on zero real AV coverage. A single transient scan error
60		-	/// while the probe still reports clamd healthy does NOT trip this — only a
61		-	/// sustained outage the probe has observed — so transient blips don't flood the
62		-	/// review queue.
63		-	fn resolve_pass_status(is_trusted: bool, clamav_down: bool, layers: &[LayerResult]) -> FileScanStatus {
	55	+	/// First, the pipeline's own status is authoritative when it already says
	56	+	/// `HeldForReview`: that means a `FailClosed` layer errored (a structural/YARA
	57	+	/// panic, an oversize-to-spool file, or a reachable-but-incomplete clamav scan),
	58	+	/// i.e. the file was not fully scanned. `ErrorPolicy::FailClosed` means "hold
	59	+	/// for admin review" unconditionally — including for trusted uploaders — so we
	60	+	/// never downgrade it here. (CHRONIC S1's fix for trusted uploaders depends on
	61	+	/// this: the `clamav_incomplete` hold would otherwise be silently turned back
	62	+	/// into `Clean` below.)
	63	+	///
	64	+	/// Otherwise the pipeline returned `Clean`: a trusted uploader normally passes;
	65	+	/// an untrusted one always routes to admin review. The ClamAV degraded-mode
	66	+	/// overlay adds one exception — if the runtime probe has found clamd DOWN and
	67	+	/// this scan's clamav (transport) layer errored (FailOpen made `final_status`
	68	+	/// skip it → Clean), even a trusted upload is held rather than passed on zero AV
	69	+	/// coverage. A single transient error while the probe still reports clamd
	70	+	/// healthy does NOT trip this — only a sustained outage the probe has observed.
	71	+	fn resolve_pass_status(
	72	+	pipeline_status: FileScanStatus,
	73	+	is_trusted: bool,
	74	+	clamav_down: bool,
	75	+	layers: &[LayerResult],
	76	+	) -> FileScanStatus {
	77	+	if pipeline_status == FileScanStatus::HeldForReview {
	78	+	return FileScanStatus::HeldForReview;
	79	+	}
64	80		let clamav_errored = layers
65	81		.iter()
66	82		.any(\|l\| l.layer == "clamav" && l.verdict == LayerVerdict::Error);
			@@ -385,7 +401,7 @@ async fn run_pipeline_and_decide(
385	401		// and this scan's clamav layer errored). See `resolve_pass_status`.
386	402		let is_trusted = db::users::is_upload_trusted(&ctx.db, job.user_id).await?;
387	403		let clamav_down = !ctx.clamav_healthy.load(Ordering::Relaxed);
388		-	Ok(resolve_pass_status(is_trusted, clamav_down, &result.layers))
	404	+	Ok(resolve_pass_status(result.status, is_trusted, clamav_down, &result.layers))
389	405		}
390	406
391	407		/// Update the per-entity `scan_status` column for the kinds that have one.
			@@ -430,19 +446,25 @@ mod tests {
430	446		LayerResult { layer: name, verdict, detail: None }
431	447		}
432	448
	449	+	// The first arg is the pipeline's own status (Clean unless a FailClosed
	450	+	// layer errored). The cases below exercise a Clean pipeline; the
	451	+	// *_not_downgraded tests cover a HeldForReview pipeline.
	452	+	const CLEAN: FileScanStatus = FileScanStatus::Clean;
	453	+
433	454		#[test]
434	455		fn trusted_passes_when_clamav_healthy() {
435	456		let layers = [layer("clamav", LayerVerdict::Pass)];
436		-	assert_eq!(resolve_pass_status(true, false, &layers), FileScanStatus::Clean);
	457	+	assert_eq!(resolve_pass_status(CLEAN, true, false, &layers), FileScanStatus::Clean);
437	458		}
438	459
439	460		#[test]
440	461		fn trusted_held_when_clamav_down_and_errored() {
441	462		// The vulnerability window: clamd died post-boot (probe says down) AND
442		-	// this scan's clamav layer errored (FailOpen would otherwise skip it).
	463	+	// this scan's clamav (transport) layer errored (FailOpen would otherwise
	464	+	// skip it). final_status saw FailOpen → Clean; the overlay holds it.
443	465		let layers = [layer("clamav", LayerVerdict::Error)];
444	466		assert_eq!(
445		-	resolve_pass_status(true, true, &layers),
	467	+	resolve_pass_status(CLEAN, true, true, &layers),
446	468		FileScanStatus::HeldForReview
447	469		);
448	470		}
			@@ -452,7 +474,7 @@ mod tests {
452	474		// Probe flag stale/racing but the clamav layer actually passed — not a
453	475		// coverage gap, so don't hold.
454	476		let layers = [layer("clamav", LayerVerdict::Pass)];
455		-	assert_eq!(resolve_pass_status(true, true, &layers), FileScanStatus::Clean);
	477	+	assert_eq!(resolve_pass_status(CLEAN, true, true, &layers), FileScanStatus::Clean);
456	478		}
457	479
458	480		#[test]
			@@ -460,14 +482,39 @@ mod tests {
460	482		// Single transient clamav error but the probe still reports healthy —
461	483		// don't flood the review queue on a blip.
462	484		let layers = [layer("clamav", LayerVerdict::Error)];
463		-	assert_eq!(resolve_pass_status(true, false, &layers), FileScanStatus::Clean);
	485	+	assert_eq!(resolve_pass_status(CLEAN, true, false, &layers), FileScanStatus::Clean);
464	486		}
465	487
466	488		#[test]
467	489		fn untrusted_always_held() {
468	490		let layers = [layer("clamav", LayerVerdict::Pass)];
469	491		assert_eq!(
470		-	resolve_pass_status(false, false, &layers),
	492	+	resolve_pass_status(CLEAN, false, false, &layers),
	493	+	FileScanStatus::HeldForReview
	494	+	);
	495	+	}
	496	+
	497	+	#[test]
	498	+	fn trusted_held_when_pipeline_already_held_not_downgraded() {
	499	+	// CHRONIC S1: a reachable-but-incomplete clamav scan (or any FailClosed
	500	+	// layer error) makes final_status = HeldForReview. A trusted uploader on
	501	+	// a HEALTHY clamd must NOT have that downgraded to Clean — the prior code
	502	+	// ignored the pipeline status and would have passed it.
	503	+	let layers = [layer("clamav_incomplete", LayerVerdict::Error)];
	504	+	assert_eq!(
	505	+	resolve_pass_status(FileScanStatus::HeldForReview, true, false, &layers),
	506	+	FileScanStatus::HeldForReview
	507	+	);
	508	+	}
	509	+
	510	+	#[test]
	511	+	fn pipeline_hold_survives_even_with_no_clamav_layer() {
	512	+	// The no-downgrade guard is layer-agnostic: any FailClosed hold (e.g. an
	513	+	// oversize-to-spool file or a structural panic) is honored for trusted
	514	+	// uploaders too.
	515	+	let layers = [layer("scan_size_limit", LayerVerdict::Error)];
	516	+	assert_eq!(
	517	+	resolve_pass_status(FileScanStatus::HeldForReview, true, false, &layers),
471	518		FileScanStatus::HeldForReview
472	519		);
473	520		}