max / makenotwork

server: document the buffered/streaming dual-path in the scan worker

Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-27 19:30 UTC

Commit: 4b7b5163ed637189eb31ad8a76f83a580e250857

Parent: 04cf1e6

1 file changed, +11 insertions, -4 deletions

M server/src/scanning/worker.rs +11 -4

			@@ -156,10 +156,17 @@ async fn run_pipeline_and_decide(
156	156		job: &ScanJob,
157	157		file_type: FileType,
158	158		) -> Result<FileScanStatus, Box<dyn std::error::Error + Send + Sync>> {
159		-	// Both branches run S3 IO outside the scan_semaphore: the permit
160		-	// bounds the CPU/clamd-heavy scan phase, not network IO. Holding it
161		-	// across the GET serializes downloads at SCAN_MAX_CONCURRENT and lets
162		-	// a scan backlog starve the DB pool.
	159	+	// Two paths, gated on file size. Small files go through the original
	160	+	// buffered `Pipeline::scan(Vec<u8>)`: a single S3 GET into a heap
	161	+	// buffer, then layers walk the slice. Big files (>= SCAN_MAX_MEMORY_BYTES)
	162	+	// stream from S3 into a tempfile under SCAN_SPOOL_DIR, then layers
	163	+	// run against the spooled path (mmap or streamed). The buffered path
	164	+	// stays alive: it's the hot path for tip-jar avatars / small audio /
	165	+	// download files, and avoiding the tempfile syscall + write matters at
	166	+	// that scale. Both branches run S3 IO outside the scan_semaphore:
	167	+	// the permit bounds the CPU/clamd-heavy scan phase, not network IO.
	168	+	// Holding it across the GET would serialize downloads at SCAN_MAX_CONCURRENT
	169	+	// and let a scan backlog starve the DB pool.
163	170		let result: ScanResult = if (job.file_size_bytes as usize) < constants::SCAN_MAX_MEMORY_BYTES {
164	171		let data = ctx.s3.download_object(&job.s3_key).await?;
165	172		let _permit = ctx.scan_semaphore.acquire().await?;