max / makenotwork
1 file changed,
+11 insertions,
-4 deletions
| @@ -156,10 +156,17 @@ async fn run_pipeline_and_decide( | |||
| 156 | 156 | job: &ScanJob, | |
| 157 | 157 | file_type: FileType, | |
| 158 | 158 | ) -> Result<FileScanStatus, Box<dyn std::error::Error + Send + Sync>> { | |
| 159 | - | // Both branches run S3 IO *outside* the scan_semaphore: the permit | |
| 160 | - | // bounds the CPU/clamd-heavy scan phase, not network IO. Holding it | |
| 161 | - | // across the GET serializes downloads at SCAN_MAX_CONCURRENT and lets | |
| 162 | - | // a scan backlog starve the DB pool. | |
| 159 | + | // Two paths, gated on file size. Small files go through the original | |
| 160 | + | // buffered `Pipeline::scan(Vec<u8>)`: a single S3 GET into a heap | |
| 161 | + | // buffer, then layers walk the slice. Big files (>= SCAN_MAX_MEMORY_BYTES) | |
| 162 | + | // stream from S3 into a tempfile under SCAN_SPOOL_DIR, then layers | |
| 163 | + | // run against the spooled path (mmap or streamed). The buffered path | |
| 164 | + | // stays alive: it's the hot path for tip-jar avatars / small audio / | |
| 165 | + | // download files, and avoiding the tempfile syscall + write matters at | |
| 166 | + | // that scale. Both branches run S3 IO *outside* the scan_semaphore: | |
| 167 | + | // the permit bounds the CPU/clamd-heavy scan phase, not network IO. | |
| 168 | + | // Holding it across the GET serializes downloads at SCAN_MAX_CONCURRENT | |
| 169 | + | // and lets a scan backlog starve the DB pool. | |
| 163 | 170 | let result: ScanResult = if (job.file_size_bytes as usize) < constants::SCAN_MAX_MEMORY_BYTES { | |
| 164 | 171 | let data = ctx.s3.download_object(&job.s3_key).await?; | |
| 165 | 172 | let _permit = ctx.scan_semaphore.acquire().await?; |