Skip to main content

max / makenotwork

server: hourly retention sweep on scan_jobs — scan_jobs is append-only today: every upload-confirm enqueues a row and the worker stamps it 'done' or 'failed', then the row lives forever. Long term, that bloats claim_next and backups, and (mildly) hurts privacy posture by keeping s3_key/user_id around for files long since deleted. Adds db::scan_jobs::purge_old_terminal — DELETE WHERE status IN ('done','failed') AND COALESCE(completed_at, started_at, enqueued_at) < cutoff. Verdicts (Clean/Quarantined/HeldForReview) live on the entity's scan_status column, not in scan_jobs, so dropping a 'done' row loses queue history only, never malware-detection state. Wired into the existing hourly tier in scheduler/mod.rs alongside the synckit warning sweep. Records each run to scheduler_jobs as 'scan_jobs_retention' so the purged count is visible in the admin health panel. SCAN_JOB_RETENTION_DAYS = 30 in constants.rs, guarded by a >= 7 floor assertion to block accidental same-day purges. No supporting index yet: a seq-scan is fine at current volume; revisit if EXPLAIN shows it becoming a bottleneck. Plan: _private/docs/mnw/server-docs/plans/scan-jobs-retention.md.
Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-27 14:35 UTC
Commit: c5cc1b2f98c4750c43d8a34f23ffb171617daf35
Parent: 5c2f7b2
4 files changed, +59 insertions, -0 deletions
@@ -146,6 +146,9 @@ pub const PAGINATION_WINDOW_SIZE: u32 = 5;
146 146 pub const SCAN_MAX_MEMORY_BYTES: usize = 100 * 1024 * 1024; // 100 MB in-memory threshold
147 147 pub const SCAN_MAX_CONCURRENT: usize = 4; // Max concurrent file scans (each can use up to 100 MB RAM)
148 148 pub const SCAN_WORKER_COUNT: usize = 2; // Background worker tasks draining scan_jobs queue
149 + /// Retention window, in days, for terminal-state (`done`, `failed`) `scan_jobs` rows.
150 + /// Queued/running rows are operational queue state and are not affected.
151 + pub const SCAN_JOB_RETENTION_DAYS: u32 = 30;
149 152
150 153 // -- Caddy on-demand TLS --
151 154 // Caps concurrent cache-miss DB lookups in `/api/domains/caddy-ask`. Cache hits
@@ -451,6 +454,13 @@ mod tests {
451 454 }
452 455
#[test]
fn scan_job_retention_days_safe_floor() {
    // Guards against an accidental same-day purge that would race the
    // worker stamping completed_at. The message spells out the rule so that
    // whoever trips this while editing the constant sees why the floor exists.
    assert!(
        SCAN_JOB_RETENTION_DAYS >= 7,
        "SCAN_JOB_RETENTION_DAYS is {SCAN_JOB_RETENTION_DAYS}; it must stay >= 7 so the \
         retention sweep cannot purge rows the scan worker has only just finished"
    );
}
462 +
#[test]
fn scan_zip_max_uncompressed_exceeds_memory_threshold() {
    // The uncompressed-size cap for ZIPs must be strictly larger than the
    // in-memory scan threshold; compare both as u64.
    let memory_threshold = SCAN_MAX_MEMORY_BYTES as u64;
    assert!(SCAN_ZIP_MAX_UNCOMPRESSED > memory_threshold);
}
@@ -208,6 +208,36 @@ pub async fn reap_stuck(db: &PgPool, max_age_secs: i64) -> Result<u64, sqlx::Err
208 208 Ok(affected)
209 209 }
210 210
211 + /// Delete terminal-state rows older than `older_than`. Returns the count.
212 + ///
213 + /// Only touches `done`/`failed` rows — operational state (`queued`,
214 + /// `running`) is owned by the worker loop and `reap_stuck`. The verdict
215 + /// (Clean / Quarantined / HeldForReview) lives on the entity's
216 + /// `scan_status` column, not here, so dropping a `done` row loses queue
217 + /// history only, not malware-detection state.
218 + ///
219 + /// No supporting index today: at soft-launch volume Postgres seq-scans the
220 + /// table fine. Revisit once `EXPLAIN ANALYZE` shows it as a bottleneck.
221 + #[tracing::instrument(skip_all)]
222 + pub async fn purge_old_terminal(
223 + db: &PgPool,
224 + older_than: chrono::Duration,
225 + ) -> Result<u64, sqlx::Error> {
226 + let cutoff = chrono::Utc::now() - older_than;
227 + let n = sqlx::query(
228 + r#"
229 + DELETE FROM scan_jobs
230 + WHERE status IN ('done', 'failed')
231 + AND COALESCE(completed_at, started_at, enqueued_at) < $1
232 + "#,
233 + )
234 + .bind(cutoff)
235 + .execute(db)
236 + .await?
237 + .rows_affected();
238 + Ok(n)
239 + }
240 +
211 241 /// Count of currently-queued jobs. Used by the admin dashboard health panel
212 242 /// (Phase 2 of the audit). Allowed dead code until that route lands.
213 243 #[allow(dead_code)]
@@ -2,9 +2,27 @@
2 2 //! removal, IP scrubbing, stale pending transactions, orphaned uploads, cart
3 3 //! items, soft-deleted item purges, and pending S3 deletion retries.
4 4
5 + use crate::constants;
5 6 use crate::db;
6 7 use crate::AppState;
7 8
9 + /// Hourly: purge `scan_jobs` rows in a terminal state older than the
10 + /// retention window. Queued/running rows are not touched.
11 + pub(super) async fn purge_old_scan_jobs(state: &AppState) {
12 + let window = chrono::Duration::days(constants::SCAN_JOB_RETENTION_DAYS as i64);
13 + match db::scan_jobs::purge_old_terminal(&state.db, window).await {
14 + Ok(n) => {
15 + if n > 0 {
16 + tracing::info!(purged = n, "scan_jobs retention sweep");
17 + }
18 + let _ = db::scheduler_jobs::record_job_run(
19 + &state.db, "scan_jobs_retention", n as i64,
20 + ).await;
21 + }
22 + Err(e) => tracing::error!(error = ?e, "scan_jobs retention sweep failed"),
23 + }
24 + }
25 +
8 26 /// Delete expired sandbox accounts and their S3 objects.
9 27 pub(super) async fn cleanup_sandbox_accounts(state: &AppState) {
10 28 let expired_ids = match db::users::get_expired_sandbox_ids(&state.db).await {
@@ -204,6 +204,7 @@ pub fn spawn_scheduler(
204 204 // the app owner. Cheap query — single JOIN on a small table.
205 205 if run_hourly {
206 206 synckit_warnings::check_and_send_warnings(&state).await;
207 + cleanup::purge_old_scan_jobs(&state).await;
207 208 }
208 209
209 210 // Weekly storage drift correction + integrity checks