max / makenotwork

server: hourly retention sweep on scan_jobs

scan_jobs is append-only today: every upload-confirm enqueues a row and the worker stamps it 'done' or 'failed', then the row lives forever. Long-term that bloats claim_next and backups, and (mildly) weakens privacy posture by retaining s3_key/user_id for files long since deleted. Adds db::scan_jobs::purge_old_terminal — DELETE WHERE status IN ('done','failed') AND COALESCE(completed_at, started_at, enqueued_at) < cutoff. Verdicts (Clean/Quarantined/HeldForReview) live on the entity's scan_status column, not in scan_jobs, so dropping a 'done' row loses queue history only, never malware-detection state. Wired into the existing hourly tier in scheduler/mod.rs alongside the synckit warning sweep. Records to scheduler_jobs as 'scan_jobs_retention' so the count is visible in the admin health panel. SCAN_JOB_RETENTION_DAYS = 30 in constants.rs, guarded by a unit test asserting a floor of >= 7 days to block accidental same-day purges. No supporting index yet: a seq-scan is fine at current volume; revisit once EXPLAIN shows it as a bottleneck. Plan: _private/docs/mnw/server-docs/plans/scan-jobs-retention.md.

Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-27 14:35 UTC

Commit: c5cc1b2f98c4750c43d8a34f23ffb171617daf35

Parent: 5c2f7b2

4 files changed, +59 insertions, -0 deletions

M server/src/constants.rs +10

			@@ -146,6 +146,9 @@ pub const PAGINATION_WINDOW_SIZE: u32 = 5;
/// In-memory threshold for scans: 100 MB.
pub const SCAN_MAX_MEMORY_BYTES: usize = 100 * 1024 * 1024;
/// Max concurrent file scans (each can use up to 100 MB RAM).
pub const SCAN_MAX_CONCURRENT: usize = 4;
/// Number of background worker tasks draining the `scan_jobs` queue.
pub const SCAN_WORKER_COUNT: usize = 2;
/// How many days a terminal-state (`done`, `failed`) `scan_jobs` row is kept
/// before the retention sweep may delete it. Rows that are still queued or
/// running are operational queue state and are never affected.
pub const SCAN_JOB_RETENTION_DAYS: u32 = 30;
149	152
150	153		// -- Caddy on-demand TLS --
151	154		// Caps concurrent cache-miss DB lookups in `/api/domains/caddy-ask`. Cache hits
			@@ -451,6 +454,13 @@ mod tests {
451	454		}
452	455
#[test]
fn scan_job_retention_days_safe_floor() {
    // A retention window shorter than this floor risks an accidental
    // same-day purge that would race the worker stamping `completed_at`.
    const MIN_SAFE_RETENTION_DAYS: u32 = 7;
    assert!(MIN_SAFE_RETENTION_DAYS <= SCAN_JOB_RETENTION_DAYS);
}
	462	+
	463	+	#[test]
454	464		fn scan_zip_max_uncompressed_exceeds_memory_threshold() {
455	465		assert!(SCAN_ZIP_MAX_UNCOMPRESSED > SCAN_MAX_MEMORY_BYTES as u64);
456	466		}

M server/src/db/scan_jobs.rs +30

			@@ -208,6 +208,36 @@ pub async fn reap_stuck(db: &PgPool, max_age_secs: i64) -> Result<u64, sqlx::Err
208	208		Ok(affected)
209	209		}
210	210
	211	+	/// Delete terminal-state rows older than `older_than`. Returns the count.
	212	+	///
	213	+	/// Only touches `done`/`failed` rows — operational state (`queued`,
	214	+	/// `running`) is owned by the worker loop and `reap_stuck`. The verdict
	215	+	/// (Clean / Quarantined / HeldForReview) lives on the entity's
	216	+	/// `scan_status` column, not here, so dropping a `done` row loses queue
	217	+	/// history only, not malware-detection state.
	218	+	///
	219	+	/// No supporting index today: at soft-launch volume Postgres seq-scans the
	220	+	/// table fine. Revisit once `EXPLAIN ANALYZE` shows it as a bottleneck.
	221	+	#[tracing::instrument(skip_all)]
	222	+	pub async fn purge_old_terminal(
	223	+	db: &PgPool,
	224	+	older_than: chrono::Duration,
	225	+	) -> Result<u64, sqlx::Error> {
	226	+	let cutoff = chrono::Utc::now() - older_than;
	227	+	let n = sqlx::query(
	228	+	r#"
	229	+	DELETE FROM scan_jobs
	230	+	WHERE status IN ('done', 'failed')
	231	+	AND COALESCE(completed_at, started_at, enqueued_at) < $1
	232	+	"#,
	233	+	)
	234	+	.bind(cutoff)
	235	+	.execute(db)
	236	+	.await?
	237	+	.rows_affected();
	238	+	Ok(n)
	239	+	}
	240	+
211	241		/// Count of currently-queued jobs. Used by the admin dashboard health panel
212	242		/// (Phase 2 of the audit). Allowed dead code until that route lands.
213	243		#[allow(dead_code)]

M server/src/scheduler/cleanup.rs +18

			@@ -2,9 +2,27 @@
2	2		//! removal, IP scrubbing, stale pending transactions, orphaned uploads, cart
3	3		//! items, soft-deleted item purges, and pending S3 deletion retries.
4	4
	5	+	use crate::constants;
5	6		use crate::db;
6	7		use crate::AppState;
7	8
	9	+	/// Hourly: purge `scan_jobs` rows in a terminal state older than the
	10	+	/// retention window. Queued/running rows are not touched.
	11	+	pub(super) async fn purge_old_scan_jobs(state: &AppState) {
	12	+	let window = chrono::Duration::days(constants::SCAN_JOB_RETENTION_DAYS as i64);
	13	+	match db::scan_jobs::purge_old_terminal(&state.db, window).await {
	14	+	Ok(n) => {
	15	+	if n > 0 {
	16	+	tracing::info!(purged = n, "scan_jobs retention sweep");
	17	+	}
	18	+	let _ = db::scheduler_jobs::record_job_run(
	19	+	&state.db, "scan_jobs_retention", n as i64,
	20	+	).await;
	21	+	}
	22	+	Err(e) => tracing::error!(error = ?e, "scan_jobs retention sweep failed"),
	23	+	}
	24	+	}
	25	+
8	26		/// Delete expired sandbox accounts and their S3 objects.
9	27		pub(super) async fn cleanup_sandbox_accounts(state: &AppState) {
10	28		let expired_ids = match db::users::get_expired_sandbox_ids(&state.db).await {

M server/src/scheduler/mod.rs +1

			@@ -204,6 +204,7 @@ pub fn spawn_scheduler(
204	204		// the app owner. Cheap query — single JOIN on a small table.
205	205		if run_hourly {
206	206		synckit_warnings::check_and_send_warnings(&state).await;
	207	+	cleanup::purge_old_scan_jobs(&state).await;
207	208		}
208	209
209	210		// Weekly storage drift correction + integrity checks