server: hourly retention sweep on scan_jobs
scan_jobs is append-only today: every upload-confirm enqueues a row and
the worker stamps it 'done' or 'failed', then the row lives forever.
Long-term that bloats claim_next and backups, and (mildly) weakens our
privacy posture, since s3_key/user_id linger for files long since deleted.
Adds db::scan_jobs::purge_old_terminal — DELETE WHERE status IN
('done','failed') AND COALESCE(completed_at, started_at, enqueued_at) <
cutoff. Verdicts (Clean/Quarantined/HeldForReview) live on the entity's
scan_status column, not in scan_jobs, so dropping a 'done' row loses
queue history only, never malware-detection state.
Wired into the existing hourly tier in scheduler/mod.rs alongside the
synckit warning sweep. Records to scheduler_jobs as 'scan_jobs_retention'
so the count is visible in the admin health panel.
SCAN_JOB_RETENTION_DAYS = 30 in constants.rs, guarded by a >= 7 floor
assertion to block accidental same-day purges. No supporting index yet:
seq-scan is fine at current volume, revisit on EXPLAIN.
Plan: _private/docs/mnw/server-docs/plans/scan-jobs-retention.md.
4 files changed,
+59 insertions,
-0 deletions
| 146 |
146 |
|
/// In-memory scan threshold: 100 MB.
pub const SCAN_MAX_MEMORY_BYTES: usize = 100 * (1 << 20);
/// Max concurrent file scans (each can use up to 100 MB RAM).
pub const SCAN_MAX_CONCURRENT: usize = 4;
/// Number of background worker tasks draining the `scan_jobs` queue.
pub const SCAN_WORKER_COUNT: usize = 2;
|
|
149 |
+ |
/// Retention window, in days, for terminal-state (`done`, `failed`)
/// `scan_jobs` rows. Queued/running rows are operational queue state and are
/// not affected.
///
/// Must stay at or above 7: a very short window risks purging rows while the
/// worker is still stamping `completed_at`.
pub const SCAN_JOB_RETENTION_DAYS: u32 = 30;

// Enforce the floor at compile time so an unsafe value cannot ship from a
// build that skipped the test suite.
const _: () = assert!(
    SCAN_JOB_RETENTION_DAYS >= 7,
    "SCAN_JOB_RETENTION_DAYS must be at least 7 days"
);
|
| 149 |
152 |
|
|
| 150 |
153 |
|
// -- Caddy on-demand TLS --
|
| 151 |
154 |
|
// Caps concurrent cache-miss DB lookups in `/api/domains/caddy-ask`. Cache hits
|
| 451 |
454 |
|
}
|
| 452 |
455 |
|
|
| 453 |
456 |
|
#[test]
fn scan_job_retention_days_safe_floor() {
    // An accidental same-day purge would race the worker stamping
    // `completed_at`, so the retention window must never drop below this floor.
    let floor_days = 7;
    assert!(floor_days <= SCAN_JOB_RETENTION_DAYS);
}
|
|
462 |
+ |
|
|
463 |
+ |
#[test]
fn scan_zip_max_uncompressed_exceeds_memory_threshold() {
    // The zip uncompressed-size cap must be strictly larger than the
    // in-memory scan threshold.
    let memory_threshold = SCAN_MAX_MEMORY_BYTES as u64;
    assert!(memory_threshold < SCAN_ZIP_MAX_UNCOMPRESSED);
}
|
| 208 |
208 |
|
Ok(affected)
|
| 209 |
209 |
|
}
|
| 210 |
210 |
|
|
|
211 |
+ |
/// Delete terminal-state rows older than `older_than`. Returns the count.
|
|
212 |
+ |
///
|
|
213 |
+ |
/// Only touches `done`/`failed` rows — operational state (`queued`,
|
|
214 |
+ |
/// `running`) is owned by the worker loop and `reap_stuck`. The verdict
|
|
215 |
+ |
/// (Clean / Quarantined / HeldForReview) lives on the entity's
|
|
216 |
+ |
/// `scan_status` column, not here, so dropping a `done` row loses queue
|
|
217 |
+ |
/// history only, not malware-detection state.
|
|
218 |
+ |
///
|
|
219 |
+ |
/// No supporting index today: at soft-launch volume Postgres seq-scans the
|
|
220 |
+ |
/// table fine. Revisit once `EXPLAIN ANALYZE` shows it as a bottleneck.
|
|
221 |
+ |
#[tracing::instrument(skip_all)]
|
|
222 |
+ |
pub async fn purge_old_terminal(
|
|
223 |
+ |
db: &PgPool,
|
|
224 |
+ |
older_than: chrono::Duration,
|
|
225 |
+ |
) -> Result<u64, sqlx::Error> {
|
|
226 |
+ |
let cutoff = chrono::Utc::now() - older_than;
|
|
227 |
+ |
let n = sqlx::query(
|
|
228 |
+ |
r#"
|
|
229 |
+ |
DELETE FROM scan_jobs
|
|
230 |
+ |
WHERE status IN ('done', 'failed')
|
|
231 |
+ |
AND COALESCE(completed_at, started_at, enqueued_at) < $1
|
|
232 |
+ |
"#,
|
|
233 |
+ |
)
|
|
234 |
+ |
.bind(cutoff)
|
|
235 |
+ |
.execute(db)
|
|
236 |
+ |
.await?
|
|
237 |
+ |
.rows_affected();
|
|
238 |
+ |
Ok(n)
|
|
239 |
+ |
}
|
|
240 |
+ |
|
| 211 |
241 |
|
/// Count of currently-queued jobs. Used by the admin dashboard health panel
|
| 212 |
242 |
|
/// (Phase 2 of the audit). Allowed dead code until that route lands.
|
| 213 |
243 |
|
#[allow(dead_code)]
|
| 2 |
2 |
|
//! removal, IP scrubbing, stale pending transactions, orphaned uploads, cart
|
| 3 |
3 |
|
//! items, soft-deleted item purges, and pending S3 deletion retries.
|
| 4 |
4 |
|
|
|
5 |
+ |
use crate::constants;
|
| 5 |
6 |
|
use crate::db;
|
| 6 |
7 |
|
use crate::AppState;
|
| 7 |
8 |
|
|
|
9 |
+ |
/// Hourly: purge `scan_jobs` rows in a terminal state older than the
|
|
10 |
+ |
/// retention window. Queued/running rows are not touched.
|
|
11 |
+ |
pub(super) async fn purge_old_scan_jobs(state: &AppState) {
|
|
12 |
+ |
let window = chrono::Duration::days(constants::SCAN_JOB_RETENTION_DAYS as i64);
|
|
13 |
+ |
match db::scan_jobs::purge_old_terminal(&state.db, window).await {
|
|
14 |
+ |
Ok(n) => {
|
|
15 |
+ |
if n > 0 {
|
|
16 |
+ |
tracing::info!(purged = n, "scan_jobs retention sweep");
|
|
17 |
+ |
}
|
|
18 |
+ |
let _ = db::scheduler_jobs::record_job_run(
|
|
19 |
+ |
&state.db, "scan_jobs_retention", n as i64,
|
|
20 |
+ |
).await;
|
|
21 |
+ |
}
|
|
22 |
+ |
Err(e) => tracing::error!(error = ?e, "scan_jobs retention sweep failed"),
|
|
23 |
+ |
}
|
|
24 |
+ |
}
|
|
25 |
+ |
|
| 8 |
26 |
|
/// Delete expired sandbox accounts and their S3 objects.
|
| 9 |
27 |
|
pub(super) async fn cleanup_sandbox_accounts(state: &AppState) {
|
| 10 |
28 |
|
let expired_ids = match db::users::get_expired_sandbox_ids(&state.db).await {
|
| 204 |
204 |
|
// the app owner. Cheap query — single JOIN on a small table.
|
| 205 |
205 |
|
if run_hourly {
|
| 206 |
206 |
|
synckit_warnings::check_and_send_warnings(&state).await;
|
|
207 |
+ |
cleanup::purge_old_scan_jobs(&state).await;
|
| 207 |
208 |
|
}
|
| 208 |
209 |
|
|
| 209 |
210 |
|
// Weekly storage drift correction + integrity checks
|