max / makenotwork

server: scan worker resets entity from Scanning on pipeline error

When run_pipeline_and_decide returned an error (e.g. an S3 download failure on a stale s3_key), the worker propagated it to the outer loop, which called mark_failed on the job but left the entity stuck at scan_status = 'scanning' forever, because the second update_entity_status call never ran. Two GO/AF .dmg versions hit this during the post-deploy bulk re-scan. Fix: on pipeline error, reset the entity to HeldForReview before propagating the error. Operators then see the orphan on the dashboard and decide whether to delete the row.

Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-24 21:52 UTC

Commit: 1e2c4ebb82d66f7bed18a0f61b5791dc4dff3bc0

Parent: ca3516c

1 file changed, +15 insertions, -1 deletion

M server/src/scanning/worker.rs +15 -1

			@@ -116,6 +116,11 @@ pub fn spawn_pool(n: usize, ctx: Arc<WorkerContext>, shutdown_rx: tokio::sync::w
116	116
117	117		/// Run a single scan job end-to-end. On success the job is marked done; the
118	118		/// caller marks failed if this returns an error.
	119	+	///
	120	+	/// On pipeline error (e.g. S3 download failure), reset the entity from
	121	+	/// Scanning back to HeldForReview before bubbling the error up. Otherwise
	122	+	/// the entity stays stuck at Scanning forever — a real regression we hit
	123	+	/// in production with stale s3_keys.
119	124		#[tracing::instrument(skip_all, fields(%job_id = job.id, target_kind = %job.target_kind, %target_id = job.target_id, attempts = job.attempts))]
120	125		async fn process_job(ctx: &WorkerContext, job: ScanJob) -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
121	126		let job_id = job.id;
			@@ -128,7 +133,16 @@ async fn process_job(ctx: &WorkerContext, job: ScanJob) -> Result<(), Box<dyn st
128	133		// queue panel.
129	134		update_entity_status(&ctx.db, kind, target_id, FileScanStatus::Scanning).await.ok();
130	135
131		-	let entity_status = run_pipeline_and_decide(ctx, &job, file_type).await?;
	136	+	let entity_status = match run_pipeline_and_decide(ctx, &job, file_type).await {
	137	+	Ok(s) => s,
	138	+	Err(e) => {
	139	+	// Pipeline blew up — most often a stale s3_key. Reset entity to
	140	+	// HeldForReview so admins see it on the dashboard and decide
	141	+	// whether to delete the orphan record.
	142	+	update_entity_status(&ctx.db, kind, target_id, FileScanStatus::HeldForReview).await.ok();
	143	+	return Err(e);
	144	+	}
	145	+	};
132	146		update_entity_status(&ctx.db, kind, target_id, entity_status).await?;
133	147
134	148		db::scan_jobs::mark_done(&ctx.db, job_id).await?;