Skip to main content

max / makenotwork

6.9 KB · 193 lines History Blame Raw
1 //! Webhook retry with exponential backoff and stale refund escalation.
2
3 use crate::db;
4 use crate::AppState;
5
6 /// Maximum webhook retry attempts before marking as dead letter.
7 const WEBHOOK_MAX_RETRIES: i32 = 5;
8
9 /// Determine whether a webhook retry attempt should be treated as a dead letter.
10 pub(super) fn is_webhook_dead(attempt: i32) -> bool {
11 attempt >= WEBHOOK_MAX_RETRIES
12 }
13
14 /// Retry failed webhook events with exponential backoff.
15 pub(super) async fn retry_failed_webhooks(state: &AppState) {
16 let events = match db::webhook_events::get_retryable_events(&state.db).await {
17 Ok(e) if e.is_empty() => return,
18 Ok(e) => e,
19 Err(e) => {
20 tracing::error!(error = ?e, "failed to fetch retryable webhook events");
21 return;
22 }
23 };
24
25 if state.stripe.is_none() { return; }
26
27 for event in events {
28 let attempt = event.attempts + 1;
29 tracing::info!(
30 event_id = %event.id, source = %event.source, event_type = %event.event_type,
31 attempt = attempt, "retrying webhook event"
32 );
33
34 // Retry re-runs the full event handler. All handlers must be idempotent
35 // (use ON CONFLICT / WHERE status='pending' guards) since steps completed
36 // before the original failure are not rolled back.
37 let result = if event.source == "stripe" {
38 match crate::payments::UntypedEvent::from_payload(&event.payload) {
39 Ok(parsed) => {
40 let crate::payments::UntypedEvent { id, type_, data_object } = parsed;
41 crate::routes::stripe::process_webhook_event(state, &type_, &id, data_object).await
42 }
43 Err(e) => Err(e),
44 }
45 } else {
46 Err(crate::error::AppError::BadRequest(format!("Unknown webhook source: {}", event.source)))
47 };
48
49 match result {
50 Ok(()) => {
51 tracing::info!(event_id = %event.id, "webhook retry succeeded");
52 if let Err(e) = db::webhook_events::mark_processed(&state.db, event.id).await {
53 tracing::error!(error = ?e, "failed to mark webhook event as processed");
54 }
55 }
56 Err(e) => {
57 let is_dead = is_webhook_dead(attempt);
58 tracing::warn!(
59 event_id = %event.id, attempt = attempt, error = ?e,
60 dead = is_dead,
61 "webhook retry failed"
62 );
63 if let Err(e) = db::webhook_events::schedule_retry(
64 &state.db, event.id, attempt, &format!("{:?}", e),
65 ).await {
66 tracing::error!(error = ?e, "failed to schedule webhook retry");
67 }
68
69 if is_dead
70 && let Some(ref wam) = state.wam
71 {
72 let title = format!(
73 "Dead webhook: {} ({})",
74 event.event_type, event.id
75 );
76 let body = format!(
77 "Webhook event exhausted all {} retry attempts.\n\
78 Source: {}\nType: {}\nLast error: {:?}",
79 attempt, event.source, event.event_type, e,
80 );
81 wam.create_ticket(
82 &title,
83 Some(&body),
84 "high",
85 "webhook-dead-letter",
86 Some(&event.id.to_string()),
87 )
88 .await;
89 }
90 }
91 }
92 }
93 }
94
95 /// Alert the admin about pending refunds that have gone unmatched for >24 hours.
96 pub(super) async fn escalate_stale_refunds(state: &AppState) {
97 let stale = match db::pending_refunds::get_stale_refunds(
98 &state.db,
99 chrono::Duration::hours(24),
100 )
101 .await
102 {
103 Ok(s) if s.is_empty() => return,
104 Ok(s) => s,
105 Err(e) => {
106 tracing::error!(error = ?e, "failed to query stale pending refunds");
107 return;
108 }
109 };
110
111 let alert_email = std::env::var("ALERT_EMAIL").ok();
112
113 for refund in &stale {
114 // Mark escalated FIRST to prevent duplicate alerts on retry
115 if let Err(e) = db::pending_refunds::mark_escalated(&state.db, refund.id).await {
116 tracing::error!(error = ?e, "failed to mark pending refund as escalated, skipping alerts");
117 continue;
118 }
119
120 tracing::error!(
121 payment_intent_id = %refund.payment_intent_id,
122 amount = refund.amount.as_i64(),
123 amount_refunded = refund.amount_refunded.as_i64(),
124 created_at = %refund.created_at,
125 "STALE PENDING REFUND: unmatched for >24h, needs manual investigation"
126 );
127
128 if let Some(ref to) = alert_email {
129 let subject = format!(
130 "Unmatched refund: {} ({}c refunded)",
131 refund.payment_intent_id, refund.amount_refunded
132 );
133 let body = format!(
134 "A charge.refunded webhook for payment intent {} has been pending for >24 hours \
135 with no matching completed transaction.\n\n\
136 Amount: {}c\nAmount refunded: {}c\nReceived: {}\n\n\
137 This likely means the checkout.session.completed webhook was lost. \
138 Check the Stripe dashboard and reconcile manually.",
139 refund.payment_intent_id,
140 refund.amount,
141 refund.amount_refunded,
142 refund.created_at,
143 );
144 if let Err(e) = state.email.send_alert(to, &subject, &body).await {
145 tracing::error!(error = ?e, "failed to send stale refund alert email");
146 }
147 }
148
149 if let Some(ref wam) = state.wam {
150 let title = format!(
151 "Unmatched refund: {} ({}c)",
152 refund.payment_intent_id, refund.amount_refunded
153 );
154 let body = format!(
155 "charge.refunded webhook pending >24h with no matching completed transaction.\n\
156 Amount: {}c\nRefunded: {}c\nReceived: {}\n\
157 Check Stripe dashboard and reconcile manually.",
158 refund.amount, refund.amount_refunded, refund.created_at,
159 );
160 wam.create_ticket(
161 &title,
162 Some(&body),
163 "critical",
164 "refund-escalation",
165 Some(&refund.payment_intent_id),
166 )
167 .await;
168 }
169 }
170 }
171
#[cfg(test)]
mod tests {
    use super::*;

    /// Attempts below the retry limit must not be reported as dead.
    #[test]
    fn webhook_not_dead_under_threshold() {
        for attempt in [1, 4] {
            assert!(!is_webhook_dead(attempt));
        }
    }

    /// The attempt that reaches the retry limit is reported as dead.
    #[test]
    fn webhook_dead_at_threshold() {
        let at_limit = 5;
        assert!(is_webhook_dead(at_limit));
    }

    /// Attempts past the retry limit stay dead.
    #[test]
    fn webhook_dead_above_threshold() {
        for attempt in [6, 100] {
            assert!(is_webhook_dead(attempt));
        }
    }
}
193