Skip to main content

max / makenotwork

v0.6.16: pg_stat_activity saturation alert + caddy-ask concurrency cap monitor.rs: per-cycle probe of shared Postgres via pg_stat_activity. Fires WAM ticket pg-stat-activity-saturation (priority high) when client backends exceed 80% of max_connections. Catches MT + ad hoc clients the local pool counter cannot see. routes/api/domains.rs caddy-ask: cheap structural reject (no dot, > 253 chars, whitespace/control) before any DB lookup, plus an 8-slot semaphore on the cache-miss path. At capacity returns 503 so Caddy backs off instead of stampeding the pool. Cache hits unbounded. tests/load/runner.rs: backfill four founder-pricing Config fields that broke the load harness build. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-21 03:47 UTC
Commit: 858f328ef52b24f76eae0f24bf50fae5ee4cd108
Parent: d8b95b6
9 files changed, +107 insertions, -6 deletions
@@ -3551,7 +3551,7 @@ dependencies = [
3551 3551
3552 3552 [[package]]
3553 3553 name = "makenotwork"
3554 - version = "0.6.15"
3554 + version = "0.6.16"
3555 3555 dependencies = [
3556 3556 "anyhow",
3557 3557 "argon2",
@@ -1,6 +1,6 @@
1 1 [package]
2 2 name = "makenotwork"
3 - version = "0.6.15"
3 + version = "0.6.16"
4 4 edition = "2024"
5 5 license-file = "LICENSE"
6 6
@@ -138,6 +138,13 @@ pub const PAGINATION_WINDOW_SIZE: u32 = 5;
138 138 // -- File scanning --
139 139 pub const SCAN_MAX_MEMORY_BYTES: usize = 100 * 1024 * 1024; // 100 MB in-memory threshold
140 140 pub const SCAN_MAX_CONCURRENT: usize = 4; // Max concurrent file scans (each can use up to 100 MB RAM)
141 +
142 + // -- Caddy on-demand TLS --
143 + // Caps concurrent cache-miss DB lookups in `/api/domains/caddy-ask`. Cache hits
144 + // are unbounded (DashMap). At capacity, the handler returns 503 so Caddy retries
145 + // later instead of stampeding the DB pool or driving ACME issuance for garbage
146 + // domains. Sized small because the slow path is one indexed lookup.
147 + pub const CADDY_ASK_MAX_CONCURRENT: usize = 8;
141 148 pub const SCAN_ZIP_MAX_RATIO: f64 = 100.0; // Max compression ratio before ZIP bomb
142 149 pub const SCAN_ZIP_MAX_DEPTH: u32 = 2; // Max nested archives (detection is 1 level deep; decompressed size limit is the primary defense)
143 150 pub const SCAN_ZIP_MAX_UNCOMPRESSED: u64 = 2 * 1024 * 1024 * 1024; // 2 GB uncompressed limit
@@ -89,6 +89,9 @@ pub struct AppState {
89 89 /// Limits concurrent file scans to prevent memory exhaustion (each scan
90 90 /// downloads up to SCAN_MAX_MEMORY_BYTES into RAM).
91 91 pub scan_semaphore: Arc<tokio::sync::Semaphore>,
92 + /// Caps concurrent cache-miss DB lookups in `caddy-ask` so a flood of
93 + /// unknown-domain queries can't saturate the pool or drive ACME issuance.
94 + pub caddy_ask_semaphore: Arc<tokio::sync::Semaphore>,
92 95 /// Unix timestamp when the server will restart (0 = no restart pending).
93 96 /// Set by the deploy script via the internal API before uploading a new binary.
94 97 pub restart_at: Arc<std::sync::atomic::AtomicI64>,
@@ -302,6 +302,7 @@ async fn main() {
302 302 wam,
303 303 domain_cache,
304 304 scan_semaphore: std::sync::Arc::new(tokio::sync::Semaphore::new(makenotwork::constants::SCAN_MAX_CONCURRENT)),
305 + caddy_ask_semaphore: std::sync::Arc::new(tokio::sync::Semaphore::new(makenotwork::constants::CADDY_ASK_MAX_CONCURRENT)),
305 306 restart_at: std::sync::Arc::new(std::sync::atomic::AtomicI64::new(0)),
306 307 sync_notify: std::sync::Arc::new(dashmap::DashMap::new()),
307 308 sse_connections: std::sync::Arc::new(dashmap::DashMap::new()),
@@ -115,6 +115,7 @@ pub fn spawn_monitor(
115 115 let mut previous_status: Option<MonitorStatus> = None;
116 116 let mut last_alert_at: Option<Instant> = None;
117 117 let mut last_pool_alert_at: Option<Instant> = None;
118 + let mut last_pg_activity_alert_at: Option<Instant> = None;
118 119 let mut prune_counter: u64 = 0;
119 120
120 121 loop {
@@ -248,6 +249,64 @@ pub fn spawn_monitor(
248 249 }
249 250 }
250 251
252 + // Postgres server-wide saturation check via pg_stat_activity. This
253 + // catches cases where the *shared* Postgres (MNW + MT pools + ad hoc
254 + // clients) is approaching `max_connections`, which the local pool
255 + // pressure check above cannot see.
256 + {
257 + let row: Result<(i64, i64), _> = sqlx::query_as(
258 + "SELECT \
259 + (SELECT count(*) FROM pg_stat_activity \
260 + WHERE state IS NOT NULL AND backend_type = 'client backend')::bigint, \
261 + current_setting('max_connections')::bigint",
262 + )
263 + .fetch_one(&state.db)
264 + .await;
265 +
266 + if let Ok((active, max_conn)) = row
267 + && max_conn > 0
268 + {
269 + let pct = active * 100 / max_conn;
270 + if pct > 80 {
271 + tracing::warn!(
272 + active,
273 + max_conn,
274 + pct,
275 + "Postgres pg_stat_activity saturation >80%"
276 + );
277 + let cooldown_ok = last_pg_activity_alert_at
278 + .is_none_or(|t| t.elapsed().as_secs() >= constants::ALERT_COOLDOWN_SECS);
279 + if cooldown_ok
280 + && let Some(ref wam) = state.wam
281 + {
282 + let title = format!(
283 + "Postgres saturation: {active}/{max_conn} client backends ({pct}%)"
284 + );
285 + let body = format!(
286 + "pg_stat_activity client-backend count is at {pct}% of \
287 + max_connections ({active}/{max_conn}). Shared Postgres serves \
288 + MNW + MT + ad hoc clients; exhaustion will fail new connections \
289 + for all of them. Investigate which role/application is holding \
290 + connections via:\n\n \
291 + SELECT usename, application_name, state, count(*) \
292 + FROM pg_stat_activity GROUP BY 1,2,3 ORDER BY 4 DESC;"
293 + );
294 + wam.create_ticket(
295 + &title,
296 + Some(&body),
297 + "high",
298 + "pg-stat-activity-saturation",
299 + None,
300 + )
301 + .await;
302 + last_pg_activity_alert_at = Some(Instant::now());
303 + }
304 + }
305 + } else if let Err(e) = row {
306 + tracing::debug!(error = ?e, "pg_stat_activity probe failed");
307 + }
308 + }
309 +
251 310 // Persist snapshot (best-effort)
252 311 if let Err(e) = db::monitor::insert_health_history(
253 312 &state.db,
@@ -162,7 +162,15 @@ pub(super) struct CaddyAskQuery {
162 162 /// GET /api/domains/caddy-ask — Caddy on-demand TLS check.
163 163 ///
164 164 /// Unauthenticated (called by Caddy, not users). Returns 200 if the domain
165 - /// is verified, 404 if not.
165 + /// is verified, 404 if not. Caddy treats anything outside the 2xx range as
166 + /// "do not issue."
167 + ///
168 + /// Abuse model: every request reaches MNW from Caddy's IP, so per-IP rate
169 + /// limiting can't distinguish a legitimate visitor from an attacker probing
170 + /// thousands of bogus hostnames. Defenses here are content-based:
171 + /// 1. Reject syntactically-invalid hostnames before touching the DB.
172 + /// 2. Bound concurrent cache-miss DB lookups via `caddy_ask_semaphore`;
173 + /// at capacity, return 503 so Caddy backs off and the pool stays free.
166 174 #[tracing::instrument(skip_all, name = "api::domains::caddy_ask")]
167 175 pub(super) async fn caddy_ask(
168 176 State(state): State<AppState>,
@@ -170,15 +178,32 @@ pub(super) async fn caddy_ask(
170 178 ) -> impl IntoResponse {
171 179 let domain = q.domain.to_lowercase();
172 180
173 - // Fast path: check cache
181 + // Cheap structural reject: hostnames with no dot, spaces, control chars,
182 + // or absurd length can't be verified domains. Skipping the DB on these
183 + // shuts down the easiest flood vector.
184 + if domain.is_empty()
185 + || domain.len() > 253
186 + || !domain.contains('.')
187 + || domain.contains(|c: char| c.is_whitespace() || c.is_control())
188 + {
189 + return axum::http::StatusCode::NOT_FOUND;
190 + }
191 +
192 + // Fast path: cache hit, no DB, no semaphore.
174 193 if state.domain_cache.contains_key(&domain) {
175 194 return axum::http::StatusCode::OK;
176 195 }
177 196
178 - // Slow path: check DB
197 + // Slow path: cap concurrent DB lookups. `try_acquire` is non-blocking;
198 + // hitting the cap means we're already under pressure and want Caddy to
199 + // retry rather than queue.
200 + let Ok(_permit) = state.caddy_ask_semaphore.try_acquire() else {
201 + tracing::warn!(domain = %domain, "caddy-ask: cache-miss concurrency cap reached");
202 + return axum::http::StatusCode::SERVICE_UNAVAILABLE;
203 + };
204 +
179 205 match db::custom_domains::get_verified_domain(&state.db, &domain).await {
180 206 Ok(Some(d)) => {
181 - // Populate cache for next time
182 207 state.domain_cache.insert(d.domain, d.user_id);
183 208 axum::http::StatusCode::OK
184 209 }
@@ -346,6 +346,7 @@ impl TestHarness {
346 346 sse_connections: Arc::new(dashmap::DashMap::new()),
347 347 metrics_handle: None,
348 348 scan_semaphore: Arc::new(tokio::sync::Semaphore::new(4)),
349 + caddy_ask_semaphore: Arc::new(tokio::sync::Semaphore::new(8)),
349 350 };
350 351
351 352 let app = build_app(state, session_layer);
@@ -68,6 +68,10 @@ pub async fn run(config: LoadConfig) {
68 68 mt_base_url: None,
69 69 fan_plus_price_id: None,
70 70 creator_tier_prices: std::collections::HashMap::new(),
71 + creator_tier_annual_prices: std::collections::HashMap::new(),
72 + creator_tier_founder_prices: std::collections::HashMap::new(),
73 + creator_tier_founder_annual_prices: std::collections::HashMap::new(),
74 + creator_founder_window_open: false,
71 75 build_trigger_token: None,
72 76 build_host_linux: None,
73 77 build_host_darwin: None,
@@ -122,6 +126,7 @@ pub async fn run(config: LoadConfig) {
122 126 sse_connections: Arc::new(dashmap::DashMap::new()),
123 127 metrics_handle: None,
124 128 scan_semaphore: Arc::new(tokio::sync::Semaphore::new(4)),
129 + caddy_ask_semaphore: Arc::new(tokio::sync::Semaphore::new(8)),
125 130 };
126 131
127 132 let app = build_app(state, session_layer);