max / makenotwork
9 files changed,
+107 insertions,
-6 deletions
| @@ -3551,7 +3551,7 @@ dependencies = [ | |||
| 3551 | 3551 | ||
| 3552 | 3552 | [[package]] | |
| 3553 | 3553 | name = "makenotwork" | |
| 3554 | - | version = "0.6.15" | |
| 3554 | + | version = "0.6.16" | |
| 3555 | 3555 | dependencies = [ | |
| 3556 | 3556 | "anyhow", | |
| 3557 | 3557 | "argon2", |
| @@ -1,6 +1,6 @@ | |||
| 1 | 1 | [package] | |
| 2 | 2 | name = "makenotwork" | |
| 3 | - | version = "0.6.15" | |
| 3 | + | version = "0.6.16" | |
| 4 | 4 | edition = "2024" | |
| 5 | 5 | license-file = "LICENSE" | |
| 6 | 6 |
| @@ -138,6 +138,13 @@ pub const PAGINATION_WINDOW_SIZE: u32 = 5; | |||
| 138 | 138 | // -- File scanning -- | |
| 139 | 139 | pub const SCAN_MAX_MEMORY_BYTES: usize = 100 * 1024 * 1024; // 100 MB in-memory threshold | |
| 140 | 140 | pub const SCAN_MAX_CONCURRENT: usize = 4; // Max concurrent file scans (each can use up to 100 MB RAM) | |
| 141 | + | ||
| 142 | + | // -- Caddy on-demand TLS -- | |
| 143 | + | // Caps concurrent cache-miss DB lookups in `/api/domains/caddy-ask`. Cache hits | |
| 144 | + | // are unbounded (DashMap). At capacity, the handler returns 503 so Caddy retries | |
| 145 | + | // later instead of stampeding the DB pool or driving ACME issuance for garbage | |
| 146 | + | // domains. Sized small because the slow path is one indexed lookup. | |
| 147 | + | pub const CADDY_ASK_MAX_CONCURRENT: usize = 8; | |
| 141 | 148 | pub const SCAN_ZIP_MAX_RATIO: f64 = 100.0; // Max compression ratio before ZIP bomb | |
| 142 | 149 | pub const SCAN_ZIP_MAX_DEPTH: u32 = 2; // Max nested archives (detection is 1 level deep; decompressed size limit is the primary defense) | |
| 143 | 150 | pub const SCAN_ZIP_MAX_UNCOMPRESSED: u64 = 2 * 1024 * 1024 * 1024; // 2 GB uncompressed limit |
| @@ -89,6 +89,9 @@ pub struct AppState { | |||
| 89 | 89 | /// Limits concurrent file scans to prevent memory exhaustion (each scan | |
| 90 | 90 | /// downloads up to SCAN_MAX_MEMORY_BYTES into RAM). | |
| 91 | 91 | pub scan_semaphore: Arc<tokio::sync::Semaphore>, | |
| 92 | + | /// Caps concurrent cache-miss DB lookups in `caddy-ask` so a flood of | |
| 93 | + | /// unknown-domain queries can't saturate the pool or drive ACME issuance. | |
| 94 | + | pub caddy_ask_semaphore: Arc<tokio::sync::Semaphore>, | |
| 92 | 95 | /// Unix timestamp when the server will restart (0 = no restart pending). | |
| 93 | 96 | /// Set by the deploy script via the internal API before uploading a new binary. | |
| 94 | 97 | pub restart_at: Arc<std::sync::atomic::AtomicI64>, |
| @@ -302,6 +302,7 @@ async fn main() { | |||
| 302 | 302 | wam, | |
| 303 | 303 | domain_cache, | |
| 304 | 304 | scan_semaphore: std::sync::Arc::new(tokio::sync::Semaphore::new(makenotwork::constants::SCAN_MAX_CONCURRENT)), | |
| 305 | + | caddy_ask_semaphore: std::sync::Arc::new(tokio::sync::Semaphore::new(makenotwork::constants::CADDY_ASK_MAX_CONCURRENT)), | |
| 305 | 306 | restart_at: std::sync::Arc::new(std::sync::atomic::AtomicI64::new(0)), | |
| 306 | 307 | sync_notify: std::sync::Arc::new(dashmap::DashMap::new()), | |
| 307 | 308 | sse_connections: std::sync::Arc::new(dashmap::DashMap::new()), |
| @@ -115,6 +115,7 @@ pub fn spawn_monitor( | |||
| 115 | 115 | let mut previous_status: Option<MonitorStatus> = None; | |
| 116 | 116 | let mut last_alert_at: Option<Instant> = None; | |
| 117 | 117 | let mut last_pool_alert_at: Option<Instant> = None; | |
| 118 | + | let mut last_pg_activity_alert_at: Option<Instant> = None; | |
| 118 | 119 | let mut prune_counter: u64 = 0; | |
| 119 | 120 | ||
| 120 | 121 | loop { | |
| @@ -248,6 +249,64 @@ pub fn spawn_monitor( | |||
| 248 | 249 | } | |
| 249 | 250 | } | |
| 250 | 251 | ||
| 252 | + | // Postgres server-wide saturation check via pg_stat_activity. This | |
| 253 | + | // catches cases where the *shared* Postgres (MNW + MT pools + ad hoc | |
| 254 | + | // clients) is approaching `max_connections`, which the local pool | |
| 255 | + | // pressure check above cannot see. | |
| 256 | + | { | |
| 257 | + | let row: Result<(i64, i64), _> = sqlx::query_as( | |
| 258 | + | "SELECT \ | |
| 259 | + | (SELECT count(*) FROM pg_stat_activity \ | |
| 260 | + | WHERE state IS NOT NULL AND backend_type = 'client backend')::bigint, \ | |
| 261 | + | current_setting('max_connections')::bigint", | |
| 262 | + | ) | |
| 263 | + | .fetch_one(&state.db) | |
| 264 | + | .await; | |
| 265 | + | ||
| 266 | + | if let Ok((active, max_conn)) = row | |
| 267 | + | && max_conn > 0 | |
| 268 | + | { | |
| 269 | + | let pct = active * 100 / max_conn; | |
| 270 | + | if pct > 80 { | |
| 271 | + | tracing::warn!( | |
| 272 | + | active, | |
| 273 | + | max_conn, | |
| 274 | + | pct, | |
| 275 | + | "Postgres pg_stat_activity saturation >80%" | |
| 276 | + | ); | |
| 277 | + | let cooldown_ok = last_pg_activity_alert_at | |
| 278 | + | .is_none_or(|t| t.elapsed().as_secs() >= constants::ALERT_COOLDOWN_SECS); | |
| 279 | + | if cooldown_ok | |
| 280 | + | && let Some(ref wam) = state.wam | |
| 281 | + | { | |
| 282 | + | let title = format!( | |
| 283 | + | "Postgres saturation: {active}/{max_conn} client backends ({pct}%)" | |
| 284 | + | ); | |
| 285 | + | let body = format!( | |
| 286 | + | "pg_stat_activity client-backend count is at {pct}% of \ | |
| 287 | + | max_connections ({active}/{max_conn}). Shared Postgres serves \ | |
| 288 | + | MNW + MT + ad hoc clients; exhaustion will fail new connections \ | |
| 289 | + | for all of them. Investigate which role/application is holding \ | |
| 290 | + | connections via:\n\n \ | |
| 291 | + | SELECT usename, application_name, state, count(*) \ | |
| 292 | + | FROM pg_stat_activity GROUP BY 1,2,3 ORDER BY 4 DESC;" | |
| 293 | + | ); | |
| 294 | + | wam.create_ticket( | |
| 295 | + | &title, | |
| 296 | + | Some(&body), | |
| 297 | + | "high", | |
| 298 | + | "pg-stat-activity-saturation", | |
| 299 | + | None, | |
| 300 | + | ) | |
| 301 | + | .await; | |
| 302 | + | last_pg_activity_alert_at = Some(Instant::now()); | |
| 303 | + | } | |
| 304 | + | } | |
| 305 | + | } else if let Err(e) = row { | |
| 306 | + | tracing::debug!(error = ?e, "pg_stat_activity probe failed"); | |
| 307 | + | } | |
| 308 | + | } | |
| 309 | + | ||
| 251 | 310 | // Persist snapshot (best-effort) | |
| 252 | 311 | if let Err(e) = db::monitor::insert_health_history( | |
| 253 | 312 | &state.db, |
| @@ -162,7 +162,15 @@ pub(super) struct CaddyAskQuery { | |||
| 162 | 162 | /// GET /api/domains/caddy-ask — Caddy on-demand TLS check. | |
| 163 | 163 | /// | |
| 164 | 164 | /// Unauthenticated (called by Caddy, not users). Returns 200 if the domain | |
| 165 | - | /// is verified, 404 if not. | |
| 165 | + | /// is verified, 404 if not. Caddy treats anything outside the 2xx range as | |
| 166 | + | /// "do not issue." | |
| 167 | + | /// | |
| 168 | + | /// Abuse model: every request reaches MNW from Caddy's IP, so per-IP rate | |
| 169 | + | /// limiting can't distinguish a legitimate visitor from an attacker probing | |
| 170 | + | /// thousands of bogus hostnames. Defenses here are content-based: | |
| 171 | + | /// 1. Reject syntactically-invalid hostnames before touching the DB. | |
| 172 | + | /// 2. Bound concurrent cache-miss DB lookups via `caddy_ask_semaphore`; | |
| 173 | + | /// at capacity, return 503 so Caddy backs off and the pool stays free. | |
| 166 | 174 | #[tracing::instrument(skip_all, name = "api::domains::caddy_ask")] | |
| 167 | 175 | pub(super) async fn caddy_ask( | |
| 168 | 176 | State(state): State<AppState>, | |
| @@ -170,15 +178,32 @@ pub(super) async fn caddy_ask( | |||
| 170 | 178 | ) -> impl IntoResponse { | |
| 171 | 179 | let domain = q.domain.to_lowercase(); | |
| 172 | 180 | ||
| 173 | - | // Fast path: check cache | |
| 181 | + | // Cheap structural reject: hostnames with no dot, spaces, control chars, | |
| 182 | + | // or absurd length can't be verified domains. Skipping the DB on these | |
| 183 | + | // shuts down the easiest flood vector. | |
| 184 | + | if domain.is_empty() | |
| 185 | + | || domain.len() > 253 | |
| 186 | + | || !domain.contains('.') | |
| 187 | + | || domain.contains(|c: char| c.is_whitespace() || c.is_control()) | |
| 188 | + | { | |
| 189 | + | return axum::http::StatusCode::NOT_FOUND; | |
| 190 | + | } | |
| 191 | + | ||
| 192 | + | // Fast path: cache hit, no DB, no semaphore. | |
| 174 | 193 | if state.domain_cache.contains_key(&domain) { | |
| 175 | 194 | return axum::http::StatusCode::OK; | |
| 176 | 195 | } | |
| 177 | 196 | ||
| 178 | - | // Slow path: check DB | |
| 197 | + | // Slow path: cap concurrent DB lookups. `try_acquire` is non-blocking; | |
| 198 | + | // hitting the cap means we're already under pressure and want Caddy to | |
| 199 | + | // retry rather than queue. | |
| 200 | + | let Ok(_permit) = state.caddy_ask_semaphore.try_acquire() else { | |
| 201 | + | tracing::warn!(domain = %domain, "caddy-ask: cache-miss concurrency cap reached"); | |
| 202 | + | return axum::http::StatusCode::SERVICE_UNAVAILABLE; | |
| 203 | + | }; | |
| 204 | + | ||
| 179 | 205 | match db::custom_domains::get_verified_domain(&state.db, &domain).await { | |
| 180 | 206 | Ok(Some(d)) => { | |
| 181 | - | // Populate cache for next time | |
| 182 | 207 | state.domain_cache.insert(d.domain, d.user_id); | |
| 183 | 208 | axum::http::StatusCode::OK | |
| 184 | 209 | } |
| @@ -346,6 +346,7 @@ impl TestHarness { | |||
| 346 | 346 | sse_connections: Arc::new(dashmap::DashMap::new()), | |
| 347 | 347 | metrics_handle: None, | |
| 348 | 348 | scan_semaphore: Arc::new(tokio::sync::Semaphore::new(4)), | |
| 349 | + | caddy_ask_semaphore: Arc::new(tokio::sync::Semaphore::new(8)), | |
| 349 | 350 | }; | |
| 350 | 351 | ||
| 351 | 352 | let app = build_app(state, session_layer); |
| @@ -68,6 +68,10 @@ pub async fn run(config: LoadConfig) { | |||
| 68 | 68 | mt_base_url: None, | |
| 69 | 69 | fan_plus_price_id: None, | |
| 70 | 70 | creator_tier_prices: std::collections::HashMap::new(), | |
| 71 | + | creator_tier_annual_prices: std::collections::HashMap::new(), | |
| 72 | + | creator_tier_founder_prices: std::collections::HashMap::new(), | |
| 73 | + | creator_tier_founder_annual_prices: std::collections::HashMap::new(), | |
| 74 | + | creator_founder_window_open: false, | |
| 71 | 75 | build_trigger_token: None, | |
| 72 | 76 | build_host_linux: None, | |
| 73 | 77 | build_host_darwin: None, | |
| @@ -122,6 +126,7 @@ pub async fn run(config: LoadConfig) { | |||
| 122 | 126 | sse_connections: Arc::new(dashmap::DashMap::new()), | |
| 123 | 127 | metrics_handle: None, | |
| 124 | 128 | scan_semaphore: Arc::new(tokio::sync::Semaphore::new(4)), | |
| 129 | + | caddy_ask_semaphore: Arc::new(tokio::sync::Semaphore::new(8)), | |
| 125 | 130 | }; | |
| 126 | 131 | ||
| 127 | 132 | let app = build_app(state, session_layer); |