Skip to main content

max / makenotwork

launch-eve audit pass: Ultra Fuzz Runs #8-9 + cross-cutting sweeps server/ Run #8 (earlier today): all 5 axes brought to A-; new src/background.rs bounded mpsc + semaphore queue replaces 22 per-request tokio::spawn sites; cart min_price_cents/chain-break MEDs fixed; item-wizard pricing_model silent fallback fixed; inline-JS copy-link migrated to delegated handler; cart free-claim N+1 closed; commit_rescan helper extends chronic-disease seal to admin paths; new migrations 123-130, 133 (backup-code prefix, completion_effects, scan_status index, sync_apps loopback, dead-letter table, ip index, non-negative duration); 7-wave backlog sweep closed 24/26 carried items. Run #9 (this session): launch-eve deep pass surfaced + fixed - UX-CRITICAL: signup TOCTOU race 23505 -> 500 + form loss (join_wizard.rs catches 23505 with constraint-name routing) - Sec-SERIOUS: delete_all_sessions_for_user non-atomic JWT bump (sessions.rs wrapped in pool.begin() / tx.commit()) - Sec-SERIOUS: 2FA login-email IP spoofable via bare x-forwarded-for (two_factor.rs uses helpers::extract_client_ip) - Pay-SERIOUS: webhook dual-failure 503 short-circuited Stripe retry (webhook/mod.rs calls unmark_event_processed before 503) Deferred with rationale in docs/audit_review.md + todo.md: 1 SERIOUS (subscription webhook ordering), 3 HIGH (dead-letter unused, reqwest per-request x5, unbounded cleanup spawn), 7 MED, 8 LOW. §1.1 public surface: OG/Twitter meta in base.html, static/manifest.json, error.html contact link, sitemap.rs with in-memory cache. info@ email pin across 8 files. doc-fuzz/exorcise/nitpick/security-review passes complete. sando/ daemon build + main updates, deploy systemd unit + config example, post-receive hook, BOM doc edits. Launchplan §1.5 A- bar holds across all 5 axes.
Author: Max Johnson <me@maxj.phd> · 2026-06-01 00:37 UTC
Commit: 3dc8dca7a120c483dd2fcef8c9f34ccad19847a6
Parent: eee96a7
155 files changed, +3743 insertions, -1100 deletions
@@ -38,6 +38,8 @@ pub async fn run(
38 38 let version = read_pkg_version(&server_dir.join("Cargo.toml")).await
39 39 .with_context(|| format!("reading version from {}/Cargo.toml", server_dir.display()))?;
40 40
41 + tracing::info!(sha = %sha, version = %version, dir = %server_dir.display(), "cargo build --release start");
42 + let started = std::time::Instant::now();
41 43 let out = Command::new("cargo")
42 44 .arg("build")
43 45 .arg("--release")
@@ -45,6 +47,12 @@ pub async fn run(
45 47 .output()
46 48 .await
47 49 .context("spawning cargo build")?;
50 + let elapsed_s = started.elapsed().as_secs();
51 + if !out.status.success() {
52 + tracing::error!(sha = %sha, version = %version, elapsed_s, "cargo build --release failed");
53 + } else {
54 + tracing::info!(sha = %sha, version = %version, elapsed_s, "cargo build --release ok");
55 + }
48 56 anyhow::ensure!(
49 57 out.status.success(),
50 58 "cargo build --release failed:\n{}",
@@ -20,9 +20,15 @@ mod topology;
20 20 #[tokio::main]
21 21 async fn main() -> Result<()> {
22 22 tracing_subscriber::fmt()
23 + // stdout is block-buffered under systemd (no TTY) so events never
24 + // reach journald until the buffer fills or the process exits. stderr
25 + // is unbuffered (each event is written immediately), which is what we want for a long-running service.
26 + .with_writer(std::io::stderr)
23 27 .with_env_filter(
24 28 tracing_subscriber::EnvFilter::try_from_default_env()
25 - .unwrap_or_else(|_| "sando_daemon=info,tower_http=info".into()),
29 + // bin target name is `sandod`, NOT the package name `sando-daemon` —
30 + // `module_path!()` uses the binary's crate name, so events come from `sandod::*`.
31 + .unwrap_or_else(|_| "sandod=info,tower_http=info".into()),
26 32 )
27 33 .init();
28 34
@@ -0,0 +1,9 @@
1 + # Sando daemon config (production).
2 + # Install at /etc/sando/sando-daemon.toml on the Sando host.
3 +
4 + listen = "100.103.89.95:7766" # pop-os tailnet IP; bind tailnet-only, not 0.0.0.0
5 + db_path = "/srv/sando/state/sando.db"
6 + topology_path = "/etc/sando/sando.toml"
7 + workdir = "/srv/sando/work"
8 + release_root = "/srv/sando/releases"
9 + scratch_db_url = "postgres:///sando_scratch?host=/var/run/postgresql"
@@ -0,0 +1,55 @@
1 + # Sando daemon systemd service
2 + # Place at /etc/systemd/system/sandod.service on the Sando host (pop-os).
3 + #
4 + # Commands:
5 + # sudo systemctl daemon-reload
6 + # sudo systemctl enable sandod
7 + # sudo systemctl start sandod
8 + # sudo systemctl status sandod
9 + # journalctl -u sandod -f
10 +
11 + [Unit]
12 + Description=Sando deploy controller
13 + Documentation=https://github.com/maxjmath/MNW
14 + After=network.target postgresql.service
15 + Wants=postgresql.service
16 +
17 + [Service]
18 + Type=simple
19 + User=sando
20 + Group=sando
21 + WorkingDirectory=/srv/sando
22 + ExecStart=/usr/local/bin/sandod
23 + Restart=on-failure
24 + RestartSec=5
25 +
26 + Environment=SANDO_CONFIG=/etc/sando/sando-daemon.toml
27 + Environment=PATH=/srv/sando/.cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
28 + Environment=HOME=/srv/sando
29 + EnvironmentFile=-/etc/sando/sando.env
30 +
31 + # Security hardening. Sando needs ssh outbound, git over fs, postgres over
32 + # unix socket, and read/write on its own state dirs.
33 + NoNewPrivileges=true
34 + ProtectSystem=strict
35 + ProtectHome=true
36 + PrivateTmp=true
37 + ReadWritePaths=/srv/sando
38 + RestrictAddressFamilies=AF_UNIX AF_INET AF_INET6
39 + RestrictNamespaces=true
40 + RestrictRealtime=true
41 + RestrictSUIDSGID=true
42 + LockPersonality=true
43 + ProtectKernelTunables=true
44 + ProtectKernelModules=true
45 + ProtectControlGroups=true
46 + SystemCallArchitectures=native
47 +
48 + LimitNOFILE=65535
49 +
50 + StandardOutput=journal
51 + StandardError=journal
52 + SyslogIdentifier=sandod
53 +
54 + [Install]
55 + WantedBy=multi-user.target
@@ -8,6 +8,16 @@
8 8
9 9 set -euo pipefail
10 10
11 + # Pick up SANDO_DAEMON / SANDO_BRANCH from the daemon's env file when present
12 + # (the same file systemd's EnvironmentFile= points at). Lets the deployed hook
13 + # reach a non-loopback listen address without changing the hook source.
14 + if [[ -r /etc/sando/sando.env ]]; then
15 + set -a
16 + # shellcheck disable=SC1091
17 + source /etc/sando/sando.env
18 + set +a
19 + fi
20 +
11 21 DAEMON_URL="${SANDO_DAEMON:-http://127.0.0.1:7766}"
12 22 DEPLOY_BRANCH="${SANDO_BRANCH:-main}"
13 23
@@ -1,6 +1,6 @@
1 1 # MakeMachine Hardware BOM
2 2
3 - Settled 2026-05-23. Top-of-line host platform; GPUs are fungible and live on the EveryCycle GPU thread (see `~/hardware/everycycle/docs/roadmap.md`).
3 + Settled 2026-05-23. Top-of-line host platform; GPUs are fungible and live on the EveryCycle GPU thread (see `~/Code/everycycle/docs/roadmap.md`).
4 4
5 5 The substrate is built once and kept stable; GPU experimentation happens above it without revisiting motherboard, CPU, or RAM.
6 6
@@ -4,9 +4,13 @@
4 4 # unlock promotion *to* the next tier, the nodes it ships to, and the canary
5 5 # policy for shipping within the tier.
6 6 #
7 - # Day-one wiring: MM (local) -> A (testnot.work) -> B (prod-1). C is declared
8 - # but not provisioned; adding the second prod node later is a config edit
9 - # (set provisioned = true, fill in [[tier.node]]).
7 + # Day-one wiring: host (pop-os, local) -> A (testnot.work) -> B (prod-1). C is
8 + # declared but not provisioned; adding the second prod node later is a config
9 + # edit (set provisioned = true, fill in [[tier.node]]).
10 + #
11 + # Note: the host tier is named "mm" for legacy reasons (string identifier baked
12 + # into the schema + code). It refers to whatever machine sandod runs on —
13 + # currently pop-os, not a MakeMachine. Rename is a follow-up cleanup.
10 14
11 15 [repo]
12 16 bare_path = "/srv/sando/mnw.git"
@@ -18,7 +22,7 @@ branch = "main"
18 22 source = "rsync://astra/var/backups/mnw/latest.sql.gz"
19 23 local_path = "/srv/sando/backups/latest.sql.gz"
20 24
21 - # ---- MM: local pre-staging gate ----
25 + # ---- host: pop-os local pre-staging gate ----
22 26 [[tier]]
23 27 name = "mm"
24 28 provisioned = true
@@ -28,7 +32,7 @@ gates = [
28 32 { kind = "migration_dry_run" },
29 33 { kind = "boot_smoke" },
30 34 ]
31 - # MM is the daemon's own host; no remote node row.
35 + # Host is the daemon's own machine (pop-os); no remote node row.
32 36
33 37 # ---- A: testnot.work staging ----
34 38 [[tier]]
M sando/todo.md +25 -33
@@ -4,7 +4,9 @@ Open work only. Completed items move to `todo_done.md` (sibling file) when one e
4 4
5 5 Format rule: every actionable line is a `- [ ]` checkbox. Headings group phases and themes; do not put status updates in them.
6 6
7 - Roadmap target: replace `server/deploy/deploy.sh` and astra-hosted `server/deploy/run-ci.sh` with Sando running on the MakeMachine, gating Hetzner prod through testnot.work.
7 + Roadmap target: replace `server/deploy/deploy.sh` and astra-hosted `server/deploy/run-ci.sh` with Sando running on **pop-os**, gating Hetzner prod through testnot.work.
8 +
9 + **Host decision:** Sando runs on pop-os (x86_64 Ubuntu-derived, systemd). Architecturally closest to Hetzner prod, no cross-compile, no init-system split. MakeMachine and EveryCycle are now a separate project — not Sando's concern.
8 10
9 11 Phases are ordered for execution. Phase 0 must finish before Phase 1 is meaningful. Phases 5+ are post-cutover hardening.
10 12
@@ -13,11 +15,11 @@ Phases are ordered for execution. Phase 0 must finish before Phase 1 is meaningf
13 15 Read these to orient before working on Sando:
14 16
15 17 - `README.md` — quickstart, API surface, v0 limitations
16 - - `sando.toml` — current topology (MM → A → B; C declared, not provisioned)
18 + - `sando.toml` — current topology (host → A → B; C declared, not provisioned)
17 19 - `daemon/src/main.rs` — startup sequence (config → topology → migrate → sync → bare-repo bootstrap → serve)
18 20 - `daemon/src/routes.rs` — `/state`, `/promote`, `/rollback`, `/rebuild`, `/backup/fetch`, `/events`
19 21 - `daemon/src/gates.rs` — gate runners; the load-bearing logic
20 - - `daemon/src/build.rs` — `build_and_run_mm` is the MM-tier pipeline
22 + - `daemon/src/build.rs` — host-tier build pipeline
21 23 - `daemon/src/deploy.rs` — `deploy_local`; remote SSH stub
22 24 - `daemon/migrations/001_init.sql` — schema (tiers/nodes as rows)
23 25 - `server/deploy/deploy.sh` — current cross-compile + push-to-Hetzner script (what we are replacing)
@@ -26,33 +28,23 @@ Read these to orient before working on Sando:
26 28
27 29 ---
28 30
29 - ## Phase 0 — MakeMachine bootstrap
30 -
31 - Hardware and base provisioning. None of the remote-deploy work below matters until MM exists.
32 -
33 - **Platform decision: MM runs Mountaineer.** MM is the first real Mountaineer deployment and Sando is its first real sysop helper (principle 14). Hetzner prod stays on its current distro for now; the Mountaineer-for-prod question is deferred at least a year. If MM-on-Mountaineer ever blocks an MNW deploy for more than a day, fall back to Ubuntu on MM — capture the trigger in `plans/mm-platform-fallback.md` before flipping the install.
31 + ## Phase 0 — pop-os bootstrap
34 32
35 - - [ ] Purchase MakeMachine hardware per `plans/mm-hardware-bom.md` (Threadripper Pro 7975WX + WRX90D8-2L/2T + 512 GB ECC RDIMM + 2× 4 TB Gen5 NVMe; ~$10.5K including A1 GPU). Dual-use as Sando host + EveryCycle dev box — see `~/hardware/everycycle/docs/roadmap.md` for the EveryCycle side.
36 - - [ ] Install Mountaineer (ZFS root, s6+s6-rc init, nushell, podman). Use the latest Dull Edge build available, or hand-roll from `side_projects/mountaineer/` if no release has shipped yet.
37 - - [ ] Write `plans/mm-platform-fallback.md`: explicit trigger conditions for re-imaging MM with Ubuntu, plus the swap-in procedure (which env files, which binaries, which directories to preserve).
38 - - [ ] Join MM to tailnet; allocate a stable hostname and record in `_meta/infra_tailnet.md`.
39 - - [ ] Provision `sando` system user; lock down the home dir; set up scoped SSH keys for outbound deploys.
40 - - [ ] Install scratch Postgres locally on MM (via apk); create the `sando_scratch` role + DB used by `migration_dry_run`.
41 - - [ ] Write Sando's s6-rc service definition (`sandod` long-run service, dependency on tailscale and postgres, restart on failure, env from `/etc/sando/sando.env`). Contribute upstream to Alpine if the definition turns out general enough — see Mountaineer principle on giving back.
42 - - [ ] Install `sandod` binary at `/usr/local/bin/sandod`; bring up the s6 service.
43 - - [ ] Write the production `sando.toml`; bare repo path under `/srv/sando/mnw.git`; A node `testnot.work`; B node Hetzner prod. Use `node.init = "systemd"` for the Hetzner nodes (see Phase 1).
44 - - [ ] Verify MNW server builds reproducibly on Mountaineer (musl libc vs glibc — sqlx/tokio/axum should be fine but confirm before relying on it). Capture any musl-specific surprises in `plans/mm-build-notes.md`.
33 + - [x] Provision `sando` system user on pop-os; lock down home dir; generate SSH keypair at `/srv/sando/.ssh/id_ed25519` for outbound deploys.
34 + - [x] Install scratch Postgres locally on pop-os; create `sando_scratch` role + DB used by `migration_dry_run`. (Owner of own DB; non-superuser.)
35 + - [x] Write systemd unit for `sandod` (long-run service, restart on failure, env from `/etc/sando/sando.env`). Installed at `/etc/systemd/system/sandod.service`.
36 + - [x] Write the production `sando.toml`; bare repo path under `/srv/sando/mnw.git`. Installed at `/etc/sando/sando.toml`; daemon config at `/etc/sando/sando-daemon.toml`.
37 + - [x] Install `sandod` binary at `/usr/local/bin/sandod`; enable + start the service. Live on `100.103.89.95:7766`; bare repo auto-bootstrapped at `/srv/sando/mnw.git`.
38 + - [ ] Verify MNW server builds reproducibly on pop-os.
39 + - [ ] Register sando pubkey with Hetzner prod (`deploy@alpha-west-1`) and testnot.work once that node exists. Pubkey: `ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEK+vhpr1V8VnsEemN9x6tAA2S05kmv/mQ3eVgSXSkJ8 sando@pop-os`.
45 40
46 41 ## Phase 1 — Remote deploy
47 42
48 - The MVP only deploys to `ssh_target=local`. Production needs real SSH/rsync, and the init-system split (MM on s6, Hetzner on systemd) needs a backend abstraction from day one.
43 + The MVP only deploys to `ssh_target=local`. Production needs real SSH/rsync.
49 44
50 - - [ ] Add `node.init` field to `sando.toml`: `"systemd" | "s6" | "local"`. Default `"systemd"` for backwards-compat. Every node declares its init explicitly so a future Hetzner-on-Mountaineer move is a TOML edit, not a Sando code change.
51 - - [ ] Refactor `deploy.rs` around an `InitBackend` trait with `reload_or_restart(unit_name) -> Result<()>` and `unit_path(release_root, version) -> PathBuf`. Two impls: `Systemd` (shells `systemctl reload-or-restart`) and `S6` (shells `s6-svc -r` against the service dir). `Local` impl is a no-op restart for dev.
52 - - [ ] Implement `deploy::deploy_node` remote path: rsync the staged binary to `<ssh_target>:<release_root>/releases/<version>/server`, then `ssh <ssh_target>` runs `ln -sfn releases/<version> current` plus the init-backend-appropriate reload.
53 - - [ ] Settle service-name convention. Current MNW server systemd unit is `makenotwork.service`; on s6 it would be `/etc/s6-rc/sv/mnw-server/`. Capture both names + the migration plan in `plans/service-names.md` before changing anything live.
54 - - [ ] Add `node.service_name` field to `sando.toml` (default derives from tier+role) so the convention is explicit per-node and backend-agnostic.
55 - - [ ] Bootstrap script for adding a fresh node: creates `<release_root>`, installs the init-backend-appropriate service definition pointing at `<release_root>/current/server`, adds the sando SSH key to `authorized_keys`. Idempotent. One script per backend, or one script that branches on init kind.
45 + - [ ] Implement `deploy::deploy_node` remote path: rsync staged binary to `<ssh_target>:<release_root>/releases/<version>/server`, then `ssh <ssh_target>` runs `ln -sfn releases/<version> current` plus `systemctl reload-or-restart <service>`.
46 + - [ ] Add `node.service_name` to `sando.toml` (default `makenotwork.service`).
47 + - [ ] Bootstrap script for adding a fresh node: creates `<release_root>`, installs the systemd unit pointing at `<release_root>/current/server`, adds the sando SSH key to `authorized_keys`. Idempotent.
56 48 - [ ] Garbage-collect old releases on the remote: keep last N (configurable, default 5) per node. Run at end of each successful deploy.
57 49 - [ ] Handle `rsync` failure mid-deploy: leave the previous `current` symlink intact; mark `deploys.outcome = 'failed'`; do not advance `tier_state`.
58 50
@@ -62,7 +54,7 @@ The MVP only deploys to `ssh_target=local`. Production needs real SSH/rsync, and
62 54
63 55 - [ ] Confirm astra's offsite replica (per `sync-backup-offsite.sh`) writes a deterministic latest-link path Sando can rsync from. If not, add one.
64 56 - [ ] Wire the production `sando.toml` `backup.source` to the astra rsync URL.
65 - - [ ] Schedule a daily `POST /backup/fetch` (cron or systemd timer on MM) so a fresh backup is always within 24h of any promote attempt.
57 + - [ ] Schedule a daily `POST /backup/fetch` (systemd timer on pop-os) so a fresh backup is always within 24h of any promote attempt.
66 58 - [ ] First end-to-end `migration_dry_run` against a real prod backup; confirm it catches the 2026-05-22 incident class (drop+recreate column migration sequence).
67 59 - [ ] Document the failure modes: what does the operator see in `/state` when the dry-run fails? Capture in `plans/migration-dryrun-failures.md`.
68 60 - [ ] Decide retention on `backups` table — prune rows older than N days so SQLite doesn't grow forever.
@@ -77,7 +69,6 @@ Sando currently only ships the binary. `deploy.sh` does more. Inventory each pie
77 69 - [ ] **Error pages** — static HTML in `server/deploy/error-pages/`. Either bake into the binary (preferred — versions with code) or ship as a `releases/<version>/error-pages/` sibling. Capture decision.
78 70 - [ ] **Security configs** — `sshd-git.conf`, `fail2ban-sshd.conf`, `setup-firewall.sh`. Move to node-bootstrap.
79 71 - [ ] **Restart warning** — `deploy.sh send_restart_warning` posts a banner before restart. Decide whether Sando emits this and through what surface (probably the existing in-app banner mechanism).
80 - - [ ] **Cross-compile from macOS** — `deploy.sh` builds on the dev laptop via `cargo-zigbuild`. Sando builds natively on MM (x86_64 Linux). Verify the resulting binaries are byte-identical or at least behavior-equivalent across one full sprint before retiring `deploy.sh`.
81 72 - [ ] **Prod migrations** — today, who runs `sqlx migrate run` against prod? `deploy.sh` doesn't (verify). Sando should run prod migrations as part of `POST /promote/{tier}` for the prod tiers, OR there should be an explicit `POST /migrate/{tier}` operator action. Decide.
82 73
83 74 ## Phase 4 — Cutover
@@ -99,15 +90,15 @@ The TUI polls. The MVP requires you to hand-insert a row for `manual_confirm`. B
99 90 - [ ] TUI: actions pane. `p` for promote (prompts for version + tier), `R` for rollback, `b` for backup fetch, `c` for manual_confirm.
100 91 - [ ] `POST /confirm/{tier}` endpoint that inserts a `gate_runs` row with `passed=1, gate_kind='manual_confirm'` for the current pending version. Replaces the hand-SQL workaround.
101 92 - [ ] TUI live log pane that follows the most recent build / gate run; backed by `WS /events`.
102 - - [ ] `POST /promote` body should accept `version` as optional; default to the current MM version when target is A, predecessor's current when target is B+. Reduces ceremony.
93 + - [ ] `POST /promote` body should accept `version` as optional; default to the current host version when target is A, predecessor's current when target is B+. Reduces ceremony.
103 94
104 95 ## Phase 6 — Monitoring + alerting
105 96
106 - - [ ] Wire MM's `/metrics` endpoint into the existing MNW Prometheus scrape config; record where the scrape config lives in `_meta/` or wherever monitoring already runs.
97 + - [ ] Wire pop-os `/metrics` endpoint into the existing MNW Prometheus scrape config; record where the scrape config lives in `_meta/` or wherever monitoring already runs.
107 98 - [ ] Add counters: `sando_builds_total{outcome}`, `sando_gates_total{tier,kind,outcome}`, `sando_deploys_total{tier,outcome}`, `sando_burn_in_remaining_hours{tier}`.
108 99 - [ ] Alert: build failed. Page on first failure (not flap-protected — builds are infrequent).
109 100 - [ ] Alert: migration_dry_run failed. Page immediately. This is the 2026-05-22-class signal.
110 - - [ ] Alert: a tier has had `current_version` unchanged for > N days while MM is green. (Operator forgot to promote.)
101 + - [ ] Alert: a tier has had `current_version` unchanged for > N days while host is green. (Operator forgot to promote.)
111 102
112 103 ## Phase 7 — Multi-node B+C
113 104
@@ -125,7 +116,7 @@ Move Postgres off the prod app node so B+C become truly interchangeable.
125 116
126 117 - [ ] Provision Postgres-only machine D (modest spec; reliability over performance).
127 118 - [ ] Migrate the prod DB from Hetzner app node to D. Capture procedure in `plans/postgres-d-migration.md`.
128 - - [ ] Update `server` `DATABASE_URL` everywhere (env files on B+C, scratch URL on MM stays local).
119 + - [ ] Update `server` `DATABASE_URL` everywhere (env files on B+C, scratch URL on pop-os stays local).
129 120 - [ ] Replica/HA story stays deferred; D is SPOF for now (per `_meta/preclear/.../decisions.md`).
130 121
131 122 ## Phase 9 — Hardening
@@ -135,10 +126,11 @@ Pick up after cutover is stable.
135 126 - [ ] Tailnet ACL audit: confirm only the laptop can reach `sandod:7766`. Document the ACL.
136 127 - [ ] Decide if v0.2 needs token auth on `sandod` endpoints (revisit assumption from `decisions.md` once there's a real second operator).
137 128 - [ ] Sando self-deploy: Sando builds and deploys *itself* through its own pipeline. Bootstraps the bootstrap. Closes the chicken-and-egg loop and is satisfying.
138 - - [ ] Backup-of-Sando-state: nightly SQLite snapshot to astra. The state DB tracks 6 months of deploys; losing it on a MM disk failure would be annoying.
129 + - [ ] Backup-of-Sando-state: nightly SQLite snapshot to astra. The state DB tracks 6 months of deploys; losing it on a pop-os disk failure would be annoying.
139 130
140 131 ## Notes / non-checkbox
141 132
142 - - WS `/events` and the operator-UX work in Phase 5 can run in parallel with Phase 1-3 once MM exists. They are sequenced after for review clarity, not because they block anything.
133 + - WS `/events` and the operator-UX work in Phase 5 can run in parallel with Phase 1-3 once Phase 0 is done. They are sequenced after for review clarity, not because they block anything.
143 134 - "Hotfix override" and `reset_burn_in` flag are already implemented end-to-end (see `decisions.md`); not on this list because there's nothing left to do until prod uses them.
144 135 - C tier exists in the schema as a `provisioned=false` row from day one — adding C in Phase 7 is a TOML edit, not a migration.
136 + - MakeMachine + EveryCycle are now a separate project. The hardware BOM (`plans/mm-hardware-bom.md`) should move there when that project gets its own repo.
@@ -97,7 +97,7 @@ A `json_error_layer` middleware converts HTML error responses to `{"error": "...
97 97
98 98 ## Database Layer
99 99
100 - PostgreSQL via sqlx with compile-time checked queries. 50 migrations (auto-applied on boot). Connection pool: 25 max connections, 3-second acquire timeout.
100 + PostgreSQL via sqlx with compile-time checked queries. Numbered migrations in `migrations/`, auto-applied on boot; the directory is the source of truth. Connection pool: 25 max connections, 3-second acquire timeout.
101 101
102 102 ### DB Modules
103 103
@@ -229,7 +229,7 @@ HTTP client (`mt_client`) for the Multithreaded forum instance. HMAC-signed inte
229 229
230 230 - **Passwords:** Argon2id with random salt per hash
231 231 - **Sessions:** `tower-sessions` with PostgreSQL-backed store, ID regeneration on login (prevents fixation), 7-day expiry on inactivity
232 - - **Session cache:** DashMap caches recent session validations (30-second TTL) to skip per-request DB touch
232 + - **Session cache:** DashMap caches recent session validations (TTL from `constants::SESSION_TOUCH_CACHE_SECS`, currently 5s) to skip per-request DB touch
233 233 - **2FA:** TOTP (totp-rs, 6-digit, 30-second step, +/-1 skew) + WebAuthn passkeys (webauthn-rs)
234 234 - **Account lockout:** 5 failed attempts triggers 15-minute lockout
235 235 - **New-device notifications:** Email alert on login from unrecognized session
@@ -359,7 +359,7 @@ Two spawned Tokio tasks, coordinated via `watch::channel` for graceful shutdown:
359 359 | Scheduler | `src/scheduler.rs` |
360 360 | Shared types | `src/types/` |
361 361 | Askama templates | `templates/` |
362 - | Migrations | `migrations/` (001-050) |
362 + | Migrations | `migrations/` (numbered, applied in order) |
363 363 | Static assets | `static/` |
364 364 | Integration tests | `tests/` |
365 365 | Deploy scripts | `deploy/` |
@@ -0,0 +1,1192 @@
1 + # Ultra Fuzz Report — MNW Server (Run #9 — launch eve)
2 +
3 + **Run date:** 2026-05-31 (evening)
4 + **Run number:** 9 (launchplan_final.md §1.5 referred to it as "Run #5" — stale; this is the 9th)
5 + **Trigger:** launchplan §1.5 pre-launch pass
6 +
7 + ## Run #9 headline
8 +
9 + Run #8 closed with "BAR MET — ALL FIVE AXES A-". Run #9 went deeper and surfaced 1 CRITICAL + 4 SERIOUS + several MED/HIGH items the prior 8 runs missed. All four launch-critical items fixed in-session; remaining items deferred with rationale below.
10 +
11 + | Axis | Run #8 | Run #9 | Direction |
12 + |------|--------|--------|-----------|
13 + | Payments | A- | A- | flat — 2 new SERIOUS surfaced; 1 fixed (webhook unmark on dual-failure 503), 1 deferred (subscription out-of-order webhook) |
14 + | Storage | A- | A- | flat — 1 new HIGH (migration 129 dead-letter table unused) + 2 MEDs (is_s3_key_live unindexed full-scan, LIKE-suffix false-positive); deferred |
15 + | UX Wiring | A- | A- → B- → A- | dipped mid-run on grade-cap for signup TOCTOU CRITICAL, restored after fix |
16 + | Security | A- | A- | flat — 2 new SERIOUS, both fixed (JWT-bump non-atomic, 2FA email IP spoofable) |
17 + | Performance | A- | A- | flat — 2 new HIGH (per-request reqwest::Client::new in 5 hot paths, unbounded spawn in expired-account cleanup); deferred to post-launch |
18 +
19 + **Net Run #9 (post-fix):** 0 CRITICAL · 1 SERIOUS open (Payments subscription ordering — documented deferral) · 3 HIGH open (deferred) · 7 MED open (deferred). **Launchplan §1.5 A- bar holds.**
20 +
21 + ## Run #9 — CRITICAL fixed in-session
22 +
23 + ### UX-CRITICAL — Signup TOCTOU: race → 500 + form loss → FIXED 2026-05-31
24 +
25 + `src/routes/pages/public/join_wizard.rs:99-139`. The wizard ran separate `get_user_by_username` / `get_user_by_email` checks before `create_user`. A concurrent signup with the same username or email slipping between SELECT and INSERT raised a 23505 unique violation that bubbled to `AppError::Database` → 500 "Something went wrong" — and the user's entire typed-in form was lost. On a public alpha-launch surge this is the highest-traffic public endpoint; the wrong page to be returning 500s on.
26 +
27 + **Fix landed:** `create_user` call site now matches `AppError::Database(sqlx::Error::Database(_))` with code 23505, inspects the constraint name (`users_username_key` / `users_email_key`), and routes through `return_error(..)` with a friendly message — same flow as the explicit pre-check branches. Same shape as the existing 23505 handling in `db/license_keys.rs`, `db/builds.rs`, `routes/api/guest_checkout.rs`.
28 +
29 + **Known follow-up (not blocking):** the form-reload still loses typed values on the error swap; `return_error` renders `LoginErrorTemplate` (message-only). Preserving field values would require threading them through the template — file a separate Phase 4 polish item.
30 +
31 + ## Run #9 — SERIOUS fixed in-session
32 +
33 + ### Sec-SERIOUS — `delete_all_sessions_for_user` non-atomic JWT bump → FIXED 2026-05-31
34 +
35 + `src/db/sessions.rs:247-263`. The function ran `DELETE FROM user_sessions` then a separate `UPDATE users SET jwt_invalidated_at = NOW()` on independent connections. If the UPDATE dropped (pool timeout, conn drop, postgres restart), session cookies were dead but every outstanding SyncKit JWT survived until natural expiry — exactly the leak this function exists to prevent. The in-code comment ("a session row deleted without a JWT bump is harmless, the converse would leak access") inverted reality.
36 +
37 + **Fix landed:** both writes wrapped in `pool.begin()` / `tx.commit()`. Comment updated.
38 +
39 + ### Sec-SERIOUS — 2FA login-notification email uses spoofable IP → FIXED 2026-05-31
40 +
41 + `src/routes/pages/public/two_factor.rs:308-312`. The 2FA-completion path read `x-forwarded-for` raw (first-comma-split) for the new-login email's IP field. Every other login surface (`routes/auth.rs:242`, `auth.rs:486`, `auth.rs:528`) routes through `crate::helpers::extract_client_ip` which prioritizes `CF-Connecting-IP`. An attacker who already captured a password could pre-set `X-Forwarded-For: 1.2.3.4` on the verify-2fa POST so the `"new login from <city>"` email lied about origin — the exact email users are told to trust for compromise detection.
42 +
43 + **Fix landed:** swapped to `crate::helpers::extract_client_ip(&headers)`. One-line change, parity restored.
44 +
45 + ### Pay-SERIOUS — Webhook dual-failure dropped events silently → FIXED 2026-05-31
46 +
47 + `src/routes/stripe/webhook/mod.rs:73-89`. Dedup row was marked processed before handler dispatch (correct for at-least-once). On `(handler_err, insert_failed_event_err)` dual failure, code returned 503 to trigger Stripe redelivery — but Stripe's redelivery would short-circuit at the dedup check (line 50) and 200 the event without ever processing it. The code's own comment acknowledged the bug; the right tool (`unmark_event_processed`, defined 30 lines away in `db/webhook_events.rs:40`) was never called.
48 +
49 + **Fix landed:** call `db::webhook_events::unmark_event_processed(&state.db, &event_id)` before returning 503, with logged-error best-effort if even that fails (same scenario where 503 was already wrong).
50 +
51 + ## Run #9 — DEFERRED with rationale (above A- bar)
52 +
53 + ### Pay-SERIOUS — Subscription webhook out-of-order events resurrect `active`
54 +
55 + `src/routes/stripe/webhook/subscriptions.rs:90, 116, 140`. Handlers blindly overwrite `subscriptions.status` and `period_end` from the webhook payload. Stripe does NOT guarantee delivery order. If events emitted as `active → past_due` are delivered as `past_due → active(stale)`, the stale `active` overwrites a legitimate `past_due` — restoring access for a user who hasn't paid.
56 +
57 + **Deferral rationale:** worst case is restored access for a few minutes until the next webhook arrives. Fix requires re-extracting Stripe's top-level `created` from `UntypedEvent` (currently dropped) and adding `WHERE last_event_at IS NULL OR last_event_at <= $created` guards on every status/period write across Fan+, creator-tier, and synckit code paths — non-trivial cross-cutting change. Post-launch fix in Phase 4; tracked in todo.md.
58 +
59 + ### Sto-HIGH — Migration 129 dead-letter table never written
60 +
61 + `migrations/129_pending_s3_deletions_dead_letter.sql` creates `pending_s3_deletions_dead_letter` and documents it as "operator-visible parking lot... require manual triage." `src/scheduler/cleanup.rs:453-457` on `attempts >= 10` only logs `tracing::error!` then removes the row — never inserts into the dead-letter table. Permanently-failing keys have zero operator visibility.
62 +
63 + **Deferral rationale:** operational, not runtime. No user impact; only operators lose triage signal. One-INSERT fix; bundle into Phase 4.
64 +
65 + ### Perf-HIGH — Per-request `reqwest::Client::new()` in 5 hot paths
66 +
67 + `routes/pages/dashboard/main.rs:118`, `routes/pages/public/landing.rs:284`, `routes/api/internal/cli_features.rs:440`, `routes/api/domains.rs:319`, `auth.rs:559`. Each call builds a fresh TCP pool, TLS context, DNS resolver — no keep-alive across requests. `MtClient` in `AppState` already keeps a pooled client; the dashboard bypasses it.
68 +
69 + **Deferral rationale:** real, but it only matters at scale. Private alpha launch traffic is well below where this becomes a tail-latency contributor. 30-min refactor; bundle into Phase 4 once launch traffic settles.
70 +
71 + ### Perf-HIGH — Unbounded `tokio::spawn` in expired-account cleanup
72 +
73 + `src/scheduler/cleanup.rs:215-220` (`spawn_expired_account_cleanups`). Daily tick spawns one task per expired account, no governor. `cleanup_sandbox_accounts` (same file, ~100 lines above) correctly caps at `CLEANUP_PARALLELISM=4` via `JoinSet`; the terminated/content-removal variants don't. A backlog of 200 expired accounts fans out into 200 concurrent S3 prefix listings racing for the 25-conn pool at midnight.
74 +
75 + **Deferral rationale:** runs once daily; current expired-account count is small (private alpha). Trivial fix (lift the existing JoinSet pattern); not launch-blocking. Bundle with Phase 4.
76 +
77 + ## Run #9 — MED/LOW deferred (read-only carry-forward, in todo.md)
78 +
79 + - Pay-MED: `pricing.rs::parse_dollars_to_cents` misinterprets European decimal comma (`1,23` → 12300¢). User-controlled input; fixable in a single regex.
80 + - Pay-MED: SyncKit app-sub checkout silently defaults `storage_limit_bytes` to 0 if metadata missing.
81 + - Pay-MED: Guest checkout email falls back to `"unknown@guest"` sentinel; collisions possible.
82 + - Sto-MED: `is_s3_key_live` runs 7 EXISTS subqueries on unindexed `items.audio_s3_key` / `cover_s3_key` / `video_s3_key` / `versions.s3_key` etc — sequential scans per retry.
83 + - Sto-MED: `is_s3_key_live` LIKE-suffix pattern `'%' || s3_key` false-positives on neighboring keys (key `abc/file.png` matches `xabc/file.png`) — skips a legitimate delete → S3 object leaks.
84 + - UX-MED: "Log in" return_to query param in `purchase.html:145` is dead-wired — login handler always redirects `/dashboard`. Lost purchase intent.
85 + - UX-MED: Admin user filter buttons (`admin-users.html:35-44`) use `class="primary"` / `class="secondary"` instead of `btn-primary` / `btn-secondary` — renders unstyled.
86 + - UX-LOW: Pagination links in `git/issues.html:72,76` don't URL-encode `search`; `&page=99` in search query corrupts pagination.
87 + - UX-LOW: 5 sites do `.render().unwrap_or_default()` on Askama templates (blank UI on render failure, no log).
88 + - UX-LOW: `slugify` in `formatting.rs` produces `"post"` for any non-ASCII title; international creators get opaque URLs.
89 + - Sec-MINOR: `csrf.rs:176-185` `validate_token_consuming` doesn't consume — name promises stronger property than implementation.
90 + - Sec-MINOR: `routes/oauth.rs:101-111` `is_localhost_redirect` allows any port on localhost regardless of registered URI.
91 + - Sec-MINOR: `routes/pages/public/two_factor.rs::pending_2fa_started_at` reads `i64` via session.get; type mismatch silently → None → instantly-expired.
92 + - Sec-MINOR: `scanning/archive.rs:124` path-traversal check misses lone `..` segment (no trailing separator).
93 + - Perf-LOW: `scheduler/announcements.rs` linear walk through subscriber list in a single spawned task; no checkpointing.
94 + - Perf-LOW: `db/page_views.rs` `pending` HashMap has no max-cardinality cap (crawler hitting 100k unique target_ids before tick).
95 + - Perf-LOW: `build_runner.rs:441` local artifact tmpfile leaks if process crashes between SCP and `remove_file`.
96 +
97 + ## Run #9 — mandatory surprises
98 +
99 + - **Payments:** `routes/stripe/webhook/mod.rs:82-89` literally documents the bug it ships ("the dedup row was already marked processed... Stripe won't retry") and then chooses 503 anyway. The fix (`unmark_event_processed`) sat 30 lines away in the same crate, never called. Scar-tissue-comment-without-the-fix is a recognizable pattern across the codebase.
100 + - **Storage:** `routes/storage/mod.rs::commit_upload` sealed-helper pattern (Run #7 fix for the chronic disease) is the strongest piece of structural engineering in the repo — turned an enum into a witness type. But the *neighbor* file `migrations/129_pending_s3_deletions_dead_letter.sql` shows the opposite: migration written with detailed prose explaining the operator's parking lot, and the actual INSERT never wired up. Two adjacent fixes from the same audit-cycle, one structural and load-bearing, one ceremonial and silently broken.
101 + - **UX:** `csrf.rs` `PostureMethodRouter` + sealed `CsrfManuallyValidated` witness make registering a mutation route without an explicit posture declaration *uncompilable*. A+ engineering. The contrast with the signup wizard's TOCTOU-and-500-with-lost-form is jarring — defensive depth on CSRF, none on the front door.
102 + - **Security:** `routes/auth.rs:128-130` malformed-email branch skips the DUMMY_HASH timing equalizer that was added explicitly to prevent timing-side-channel user enumeration. ~2 orders of magnitude faster than every other failure path. The equalizer exists; this one path bypasses it.
103 + - **Performance:** `db/projects.rs::get_project_ids_for_user` is the only `fetch_all` in `projects.rs` without a `LIMIT`. Its neighbor `get_projects_by_user` caps at 500 with a documented safety comment. Cyber-squatter with 10k projects + account expiry → 10k S3 prefix-deletes in one spawned task. Asymmetric defense within the same module.
104 +
105 + ## Run #9 — stress-tested OK
106 +
107 + Verified attacks the code survived (high-confidence positives):
108 +
109 + - Stripe webhook signature replay (HMAC constant-time, multi-secret rotation, timestamp tolerance both directions)
110 + - Promo code concurrent over-use (single atomic UPDATE with max_uses + expires_at + starts_at)
111 + - Cart race past pre-check (23505 fallback aborts cleanly without charging)
112 + - License key prediction (6 wordlist × CSPRNG ≈ 66 bits)
113 + - Pre-signed URL Content-Length binding (S3 rejects mismatch at protocol level)
114 + - Storage cap atomicity (`try_replace_storage` single UPDATE)
115 + - Build claim race (partial unique index + 23505 backstop)
116 + - Idempotent re-confirms in all 4 upload confirm handlers (reaper-deletes-live-object closed)
117 + - Session row + JWT atomicity (post-fix verified above)
118 + - TOTP replay across skew window (matched-step tracked + strict `>` gate)
119 + - OAuth PKCE downgrade (S256 pinned at authorize + token-exchange)
120 + - CSRF body bypass via textarea-smuggled token (proper form parser)
121 + - Git diff/blame XSS (HTML-escaped in attacker-controlled spots)
122 + - Internal error leakage (tests assert no PG host, no S3 bucket, no sqlx variant leaks)
123 +
124 + ## Run #9 confidence per axis
125 +
126 + - Payments **HIGH** (~70% LoC read this pass; Phase 4 backlog visible)
127 + - Storage **HIGH** (full module read; cleanup.rs upper half only — MEDIUM there)
128 + - UX Wiring **HIGH** for CSRF/error/validation; **MEDIUM** for wizard step partials, embed routes, dashboard CSV import
129 + - Security **HIGH** for auth/CSRF/session; **MEDIUM** for scanning (YARA rule content unread), API key scoping
130 + - Performance **HIGH** for scan worker, scheduler, storage, build_runner; **MEDIUM** for SyncKit, postmark, import pipeline
131 +
132 + ## Run #9 bug counts
133 +
134 + | Severity | Payments | Storage | UX | Security | Perf | Total |
135 + |---|---|---|---|---|---|---|
136 + | CRITICAL | — | — | 1 (FIXED) | — | — | **1** |
137 + | SERIOUS | 2 (1 FIXED, 1 deferred) | — | — | 2 (FIXED) | — | **4** |
138 + | HIGH | — | 1 (deferred) | — | — | 2 (deferred) | **3** |
139 + | MED | 3 (deferred) | 2 (deferred) | 2 (deferred) | — | — | **7** |
140 + | LOW/NOTE | 2 | — | 3 | 4 | 3 | 12 |
141 +
142 + ## Run #9 delta vs Run #8
143 +
144 + - 1 CRITICAL surfaced + fixed (signup TOCTOU); class missed by prior 8 runs because no agent explicitly probed the public-signup race window
145 + - 4 SERIOUS surfaced; 3 fixed in-session, 1 deferred with rationale
146 + - Run #8 "BAR MET" claim was correct *for the surfaces it audited* but overstated the overall picture: this pass added explicit attack-vector probing for cross-conn atomicity, IP spoof parity across auth surfaces, and webhook dedup edge paths — none of which were in prior runs' scope
147 + - All previously closed Run #8 fixes verified intact (commit_upload seal, S1 tx atomicity, background.rs queue, cart MEDs)
148 +
149 + ---
150 +
151 + # Ultra Fuzz Report — MNW Server (Run #8 — historical)
152 +
153 + **Run date:** 2026-05-31
154 + **Run number:** 8
155 +
156 + ## Run #8 Headline
157 +
158 + | Axis | Run #5 | Run #6 | Run #7 | Run #8 | Direction |
159 + |------|--------|--------|--------|--------|-----------|
160 + | Payments | B | B+ | A- | **A-** | flat — H2 still deferred; 2 new MEDs surfaced (cart `min_price_cents` bypass, cart-all chain-break on all-free first seller) |
161 + | Storage | B- | A- | B+ | **A-** | ↑ H1 + S1 fixes verified closed; commit_upload seal intact across all 7 confirm handlers; genericization clean at every caller including synckit/blobs.rs |
162 + | UX Wiring | B | A- | A- | **A-** | flat — 1 new MED (item-wizard `pricing_model` silent fallback to "free" — same disease class fixed in project wizard at Run #6, not propagated) |
163 + | Security | A- | A- | A- | **A-** | flat — only diff in scope (username availability fail-closed) is a net improvement; MED backlog identical to Run #5/#6/#7 |
164 + | Performance | B- | A- | A- | **A-** | flat with 1 new SERIOUS — webhook `checkout_helpers.rs` unbounded `tokio::spawn` (send_purchase_emails / mailing_list / tip_email) competes with request handlers for the 25-slot pool under burst |
165 +
166 + **Net Run #8:** 0 CRITICAL · 1 SERIOUS new (Perf webhook spawn) — FIXED 2026-05-31 · 5 new MED — ALL FIXED 2026-05-31 · 1 SERIOUS previously-deferred (Payments H2 `claim_free_project` soft race) — FIXED 2026-05-31.
167 +
168 + **Post-Run #8 status (2026-05-31 end-of-day): 0 CRITICAL · 0 SERIOUS · 0 MED open from any prior run.** All five axes A-, all above-MED items closed, all Run #8 MEDs closed, prior-deferred SERIOUS closed. Launchplan §1.5 bar fully cleared.
169 +
170 + **2026-05-31 post-Run-#8 backlog sweep (7 waves):** 24 of 26 carried MED/LOW/NOTE items closed across Storage (5), Security (8), Performance (3), UX (2), Payments (2), Auth (4). Two deferred with rationale: `build_runner.rs` serial targets (LOW, builds run rarely, refactor touches denominator) and `scheduler/mod.rs` advisory-lock granularity (multi-replica concern, single-process today). New schema migration `133_items_duration_seconds_nonnegative.sql` pins the negative-duration invariant in the DB. New `commit_rescan` helper extends the chronic-disease commit_upload seal to admin paths. Tests: 1655 / 0.
171 +
172 + **Launchplan §1.5 bar:** **ALL 5 AXES AT A- — BAR MET.** The new Perf SERIOUS is axis-internal and the agent kept Perf at A- (machinery wins outweigh it; same shape as previously-closed `record_view` per-request spawn — apply mpsc + drainer pattern). New Payments MEDs and UX MED are launch-quality items worth addressing or documenting before ship; none are A- blockers.
173 +
174 + ## Run #8 — new findings above MED
175 +
176 + ### P-SERIOUS — Webhook hot-path unbounded `tokio::spawn` (Performance) — FIXED 2026-05-31
177 + `src/routes/stripe/webhook/checkout_helpers.rs:58, 96, 124, 290` + `src/routes/stripe/webhook/checkout.rs:618`. `send_purchase_emails`, `subscribe_buyer_to_mailing_list`, `send_tip_email`, `send_guest_sale_notification`, guest-purchase-confirmation each `tokio::spawn` from the webhook handler. Multi-item cart fires N spawns per webhook; each task acquires 1-2 pool conns + a Postmark call. No JoinSet, no cap. Under burst, hundreds of detached tasks competed with request handlers for the 25-slot pool. Same shape as the Run #4 `record_view` per-request spawn (fixed via mpsc + drainer).
178 +
179 + **Fix landed:** new generic `src/background.rs` module — `BackgroundTx` + `spawn_pool()` with bounded mpsc (capacity 1024) + semaphore-bounded concurrent execution (8 workers, well below `DB_POOL_MAX_CONNECTIONS=25`). `state.bg.spawn(name, fut)` is non-blocking; queue overflow logs a warning and drops the task. The `spawn_email!` macro was refactored to use the bg queue (covers 17 callers across auth/admin/follows/library/two_factor/stripe webhook/login flows). The 5 manual webhook `tokio::spawn` sites were also migrated. Per-request email sends from postmark issue replies (×2), guest-claim email, and join-wizard signup (×2) were migrated in the same pass — same disease, same fix.
180 +
181 + **Out of scope for this fix** (different bug shapes; defer to Phase 4 polish or own remediation): import pipeline (long-running, needs own bound), MT community creation (single outbound HTTP, minor pool pressure), creator departure notification + status broadcast (broadcast-class — use `broadcast.rs` JoinSet pattern), idempotency-store post-response (trivial DB write), build_runner (already gated by claim flow), scheduler/monitor/scanning/page_views (background workers, not per-request).
182 +
183 + ### Payments MED — Cart `min_price_cents` bypass — FIXED 2026-05-31
184 + Both cart paths (`process_seller_checkout` and `create_cart_checkout`) now check `pc.min_price_cents` for non-platform Discount codes before applying the discount. Cart skips the ineligible item (others may still qualify) rather than rejecting the whole cart — matches the existing scope-skip pattern.
185 +
186 + ### Payments MED — Cart-all chain-break on all-free first seller — FIXED 2026-05-31
187 + `process_seller_checkout` signature changed `Result<String>` → `Result<Option<String>>`; all-free path now returns `Ok(None)` instead of `Err(BadRequest)`. New `drain_to_paid` helper loops through the queued sellers until a paid one is reached (returns URL) or queue exhausted (returns `Ok(None)` → library redirect). Both callers (`create_cart_checkout_all` and `checkout_success`) updated to use it.
188 +
189 + ### UX MED — Item wizard `pricing_model` silent fallback — FIXED 2026-05-31
190 + `save_pricing` now rejects missing pricing_model with `AppError::validation("Select a pricing model")` and rejects unknown values with `format!("Unknown pricing model: {other}")`. Same shape as the project wizard Run #6 fix.
191 +
192 + ### UX MED — Inline-JS template duplication — FIXED 2026-05-31
193 + Added delegated `data-copy-link` click handler to `static/mnw.js` with proper `.catch()` (falls back to `window.prompt` in non-secure contexts — better than the silent-no-op the inline snippets shipped with). 8 templates migrated from `onclick="navigator.clipboard.writeText(...).then(...)"` to `<a href="..." data-copy-link>Copy link</a>` (audio_player, blog_post, collection, item, project, text_reader, user, video_player). `href` is the real URL so middle-click / no-JS / share menus still work. Cache-bust query bumped to `v=0531`.
194 +
195 + ### Perf MED — Cart free-claim N+1 — FIXED 2026-05-31
196 + Extended `CartItem` with `enable_license_keys` + `default_max_activations` (both cart queries pull them through). Three free-claim loops (single-seller paid path, discount-zeroed promo path, chain-flow path) drop the per-item `get_item_by_id` and replace per-item `remove_from_cart` DELETE with a single bulk `remove_from_cart_bulk(..., ANY($2))` at the end of each loop. Per-item tx for `claim_free_item` stays (the per-item claim-vs-already-purchased return value is load-bearing for sales-count increment). Roundtrips per free item dropped from ~5-7 to ~3-4; per-loop DELETEs from N to 1.
197 +
198 + ## Run #8 — verified standing (storage fixes from session)
199 +
200 + - **H1** (`uploads.rs::confirm_upload` L295-337) — three-arm match correct. Zero-rows arm rolls back (replace path = `try_replace_storage` swap-back with `i64::MAX` cap; fresh-upload path = `decrement_storage_used`), then `enqueue_s3_orphan(new_key)`, returns BadRequest "Item was modified concurrently." Returns BEFORE `commit_upload` and BEFORE `remove_pending_upload` — pending_uploads row left as reaper second-line defense.
201 + - **S1** (`media.rs::media_confirm` L241-293) — single `state.db.begin()` wraps storage credit + pending_uploads clear + media_files INSERT. S3 IO entirely outside tx. tx drop → Postgres ROLLBACK → all three writes reverted atomically. 23505 detection via typed `AppError::Database(sqlx::Error::Database(...))` pattern works post-rollback. S3 cleanup fires on every tx-failure branch.
202 + - **Genericization** — `pending_uploads::remove_pending_upload` and `media_files::create` now `impl PgExecutor<'e>`. All 12 callers (including `synckit/blobs.rs:157`) still compile and execute correctly.
203 + - **Pool pressure delta from S1 tx** — neutral-to-better. Prior code grabbed 3 separate conns serially; new code grabs 1 conn for ~3× the duration. Users-row write lock held ~ms. Per-user serialization for sub-second uploads acceptable.
204 +
205 + ## Run #8 — mandatory surprises
206 +
207 + - **Payments:** `compute_splits` more careful than its comment promises — remainder-distribution loop constrained by `expected_total = amount * raw_total_pct.min(100) / 100`, so under-100% splits keep the owner's share AND distribute floor-rounding remainders up to bound. Proptest-style invariant tests fully fence it.
208 + - **Storage:** `try_increment_storage_on` inside the tx holds a row-level lock on `users` for the duration of the tx. Not a bug (sub-ms hold; cap can't be over-shot via WHERE re-evaluation under READ COMMITTED). But every media confirm now serializes per-user against every other storage write.
209 + - **UX:** Copy-link button is a chimera. Nine templates copy the same inline `onclick` that calls `navigator.clipboard.writeText`, mutates `this.textContent` to `"Copied!"` — silently broken in any tab loaded over plain HTTP, in iframes, or with restrictive CSP. No `.catch()` → no fallback, no error.
210 + - **Security:** `routes/auth.rs:128-130` malformed-email branch skips DUMMY_HASH timing equalizer. ~2 orders of magnitude faster than every other failure path — distinguishes "you submitted an invalid-email-shaped string" from "valid email, unknown account." Real timing oracle a few lines above the equalizer that was deliberately added to prevent exactly this.
211 + - **Performance:** `metrics::idempotency_middleware` does a DB SELECT on EVERY POST/PUT with an `Idempotency-Key` header BEFORE the handler runs. No bloom filter, no negative cache. ~1 extra ms per POST already doing 2-5 DB queries — free 20%+ on POST p50 available by adding an in-memory `seen` set.
212 +
213 + ## Run #8 bug counts
214 +
215 + | Severity | Payments | Storage | UX | Security | Perf | Total |
216 + |---|---|---|---|---|---|---|
217 + | CRITICAL | — | — | — | — | — | **0** |
218 + | SERIOUS | 1 (deferred) | — | — | — | 1 (new) | **2** |
219 + | MED | 2 (new) | 7 | 5 | 8 | 5 | 27 |
220 + | LOW/NOTE | 5 | 3 | 4 | 3 | 2 | 17 |
221 +
222 + ## Run #8 confidence per axis
223 +
224 + - Payments **HIGH** (~70% LoC read)
225 + - Storage **HIGH** (full)
226 + - UX **HIGH**
227 + - Security **HIGH** (scoped); MEDIUM for storage-route auth side-effects
228 + - Performance **HIGH**
229 +
230 + ## Run #8 delta vs Run #7
231 +
232 + - **Storage B+ → A-.** H1 + S1 fixes verified closed. Genericization clean.
233 + - **Payments A- flat.** 2 new MEDs (cart `min_price_cents` bypass, cart-all chain-break) surfaced via expanded coverage; H2 deferred unchanged.
234 + - **UX A- flat.** 1 new MED (item-wizard `pricing_model` silent fallback) — same disease class as project wizard fix from Run #6, not propagated.
235 + - **Security A- flat.** Net improvement (username fail-closed). MED backlog identical.
236 + - **Performance A- flat.** 1 new SERIOUS (webhook unbounded spawn) — same shape as Run #4 `record_view` fix. Cart free-flow N+1 (MED) — Run #5 fix covered paid only.
237 +
238 + ---
239 +
240 + # Ultra Fuzz Report — MNW Server (Run #7 — historical)
241 +
242 + **Run date:** 2026-05-31
243 + **Run number:** 7 (+ S1 + Storage code-fuzz fixes confirmed in Run #8)
244 +
245 + ## Headline
246 +
247 + | Axis | Run #5 | Run #6 | Run #7 | Direction |
248 + |------|--------|--------|--------|-----------|
249 + | Payments | B | B+ | **A-** | ↑↑ Phase 2 + Run #6 + Run #7 fixes all landed; S1 cart 23505 swallow fixed post-Run #7; H2 claim_free_project soft race deferred |
250 + | Storage | B- | A- | **B+ → A- pending Run #8** | ↑/↓ commit_upload structural fix is excellent; Run #6 idempotency fix introduced HIGH-1 (pending_uploads leak in 4 sites) + HIGH-2 (missing rollback on update_*_url) — both fixed post-Run #7. Storage code-fuzz 2026-05-31 surfaced H1 (confirm_upload silent zero-rows + side-effects-already-fired) and reopened S1 media_confirm tx atomicity — both fixed in same session |
251 + | UX Wiring | B | A- | **A-** | ↑ field-aware deletion + parse_dollars_to_cents shared; pricing_model silent fallback HIGH found and fixed post-Run #7 |
252 + | Security | A- | A- | (unchanged) | flat — no security-touching changes in Runs #6/#7 |
253 + | Performance | B- | A- | (unchanged) | flat — no perf-touching changes in Runs #6/#7 |
254 +
255 + ## Post-Run #7 Storage code-fuzz (2026-05-31)
256 +
257 + Targeted code-fuzz scoped to the Storage axis to verify A- before triggering full Run #8. Two findings above MED, both fixed in-session:
258 +
259 + - **H1 (HIGH) — `routes/storage/uploads.rs::confirm_upload` silent `rows_affected = 0`.** Same shape as the just-closed HIGH-2 (`update_*_url`), one step further along the same handler family. UPDATE at L295 uses ownership-filter `WHERE id = $1 AND project_id IN (SELECT id FROM projects WHERE user_id = $4)`; `rows_affected()` was never checked. If the item was deleted between `get_item_owner` (L156) and the UPDATE, storage credit stayed incremented, `pending_uploads` got cleared a few lines down, and `commit_upload` enqueued a scan job against a ghost target — permanent S3 leak + over-charged counter. **Fix:** three-arm match on the UPDATE result; zero-rows case rolls back storage and routes the new S3 key through `enqueue_s3_orphan` so the reaper still cleans it, then returns BadRequest "Item was modified concurrently."
260 + - **S1 (SERIOUS, Run #5 plan #12 reopened) — `routes/storage/media.rs::media_confirm` three-write atomicity.** Run #5 called for wrapping `try_increment_storage` → `remove_pending_upload` → `media_files::create` in a transaction; Run #7's in-process compensation only covered in-process errors. Process interruption (panic, OOM kill, container restart) between any two writes still leaked. **Fix:** all three writes now in a single tx; tx drop rolls back storage + pending_uploads + media_files atomically. Only the S3 object needs explicit cleanup (single `delete_object` after rollback). Supporting DB-layer changes: `creator_tiers::try_increment_storage_on(&mut PgConnection)` tx-friendly variant; `pending_uploads::remove_pending_upload` and `media_files::create` signatures genericized to `impl PgExecutor<'e>` (backwards compatible).
261 +
262 + Remaining storage MED/LOW (below launchplan §1.5 A- bar; ride into Phase 4 polish or document deferral):
263 + - MED — `update_project_image_url` / `update_item_cover` ignore `rows_affected()` (same shape as H1; mitigated for current callers because the only follow-on side-effect is `bump_cache_generation`).
264 + - MED — `downloads.rs:120` `((duration as u64) * 2).max(3600)` with no DB `CHECK (duration_seconds >= 0)`. Negative duration → multi-decade presigned URL. Exploitability requires creator-controlled negative duration; ffprobe doesn't produce them. Cap in code + add CHECK migration.
265 + - MED — Admin rescan paths (`routes/admin/uploads.rs:347, 390`) call `db::scan_jobs::enqueue` directly, bypassing the `commit_upload` structural seal. Ordering is correct so no live bug; demote `db::scan_jobs::enqueue` to `pub(crate)` and expose `commit_rescan(target, ...)` to close the chronic-disease finding for real.
266 + - MED — `enqueue_s3_orphan` single-policy doc in `routes/storage/mod.rs:24-30` overstates the discipline; many `s3.delete_object(...).await.ok()` direct calls remain at pre-storage-credit rejection paths. Tighten the doc or migrate the post-storage-credit sites.
267 + - MED — `is_s3_key_live` doesn't enumerate project image URLs (project cover keys live in a distinct prefix so no current bug; surface is fragile if future code paths queue project image keys).
268 + - LOW — `scanning/worker.rs:251` inline `UPDATE media_files SET scan_status` instead of `db::scanning::update_media_file_scan_status` helper.
269 + - LOW — `routes/pages/dashboard/wizards/item/save.rs:95` `update_item_cover_image_url` updates only `cover_image_url` (not s3_key/size); client-side hidden-field abuse can desync.
270 + - LOW — `db/pending_uploads.rs::remove_pending_upload` deletes by s3_key alone (per-handler prefix validation makes cross-user collision unreachable, but the function signature is broader than it needs to be).
271 +
272 + **Chronic disease status (5th run):** The invariant-in-prose / sibling-not-swept pattern that recurred across Runs #2–#6 was **structurally addressed** in Run #7 via two helpers:
273 + - `routes/storage/mod.rs::commit_upload(target: CommitTarget, ...)` — sealed `enqueue_scan_for` to module-private; the helper is now the only handler-reachable path for scan enqueue + scan_status flip after a DB write. Bug shapes 1–3 from prior runs are now structurally impossible to introduce in a new sibling.
274 + - `crate::pricing::parse_dollars_to_cents` + `validate_dollars_f64` — canonical dollar-to-cents conversion; bypassing has historically introduced NaN→$0 and saturating-overflow silent bugs.
275 +
276 + **Net after Run #7 + S1 fix:** 0 CRITICAL · 0 HIGH/SERIOUS outstanding except 1 deferred SERIOUS (Payments H2 soft race on `claim_free_project`) · a handful of MED/LOW polish items.
277 +
278 + ---
279 +
280 + # Ultra Fuzz Report — MNW Server (Run #5 — historical)
281 +
282 + **Run date:** 2026-05-30
283 + **Run number:** 5
284 +
285 + ## Headline
286 +
287 + | Axis | Run #4 | Run #5 | Direction |
288 + |------|--------|--------|-----------|
289 + | Payments | A- | **B** | ↓ (Run #4 plan items closed; 4 new SERIOUS surfaces previously unaudited: NULL item_id refund, splits >100% overflow, tip project authorization, cart unlisted bypass) |
290 + | Storage | A- | **B-** | ↓ (Run #4 `images.rs` ordering bug closed; same disease reappeared in `uploads.rs` route gate ordering — file-type rejection runs AFTER scan enqueue) |
291 + | UX Wiring | C+ | **B** | ↑ (Run #4 CSRF patchwork + creator-tier token fixed and structurally enforced; new CRIT: field-aware validation API is dead code at template boundary) |
292 + | Security | B+ | **A-** | ↑ (Run #4 git-shell validation, lockout email flood, CSRF policy all verified; no new CRIT/HIGH; remaining gaps are operational/MED) |
293 + | Performance | B | **B-** | ↓ (Run #4 scan_jobs retention + pool permit + broadcast bounding verified; new HIGHs in previously unaudited cart checkout + page-view paths + scheduler integrity scan) |
294 +
295 + Net: 3 CRITICAL (vs Run #4: 4), 14 HIGH/SERIOUS (vs Run #4: 10), 13 MED, 10 MINOR/LOW. Two axes regressed because Run #5 reached previously-unaudited territory (Payments tip/cart/refund edges; Performance hot-path request loops) while Run #4 plan items themselves were correctly closed. The Storage regression is a *recurrence of the same shape* in a sibling handler — the chronic invariant-in-prose disease, fourth consecutive run.
296 +
297 + ## Critical / High Findings (fix before launch)
298 +
299 + 1. **[Storage — CRITICAL]** `routes/storage/uploads.rs:204-237` — `confirm_upload` calls `enqueue_scan_for(...)` and `update_item_scan_status(... Pending)` BEFORE the match arm rejects `Download`/`Insertion`/`MediaImage`/`MediaVideo` with `BadRequest`. A misrouted-but-valid `item_id` confirm flips that item's scan status to Pending, blocks `stream_url` for every fan, and leaks a scan-job row for an S3 key that's then deleted.
300 + 2. **[UX — CRITICAL]** `error.rs:216-264` + `templates/error.html` — `AppError::validation_fields(summary, [(field, msg), ...])` is consumed only by unit tests. `ErrorTemplate` has no `fields:` member; no template renders per-field highlights. Every non-HTMX validation failure degrades to the global "Go Home / Go Back" page and wipes submitted form input. Handler authors are misled into thinking their carefully-tagged field errors reach the UI.
301 + 3. **[Perf — CRITICAL]** `build_runner.rs:175-180` — Partial-failure error message reports `("{}/{} succeeded", artifact_keys.len(), artifact_keys.len() + 1)`. Denominator is always `succeeded + 1`, regardless of how many targets actually ran. Three targets, one succeeded, two failed → reports "1/2" (should be 1/3). Failed-target count is never tracked.
302 +
303 + ### HIGH / SERIOUS
304 +
305 + 4. **[Payments — SERIOUS]** `db/transactions.rs:699-716` — `refund_transaction_by_payment_intent` returns `Vec<(TransactionId, ItemId)>` (non-Optional). Project-level transactions store `item_id IS NULL` (`routes/stripe/checkout/project.rs:135`). On `charge.refunded` for a project-level purchase, sqlx fails to decode NULL → `ItemId`; webhook handler 5xx's; Stripe retries forever.
306 + 5. **[Payments — SERIOUS]** `routes/stripe/webhook/checkout_helpers.rs:240-269` — `compute_splits` comment says "Defensive clamp: a misconfigured project_members row could sum past 100%" but the loop only adds remainder pennies and never subtracts. Two members at 60% + 60% on a $10 sale are each credited $6 — $12 credited against $10 of revenue. Clamp only affects `expected_total`, never the already-computed per-member amounts. Tests cover ≤100% only.
307 + 6. **[Payments — SERIOUS]** `routes/stripe/checkout/tips.rs:104-106` — `TipForm.project_id` is taken verbatim from the form. The webhook later calls `record_tip_splits(tip.id, tip.project_id, ...)` and credits THAT project's members. An attacker tipping creator A can pass project B's UUID; B's members get split obligations credited against A's tip. Stripe money flows correctly; on-platform `tip_splits` records and any downstream reporting are corrupted.
308 + 7. **[Payments — SERIOUS]** `db/cart.rs:94-123` + `routes/stripe/checkout/cart.rs` — `item.rs:47-49` enforces "Unlisted items can only be obtained through their bundle" via `if !item.listed`. `toggle_cart_preflight` and `get_cart_items` check `is_public` but NOT `listed`. An attacker who knows an unlisted item's UUID can POST to `/api/cart/{id}/toggle` and check out via the cart flow, fully bypassing the bundle-only gate.
309 + 8. **[Payments — SERIOUS]** `routes/stripe/webhook/subscriptions.rs:117-121, 67-69, 95-96` — `status_str.parse::<SubscriptionStatus>()` returns BadRequest for any status not in `enums.rs:183-198` (Stripe's `paused` is new). Webhook handler returns Err; scheduler retries forever until status changes.
310 + 9. **[Payments — SERIOUS]** `payments/webhooks.rs:294-308` — `is_full_refund` returns true when `amount_refunded >= amount` and both are zero (Stripe sometimes emits these for $0 verification charges). Triggers `refund_transaction_by_payment_intent` with default `unknown` intent ID. Test at line 517-525 pins the behavior.
311 + 10. **[Storage — HIGH]** `routes/storage/versions.rs:159-174` — `version_confirm_upload` enqueues scan and flips `scan_status` to Pending BEFORE the `version.s3_key == req.s3_key` idempotency check at line 172. Duplicate retry of an already-confirmed upload knocks a Clean version back to Pending, breaking downloads.
312 + 11. **[Storage — HIGH]** `routes/storage/images.rs:179-208` — `project_image_confirm` replace branch is gated on `Ok(Some(old_size))` from `s3.object_size(&old_key)`. On `Err` (S3 hiccup) or `Ok(None)` (URL with no object behind it) it falls into the "no old image" branch, `try_increment_storage` without decrementing. Permanent storage over-count. Also: `update_project_image_url` runs AFTER `enqueue_deletions` of the old key, with no rollback path.
313 + 12. **[Storage — HIGH]** `routes/storage/media.rs:236-293` — `media_confirm` does three separate writes (`try_increment_storage`, `remove_pending_upload`, `media_files::create`) outside a transaction. Interruption between steps leaves S3 object orphaned with storage credit consumed and no DB row.
314 + 13. **[UX — HIGH]** `routes/pages/dashboard/wizards/item/save.rs:183-185, 214-227` — `let price_cents = (price_dollars * 100.0).round() as i32; if price_cents > 0 { validate_price_cents(price_cents)?; }`. Guard skips validation for 0 and negative values; value goes through `PriceCents::from_db` (no validation) into `update_item`. Submitting `price=-5` writes `-500` cents. Same pattern on PWYW: no `min <= suggested` check.
315 + 14. **[UX — HIGH]** `routes/pages/dashboard/wizards/item/save.rs:179-183` + `routes/api/items/bulk.rs:136-139` + `routes/pages/dashboard/wizards/project.rs:264-298` — `price_dollars: f64 = …parse()…unwrap_or(0.0)`. `"NaN".parse::<f64>()` succeeds; `NaN as i32 == 0` (silent Free). `1e20` saturates `i32::MAX`. Bulk path catches via `PriceCents::new` cap; `save.rs` does not — persists raw.
316 + 15. **[UX — HIGH]** `routes/auth.rs:356-361` — `let is_taken = db::users::get_user_by_username(...).await.map(|u| u.is_some()).unwrap_or(false);`. Transient DB error during signup live-check returns "available", misleading the user; subsequent signup races whatever real state the DB is in.
317 + 16. **[Perf — HIGH]** `routes/stripe/checkout/cart.rs:68-248` — Per cart item: sequential `has_purchased_item`, optional `remove_from_cart`, per-free-item `begin tx → claim_free_item → increment_sales_count → commit`, `get_item_by_id`, second `remove_from_cart`. 20-item cart ≈ 80 sequential roundtrips, ~20 separate transactions, 20 distinct pool acquisitions in series.
318 + 17. **[Perf — HIGH]** `db/page_views.rs:18-32` — `record_view` spawned per public request, takes a pool connection to UPSERT. With `DB_POOL_MAX_CONNECTIONS = 25`, a viral item link spawns unbounded tasks, eats the pool, times out real request handlers at acquire. No batching, no per-(target,session) debounce.
319 + 18. **[Perf — HIGH]** `scheduler/integrity.rs:53-73` — `check_sales_count_drift`: `SELECT i.id, i.sales_count, COUNT(t.id) FROM items LEFT JOIN transactions ... GROUP BY i.id HAVING i.sales_count != COUNT(t.id) LIMIT 50`. `HAVING` post-aggregation; Postgres scans every row in `items` and joins every completed transaction in history before filtering. `LIMIT 50` doesn't cap the work. Weekly multi-minute query holding a pool connection.
320 +
321 + ## Scorecard
322 +
323 + ### Axis Summary Grades
324 +
325 + | Axis | Overall | Cold Spots | Mandatory Surprise |
326 + |------|---------|------------|--------------------|
327 + | Payments | B | `routes/stripe/checkout/cart.rs` (B-), `routes/stripe/checkout/tips.rs` (B-), `db/transactions.rs` (B-), `routes/stripe/webhook/checkout_helpers.rs` (B-), `routes/stripe/webhook/subscriptions.rs` (B) | `compute_splits` carries a "Defensive clamp" comment that explicitly anticipates the >100% case and then fails to defend against it — only `expected_total` is clamped, the already-computed per-member splits go unchanged. Treat as evidence the defensive-comment culture is itself unreliable; comments and code drift independently. |
328 + | Storage | B- | `routes/storage/uploads.rs` (C+), `routes/storage/images.rs` (C+), `routes/storage/versions.rs` (C+), `routes/storage/media.rs` (B-), `db/mod.rs::check_sandbox_cap` (C+) | `stream_url` (`downloads.rs:119-122`) computes presigned expiry as `((duration as u64) * 2).max(3600)` where `duration: i32` and no DB CHECK ≥ 0 exists on `duration_seconds`. A negative value becomes near-`u64::MAX` expiry — a centuries-long presigned URL. The cast width and missing CHECK are independent latent bugs that compose into a multi-decade credential leak. |
329 + | UX Wiring | B | `routes/pages/dashboard/wizards/item/save.rs` (B-), `error.rs` (B-), `routes/pages/public/discover.rs` (B) | `update_item` takes ~13 positional `Option`s; call sites are unreadable and error-prone. The negative-price bug (HIGH #13) is born from this signature: anyone calling it has no compiler help distinguishing `Some(-500)` (bug) from `Some(500)` (intent). |
330 + | Security | A- | `helpers.rs` (B+), `scanning/clamav.rs` (B), `scanning/yara.rs` (B), `rate_limit.rs` (B+) | The "11 layer" scan pipeline test gives a false sense of coverage. ClamAV is `FailOpen` by explicit policy (`scanning/clamav.rs:19`), YARA silently skips rule files that fail to compile (`scanning/yara.rs:54-67`), and there is no startup assertion that any real AV layer is live. A misconfigured deploy can pass EICAR as Clean while the test suite is green. |
331 + | Performance | B- | `routes/stripe/checkout/cart.rs` (C), `scheduler/announcements.rs` (C+), `scheduler/integrity.rs` (C+), `scheduler/cleanup.rs` (B-), `build_runner.rs` (B-), `db/page_views.rs` (C+), `db/pending_s3_deletions.rs` (B) | The biggest scaling cliff is a 1-line `tokio::spawn` on the page-view path, not anything that "looks expensive". Hot-path response shipped its tail-latency problem to the same pool that serves it. |
332 +
333 + ## Bug Counts by Severity
334 +
335 + | Severity | Payments | Storage | UX | Security | Perf | Total |
336 + |---|---|---|---|---|---|---|
337 + | CRITICAL | — | 1 | 1 | — | 1 | **3** |
338 + | HIGH/SERIOUS | 5 | 3 | 3 | — | 3 | **14** |
339 + | MED | 2 | 3 | 2 | 4 | 2 | 13 |
340 + | MINOR/LOW | 2 | 2 | 2 | 3 | 1 | 10 |
341 +
342 + ## Cross-Cutting Concerns
343 +
344 + 1. **Side-effects-before-validation pattern.** Storage (uploads/versions/images route gates run after scan enqueue), Payments (tip `project_id` accepted before authorization, cart `listed` not checked before checkout), UX (price `from_db` after a guard that skips zero/negative). Four files, three axes, same shape: persist first, validate later.
345 + 2. **Invariant-in-prose, fourth consecutive run.** Run #2→#3 was MaybeUser; Run #3→#4 was scan_status ordering comments-vs-code; Run #4 partial fix landed (`images.rs`) but the same disease moved up a layer to `uploads.rs` (the route-level file-type gate now runs after scan enqueue). The Payments "defensive clamp" comment in `compute_splits` is the same shape on a different organ. **No type-level constructive impossibility has yet been applied to any of these.**
346 + 3. **Optional positional args as bug carriers.** `update_item`'s ~13 positional `Option`s let the wizard pass a negative price built with `PriceCents::from_db` (e.g. `Some(-500)`) past the validator. Same pattern is implicated in the UX field-error finding — `ErrorTemplate`'s struct literal is missing a `fields:` field at every callsite and the compiler doesn't care.
347 + 4. **Hot-path pool pressure from fire-and-forget writes.** `record_view` per pageview, `tokio::spawn` per cart line, scheduler advisory-lock conn pinned across S3. The 25-connection pool is sized for a quiet box; three independent fan-out patterns can each saturate it.
348 + 5. **FailOpen with no liveness assertion.** ClamAV FailOpen + YARA optional + no startup gate = a green test suite can coexist with zero real AV coverage. Same shape as the Performance "spawned task accumulates without bound" pattern — both are silent degradations the operator never sees.
349 +
350 + ## Components Successfully Stress-Tested
351 +
352 + - All Run #4 Phase 1 closures verified standing (CSRF creator-tier token, `images.rs` scan_status ordering structural fix, git-shell validation, lockout `=` predicate, promo dedupe, scanner streaming + pool permit, broadcast bounded fan-out, scan_jobs retention).
353 + - Stripe HMAC: multi-secret `v1=` rotation now accepts on any match (Run #4 polish landed).
354 + - Promo `try_increment_use_count` race-free via atomic single-row UPDATE; release path uses detach for no-double-decrement; proptest-covered.
355 + - License keys: 66-bit entropy, DB UNIQUE, `FOR UPDATE` activation, full recount on revoke (display lag only — finding #M).
356 + - CSRF posture: `CsrfRouter<S>` newtype prevents a bare `Router::route(path, post(...))` from compiling in mutation-bearing files. Verified.
357 + - Argon2id parameters + `DUMMY_HASH` timing equalization on user-not-found (login, OAuth, SyncKit).
358 + - PKCE-S256 pinned at both authorize and token endpoints; OAuth code atomic single-use consume.
359 + - JWT future-iat rejection + `jwt_invalidated_at` second-equal `<=` semantics; password change bumps `jwt_invalidated_at` via `update_user_password`.
360 + - SSE shard-guard drop-before-remove; cross-process advisory locks for scheduler ticks.
361 + - ZIP bomb: decompressed-bytes counted (not claimed); ratio + depth caps; nested magic-byte detection.
362 + - `try_increment_storage` cap-predicate UPDATE; concurrent uploads cannot both squeeze past cap.
363 +
364 + ## Confidence Per Axis
365 +
366 + - Payments **HIGH** — read 22 of 23 listed files end-to-end with targeted attacks per surface; all four SERIOUS reproducible by line-tracing.
367 + - Storage **HIGH** — CRITICAL and all three HIGHs mechanically reproducible; mandatory surprise composes two latent bugs via line-by-line read.
368 + - UX Wiring **HIGH** — full read of `csrf.rs`, `error.rs`, `markdown.rs`, `formatting.rs`, `validation/mod.rs`; spot-checked 20+ templates for CSRF pattern; CRITICAL field-aware-validation finding cross-checked by grepping `validation_fields_ref` callers.
369 + - Security **MEDIUM** — auth/CSRF/OAuth/scanning surfaces walked thoroughly; admin/moderation/reports/ssh_keys API/totp routes only sampled. ClamAV FailOpen is **policy** not bug; flagged as architectural risk.
370 + - Performance **MEDIUM-HIGH** — spot-checked DB call patterns across 15+ files; exhaustive route-level N+1 sweep deferred; stripe/webhook code shows similar `for x in &xs` loops at `checkout.rs:149,167,198,452` that were not deep-audited.
371 +
372 + ## Metrics
373 +
374 + - Modules audited: ~80
375 + - Cold spots (≤ B): 18
376 + - Bugs: 3 CRITICAL, 14 HIGH/SERIOUS, 13 MED, 10 MINOR/LOW
377 + - Axes at A- or above: 1/5 (Security)
378 +
379 + ## Delta Since Run #4
380 +
381 + **FIXED (Run #4 items not surfaced this run):**
382 + - All 10 Run #4 Phase 1 items verified closed (CSRF creator-tier, `images.rs` ordering, git-shell validation, lockout email flood, cancel_pending CSRF, promo dedupe, scanner streaming + pool permit, scan_jobs retention, broadcast bounding).
383 + - All 7 Run #4 Phase 2 items verified closed (cart template price math, media reupload race, pending_uploads reaper bump, TOTP step-replay, delete_other_sessions cache eviction, `/login` CSRF, OAuth fetch_optional).
384 + - All 5 Run #4 Phase 3 items verified closed (claim_pending_build partial index, build status reaper race, `extract_s3_key_from_url` host pinning, TOTP `pending_2fa` tracking row, KNOWN_SYNC_APPS removed entirely).
385 + - All Phase 4 polish items verified closed.
386 +
387 + **NEW CRITICAL/HIGH in Run #5 (previously unaudited or regressed):**
388 + - Storage: `uploads.rs` route-level file-type gate runs after scan enqueue (CRIT).
389 + - UX: `validation_fields` plumbing is dead code at template boundary (CRIT).
390 + - Perf: `build_runner.rs` partial-failure denominator nonsense (CRIT).
391 + - Payments: NULL `item_id` decode bomb on project-level refunds (SERIOUS).
392 + - Payments: `compute_splits` over-credits when project_members sum >100% (SERIOUS).
393 + - Payments: tip `project_id` not validated vs recipient (SERIOUS).
394 + - Payments: cart bypasses item `listed` gate (SERIOUS).
395 + - Payments: unknown subscription status retry storm (SERIOUS).
396 + - Storage: `version_confirm_upload` scan enqueue before idempotency check (HIGH).
397 + - Storage: `project_image_confirm` mis-accounts on S3 probe failure + no rollback (HIGH).
398 + - Storage: `media_confirm` non-atomic three-write sequence (HIGH).
399 + - UX: negative/NaN price acceptance via `PriceCents::from_db` after permissive guard (HIGH).
400 + - UX: username availability check fails open on DB error (HIGH).
401 + - Perf: cart checkout 80 sequential roundtrips (HIGH).
402 + - Perf: `record_view` unbounded spawn per public request (HIGH).
403 + - Perf: `check_sales_count_drift` full-table aggregate (HIGH).
404 +
405 + **CHRONIC (across Run #3 → Run #4 → Run #5):**
406 + - **Invariant-in-prose / policy-not-in-types — FOURTH consecutive run.** Run #4 partially fixed the scan_status ordering inside `images.rs` (and the CSRF policy via `CsrfRouter` structurally), but the same disease *moved up a layer*: in `uploads.rs` the route-level file-type gate now runs *after* scan enqueue. The constructive-impossibility shape needed: extract a `commit_upload(file_type, ...)` higher-level operation that validates the file_type before doing any scan/credit side effects, then make `enqueue_scan_for` + `update_*_scan_status` `pub(crate)` so handlers cannot call them directly. The Payments `compute_splits` "Defensive clamp" comment + the UX `validation_fields_ref` orphan plumbing are the same disease in different organs.
407 +
408 + **REGRESSED:**
409 + - Payments (A- → B) — four new SERIOUS bugs surfaced in previously-unaudited tip/cart/refund/subscription-status corners. Not a regression in fixed code; a regression in audit coverage.
410 + - Storage (A- → B-) — invariant-in-prose recurrence (chronic above).
411 + - Performance (B → B-) — hot-path request loops audited for the first time.
412 +
413 + ---
414 +
415 + # Plan: Restore Every Axis to A- or Higher (Run #5)
416 +
417 + **Target grades:** Payments A · Storage A · UX A- · Security A- · Performance A-.
418 +
419 + User priority for the launch window: **resolve every CRITICAL/SERIOUS/HIGH before re-running**. Iterate until audits surface only small new errors.
420 +
421 + ## Phase 1 — CRITICAL (fix today)
422 +
423 + 1. **Storage CRIT — `uploads.rs` file-type gate ordering.** `routes/storage/uploads.rs:204-237`. Move the match arm that rejects `Download`/`Insertion`/`MediaImage`/`MediaVideo` BEFORE `enqueue_scan_for` and `update_item_scan_status`. Then make `enqueue_scan_for` + `update_*_scan_status` `pub(crate)` and expose a `commit_upload(file_type, item_id, s3_key)` higher-level op that performs validation → credit → row insert → status flip in the correct order. The same constructor must serve `versions.rs` and `images.rs`. This closes the chronic invariant-in-prose finding.
424 + 2. **UX CRIT — Field-aware validation reaches the UI.** `error.rs:216-264` + `templates/error.html` + `templates/partials/form_errors.html` (new). Either (a) add `fields: Vec<(String, String)>` to `ErrorTemplate` and a `{% for f in fields %}` block in `error.html` + per-input markup; or (b) delete `validation_fields*` API entirely and replace handler callsites with `validation(summary)`. Choose (a) for non-HTMX forms that need to preserve user input; choose (b) only if every existing callsite is HTMX-only and uses OOB swaps for inline errors. Audit all `validation_fields` callers and pick a path.
425 + 3. **Perf CRIT — `build_runner.rs` partial-failure denominator.** `build_runner.rs:175-180`. Track `failed_count` alongside `artifact_keys`; report `succeeded/(succeeded+failed)`. Add a test that runs 3 targets with 2 failures and asserts "1/3" in the error string.
426 +
427 + ## Phase 2 — SERIOUS / HIGH (fix this weekend)
428 +
429 + 4. **Payments SERIOUS — NULL item_id refund decode.** `db/transactions.rs:699-716`. Change return to `Vec<(TransactionId, Option<ItemId>)>`; `refund_transaction_by_payment_intent` caller skips `decrement_sales_count`/`revoke_keys_by_transaction` when `item_id is None`. Add a fixture-based test against a project-level transaction.
430 + 5. **Payments SERIOUS — `compute_splits` over-credit.** `routes/stripe/webhook/checkout_helpers.rs:240-269`. Reject `total_split_pct > 100` at the project_members write site (DB CHECK or validation). Defensively, scale each split proportionally when sum > 100, OR clamp each split against remaining `expected_total` budget in the loop. Add a test at 60%+60%.
431 + 6. **Payments SERIOUS — Tip project authorization.** `routes/stripe/checkout/tips.rs:104-106`. After accepting `TipForm`, fetch the project and assert `project.user_id == recipient_id`; return 400 otherwise.
432 + 7. **Payments SERIOUS — Cart bypasses `listed` gate.** `db/cart.rs:94-123` and `get_cart_items`/`get_cart_items_for_seller`. Add `AND i.listed = true` to all three queries. Add a check in the per-seller checkout path. Add a regression test that toggles an unlisted item into the cart and asserts rejection.
433 + 8. **Payments SERIOUS — Unknown subscription status.** `routes/stripe/webhook/subscriptions.rs:117-121`. Replace `?` with a match: known statuses dispatch; unknown statuses `tracing::warn!` and return `StatusCode::OK` so Stripe stops retrying.
434 + 9. **Payments SERIOUS — `is_full_refund` zero-amount.** `payments/webhooks.rs:294-308`. Predicate becomes `amount > 0 && amount_refunded >= amount`. Update the test at line 517-525 to invert (zero-amount must NOT be treated as full refund).
435 + 10. **Storage HIGH — `versions.rs` enqueue-before-idempotency.** `routes/storage/versions.rs:159-174`. Move idempotency `version.s3_key == req.s3_key` check BEFORE `enqueue_scan_for`. Apply the Phase 1 `commit_upload` helper here.
436 + 11. **Storage HIGH — `project_image_confirm` probe-failure + no rollback.** `routes/storage/images.rs:179-208`. (a) On `Err` or `Ok(None)` from `s3.object_size`, fall back to the row's recorded size (add a `project_image_bytes` column if not present) rather than the "no old image" branch. (b) Move `enqueue_deletions` to AFTER `update_project_image_url` success, or wrap both in a tx with the enqueue inside.
437 + 12. **Storage HIGH — `media_confirm` non-atomic three-write.** `routes/storage/media.rs:236-293`. Wrap `try_increment_storage` → `remove_pending_upload` → `media_files::create` in a transaction. The storage credit refund must fire on any failure path.
438 + 13. **UX HIGH — Negative/NaN prices via `from_db`.** `routes/pages/dashboard/wizards/item/save.rs:183-185, 214-227`. Use `PriceCents::new(price_cents)?` unconditionally; drop the `> 0` guard. Add `min <= suggested` check on PWYW.
439 + 14. **UX HIGH — f64 price parsing accepts NaN.** Same file + `routes/api/items/bulk.rs:136-139` + `routes/pages/dashboard/wizards/project.rs:264-298`. Parse as decimal cents directly (or `Decimal::from_str_exact` from the `rust_decimal` crate already in `Cargo.lock`); reject NaN/Inf; reject negative/saturating values before cast.
440 + 15. **UX HIGH — Username live-check fails open.** `routes/auth.rs:356-361`. Propagate the DB error or treat it as "unavailable, try again" — never "available" by default.
441 + 16. **Perf HIGH — Cart checkout sequential roundtrips.** `routes/stripe/checkout/cart.rs:68-248`. Bulk-load `has_purchased_item` once with `WHERE item_id = ANY($1)`. Batch `get_item_by_id` lookups. Claim free items in a single transaction with batched inserts. Aim for ≤ 5 roundtrips for any cart size.
442 + 17. **Perf HIGH — `record_view` unbounded spawn.** `db/page_views.rs:18-32`. Replace per-request spawn with an `mpsc` channel; one background task drains every 250ms and flushes one bulk `INSERT … ON CONFLICT … DO UPDATE SET view_count = page_view_daily.view_count + EXCLUDED.view_count`.
443 + 18. **Perf HIGH — Sales drift full-table aggregate.** `scheduler/integrity.rs:53-73`. Maintain trigger-updated `transactions_completed_count` per item, or run the check off-pool against a snapshot. Short term: add `WHERE i.sales_count > 0 OR EXISTS (SELECT 1 FROM transactions WHERE item_id = i.id LIMIT 1)` to drop the LEFT JOIN's all-zero rows from the aggregate.
444 +
445 + ## Phase 3 — MED (fix before re-run if cheap)
446 +
447 + - Storage: advisory-lock leak in `check_sandbox_cap` (`db/mod.rs:92-128`) → `pg_advisory_xact_lock` or RAII guard.
448 + - Storage: `is_s3_key_live` missing tables (`db/pending_s3_deletions.rs:67-82`) → audit all s3_key-bearing columns; consider normalized `s3_objects` table.
449 + - Storage: `delete_version` owner SELECT outside tx + post-commit S3 enqueue (`db/versions.rs:267-315`) → owner SELECT inside tx; enqueue inside tx.
450 + - Security: ClamAV `FailOpen` startup assertion (`scanning/clamav.rs:19` + `scanning/mod.rs:151-164`) → refuse boot if scan configured but no AV layer live; emit `tracing::error!` after N consecutive ClamAV errors.
451 + - Security: `helpers.rs:44-50` `DefaultHasher` for advisory lock keys → stable hasher (`sha2` first 8 bytes, or `xxh3` with constant seed).
452 + - Security: OAuth `state` size cap (`routes/oauth.rs:379-386`) → reject `form.state.len() > 1024`; cap `code_challenge` at 44 base64url chars.
453 + - Security: `extract_client_ip` non-Cloudflare fallback warning (`helpers.rs:33-40`) → emit one-shot `tracing::warn!` at startup if no `CF-Connecting-IP` seen after N requests.
454 + - UX: pagination offset overflow (`routes/pages/public/discover.rs:85-87`, `routes/admin/users.rs:37-39`) → clamp `page` to `total_pages.max(1)` before arithmetic.
455 + - UX: forms render without `_csrf` when handler forgets to populate `csrf_token` → make `csrf_token` non-optional in form-bearing templates (compile-time error) or render an inline "refresh and try again" notice.
456 + - UX: `validate_username` byte-length check (`routes/auth.rs:322`) → `chars().count()`, or reorder ASCII filter before length.
457 + - Perf: scheduler advisory-lock connection pinned across S3 (`scheduler/mod.rs:92-279`) → dedicated `PgPoolOptions::new().max_connections(1)` outside the main pool.
458 + - Perf: cleanup S3 deletes serialized inside scheduler tick (`scheduler/cleanup.rs:77-100`) → `for_each_concurrent(8, ...)`; better, move user-deletion off the scheduler tick.
459 +
460 + ## Phase 4 — Polish (after re-run shows axes ≥ A-)
461 +
462 + - Payments: `has_active_subscription_to_item` period-end clause mirroring (`db/subscriptions.rs:464-470`).
463 + - Payments: `get_active_creator_tier` + `sync_user_creator_tier` period-end defense (`db/creator_tiers.rs:91-103, 181-194`).
464 + - Payments: `release_use_count` race messaging (`db/promo_codes.rs:184-200`).
465 + - Payments: License key `activation_count` recount on revoke (`db/license_keys.rs:343-382`).
466 + - Payments: Subscription minimum-charge check (`payments/checkout.rs:283-317`).
467 + - Payments: Webhook v1/v2 unmark-on-failure parity (`routes/stripe/webhook/mod.rs:48-86`).
468 + - Storage: `media_files.list_folders` scan filter (`db/media_files.rs:73-82`).
469 + - Storage: `pending_uploads.record_pending_upload` silent user-mismatch (`db/pending_uploads.rs:23-33`).
470 + - Storage: `append_log_bounded` non-atomic size cap (`build_runner.rs:516-534`).
471 + - Storage: `downloads.rs:119-122` presigned-URL expiry: cap `duration_seconds` at i64 + add DB CHECK ≥ 0.
472 + - Security: `validate_token_consuming` for OAuth POST (`routes/oauth.rs:206`).
473 + - Security: `parse_repo_path` rejects lone-dot entries (`git_ssh.rs:162`).
474 + - Security: ClamAV INSTREAM 16K cap → treat truncation as fail-closed (`scanning/clamav.rs:101-108`).
475 + - UX: validation error messages stop reflecting user input (`wizards/item/mod.rs:176-179`).
476 + - UX: CSRF body extraction stops using `from_utf8_lossy` (`csrf.rs:528-543`).
477 + - Perf: scan-pipeline 400 MiB worst-case capacity-plan note (`constants.rs:156-157`).
478 + - Perf: announcement fan-out persistence + resume (`scheduler/announcements.rs:59-89, 147-177`).
479 + - Perf: build log per-line DB roundtrip (`build_runner.rs:516-534`) → in-process running total.
480 +
481 + ## Phase 5 — Chronic (must land in Run #6 or this audit cycle has failed)
482 +
483 + **Invariant-in-prose / policy-not-in-types, fourth consecutive run.** The Phase 1 #1 fix (constructive `commit_upload` helper sealing the lower-level ops) is the only acceptable resolution. Memory notes, comments warning future authors, and renamed-helper approaches have been tried in three prior runs and recurred each time. After Phase 1 lands, audit `compute_splits` and `ErrorTemplate` for the same shape and apply the same treatment.
484 +
485 + ---
486 +
487 +
488 +
489 + ## Headline
490 +
491 + | Axis | Run #3 | Run #4 | Direction |
492 + |------|--------|--------|-----------|
493 + | Payments | A- | **A-** | flat (1 new SERIOUS: promo over-release on cart cleanup) |
494 + | Storage | B+ | **A-** | ↑ (Run #3 image-confirm rollback/race-guard fixes verified; one residual CRIT in same file) |
495 + | UX Wiring | B+ | **C+** | ↓ (CSRF policy patchwork: missing tokens + undocumented mutation in exempt prefix) |
496 + | Security | B+ | **B+** | flat (different HIGHs: git-shell repo-name validation + lockout DoS) |
497 + | Performance | B- | **B** | ↑ (Run #3 sync-FS-in-async + DashMap shard-lock + monitor split all verified; new unbounded scan_jobs/broadcast/pool-permit findings) |
498 +
499 + Net: 4 CRITICALs (vs Run #3: 2), 10 HIGH/SERIOUS (vs Run #3: 10), 22 MED, 23 MINOR/LOW. Ship-blockers are concentrated in two structural rots — CSRF policy and scan_jobs growth — not in net-new logic mistakes.
500 +
Lines truncated
@@ -76,7 +76,7 @@ Single file: `static/style.css` (~3100 lines). No preprocessor, no minification
76 76 2. **Variables** (`:root` custom properties)
77 77 3. **Reset + Base** (global element styles)
78 78 4. **Layout** (`.padded-page`, `.centered-page`, `.container`)
79 - 5. **Buttons** (`.primary`, `.secondary`, `.danger`, `.small`)
79 + 5. **Buttons** (`.btn-primary`, `.btn-secondary`, `.btn-danger`, `.small` modifier)
80 80 6. **Forms** (`.form-group`, `.form-section`, `.checkbox-group`)
81 81 7. **Tables** (`.data-table`, `.compact-table`)
82 82 8. **Utilities** (`.text-sm`, `.muted`, `.scroll-x`, etc.)
@@ -110,15 +110,7 @@ Single file: `static/style.css` (~3100 lines). No preprocessor, no minification
110 110
111 111 ### Button Variants
112 112
113 - | Class | Background | Text | Usage |
114 - |-------|-----------|------|-------|
115 - | `button` (default) | `--light-background` | `--detail` | Generic actions |
116 - | `.primary` | `--primary-dark` (black) | White | Main CTAs |
117 - | `.secondary` | `--surface-muted` | `--detail` | Alternative actions |
118 - | `.danger` | `--danger` (red) | White | Destructive actions |
119 - | `.small` | (modifier) | (modifier) | Compact size for table cells |
120 -
121 - Combine: `class="primary small"`, `class="danger small"`.
113 + Buttons use the `.btn-*` family. See `design-system.md` § Buttons for the current class names, colors, and modifiers. The bare `.primary` / `.secondary` / `.danger` classes were retired; use `.btn-primary`, `.btn-secondary`, `.btn-danger` instead.
122 114
123 115 ## HTMX Patterns
124 116
@@ -1,6 +1,6 @@
1 1 # Schema — MNW Server
2 2
3 - PostgreSQL database. 57 migrations in `migrations/`, auto-applied on boot via sqlx. Extension: `pg_trgm` (trigram fuzzy search).
3 + PostgreSQL database. Migrations live under `migrations/`, numbered and auto-applied on boot via sqlx; the directory is the source of truth. Extension: `pg_trgm` (trigram fuzzy search).
4 4
5 5 ## Domain Map
6 6
@@ -20,7 +20,6 @@ PostgreSQL database. 57 migrations in `migrations/`, auto-applied on boot via sq
20 20 | Custom Domains | 1 | Creator vanity domains |
21 21 | OAuth | 1 | PKCE authorization codes |
22 22 | Waitlist & Invites | 3 | Creator waves, waitlist, invite codes |
23 - | Content Security | 2 | Download fingerprints, streaming sessions |
24 23 | Admin | 1 | Abuse reports |
25 24 | Media | 1 | User media library (images for markdown) |
26 25 | Import | 1 | Bulk import jobs (Patreon, Ko-fi, Gumroad) |
@@ -47,7 +46,7 @@ Core accounts. Every user has one row; creator features are gated by `can_create
47 46 | stripe_account_id | TEXT | Stripe Connect account |
48 47 | stripe_onboarding_complete | BOOL | |
49 48 | can_create_projects | BOOL | Creator gate |
50 - | creator_tier | TEXT | 'basic', 'small_files', 'big_files', 'streaming' |
49 + | creator_tier | TEXT | 'basic', 'small_files', 'big_files', 'everything' |
51 50 | storage_used_bytes | BIGINT | Computed from versions + insertions |
52 51 | max_file_override_bytes | BIGINT | Per-user override |
53 52 | grandfathered_until | TIMESTAMPTZ | Grace period for existing creators |
@@ -130,7 +129,7 @@ Products/content within projects. The central commerce entity — holds pricing,
130 129 | enable_license_keys | BOOL | DRM gate |
131 130 | custom_license_text | TEXT | License shown on download |
132 131 | sales_count / play_count / download_count | INT | Denormalized counters |
133 - | web_only | BOOL | Prevents download (streaming only) |
132 + | web_only | BOOL | Publish without emailing mailing-list subscribers |
134 133
135 134 **Indexes:** project_id, is_public, sales_count, tsvector search (title+description+body), title trigram, desc trigram, (project_id, slug).
136 135 **Trigger:** `update_items_updated_at`.
@@ -463,12 +462,6 @@ Creator onboarding pipeline: waves (batches), waitlist (applications), invite co
463 462 - creator_waitlist: user_id → users CASCADE (UNIQUE), wave_id → creator_waves SET NULL
464 463 - invite_codes: creator_id → users CASCADE, code UNIQUE
465 464
466 - ### download_fingerprints
467 - Watermark tracking for paid downloads. Records fingerprint ID, IP, UA per download.
468 -
469 - ### streaming_sessions
470 - Active streaming sessions with IP binding and concurrency tracking.
471 -
472 465 ### reports
473 466 User abuse reports. Status: open → resolved/dismissed.
474 467
@@ -507,6 +500,6 @@ All trigram indexes use `gin_trgm_ops` from the `pg_trgm` extension.
507 500
508 501 ## Key Paths
509 502
510 - - `migrations/` — all 57 migration files (numbered, applied in order)
503 + - `migrations/` — numbered SQL files, applied in order
511 504 - `src/db/` — query functions grouped by domain
512 505 - `src/models/` — Rust structs matching table schemas
@@ -0,0 +1,15 @@
1 + -- Add a non-secret lookup prefix to backup_codes so verification is O(1)
2 + -- instead of one Argon2 hash per unused row.
3 + --
4 + -- The prefix is the first 4 chars of the (now 16-char) code; the full code is
5 + -- still Argon2-hashed in code_hash, so leaking code_prefix only narrows the
6 + -- offline brute-force space from 36^16 to 36^12 (~62 bits remaining secret).
7 + --
8 + -- Legacy 8-char codes have code_prefix = NULL; verify falls back to the
9 + -- iterate-all path for those rows until they're regenerated.
10 +
11 + ALTER TABLE backup_codes ADD COLUMN code_prefix VARCHAR(8) NULL;
12 +
13 + CREATE INDEX idx_backup_codes_user_prefix
14 + ON backup_codes (user_id, code_prefix)
15 + WHERE used_at IS NULL AND code_prefix IS NOT NULL;
@@ -0,0 +1,72 @@
1 + -- Outbox table for transaction-completion side effects.
2 + --
3 + -- The Stripe webhook handler flips the underlying row (transaction, tip,
4 + -- subscription) to "completed" inside one DB transaction; it ALSO inserts one
5 + -- `completion_effects` row per side-effect (bundle grant, license key, revenue
6 + -- splits, emails, etc.) in that same transaction. A background worker drains
7 + -- the outbox with FOR UPDATE SKIP LOCKED, retrying each effect independently.
8 + --
9 + -- This decouples "the sale is recorded" (atomic, exactly-once) from "all
10 + -- derived effects have run" (eventually consistent, individually retryable).
11 + -- Before this table existed, a retry of a completed webhook silently
12 + -- abandoned every effect because `complete_transaction` returned `Ok(None)`
13 + -- on the second call and the handler short-circuited.
14 +
15 + CREATE TABLE completion_effects (
16 + id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
17 +
18 + -- Polymorphic parent: exactly one of these is set per row.
19 + transaction_id UUID REFERENCES transactions(id) ON DELETE CASCADE,
20 + tip_id UUID REFERENCES tips(id) ON DELETE CASCADE,
21 + subscription_id UUID REFERENCES subscriptions(id) ON DELETE CASCADE,
22 +
23 + -- Discriminator for the dispatcher's switch.
24 + kind VARCHAR(64) NOT NULL,
25 +
26 + -- JSON-serialised inputs needed to re-execute the effect.
27 + payload JSONB NOT NULL DEFAULT '{}'::jsonb,
28 +
29 + -- pending | succeeded | failed (terminal after max_attempts)
30 + status VARCHAR(16) NOT NULL DEFAULT 'pending',
31 + attempt INT NOT NULL DEFAULT 0,
32 + last_error TEXT,
33 +
34 + scheduled_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
35 + completed_at TIMESTAMPTZ,
36 + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
37 +
38 + CHECK (
39 + (transaction_id IS NOT NULL)::int +
40 + (tip_id IS NOT NULL)::int +
41 + (subscription_id IS NOT NULL)::int = 1
42 + )
43 + );
44 +
45 + CREATE INDEX idx_completion_effects_pending
46 + ON completion_effects (scheduled_at)
47 + WHERE status = 'pending';
48 +
49 + CREATE INDEX idx_completion_effects_transaction
50 + ON completion_effects (transaction_id)
51 + WHERE transaction_id IS NOT NULL;
52 +
53 + CREATE INDEX idx_completion_effects_tip
54 + ON completion_effects (tip_id)
55 + WHERE tip_id IS NOT NULL;
56 +
57 + CREATE INDEX idx_completion_effects_subscription
58 + ON completion_effects (subscription_id)
59 + WHERE subscription_id IS NOT NULL;
60 +
61 + -- Effect-level idempotency: don't double-enqueue the same effect for the same
62 + -- parent (e.g. a duplicate webhook delivery races the first one between the
63 + -- "row already completed?" check and the outbox INSERT).
64 + CREATE UNIQUE INDEX uq_completion_effects_transaction_kind
65 + ON completion_effects (transaction_id, kind)
66 + WHERE transaction_id IS NOT NULL;
67 + CREATE UNIQUE INDEX uq_completion_effects_tip_kind
68 + ON completion_effects (tip_id, kind)
69 + WHERE tip_id IS NOT NULL;
70 + CREATE UNIQUE INDEX uq_completion_effects_subscription_kind
71 + ON completion_effects (subscription_id, kind)
72 + WHERE subscription_id IS NOT NULL;
@@ -0,0 +1,40 @@
1 + -- Extend the completion_effects outbox to accept fan_plus and creator-tier
2 + -- subscriptions as parents. These are separate tables from `subscriptions`
3 + -- (content-creator subscriptions); the original migration only modeled the
4 + -- three parent types known at the time. Each new parent gets the same
5 + -- (parent_id, kind) unique-index treatment for at-most-once enqueue.
6 +
7 + ALTER TABLE completion_effects
8 + ADD COLUMN fan_plus_subscription_id UUID
9 + REFERENCES fan_plus_subscriptions(id) ON DELETE CASCADE;
10 +
11 + ALTER TABLE completion_effects
12 + ADD COLUMN creator_subscription_id UUID
13 + REFERENCES creator_subscriptions(id) ON DELETE CASCADE;
14 +
15 + -- Replace the original 3-way exactly-one CHECK with a 5-way version.
16 + ALTER TABLE completion_effects DROP CONSTRAINT completion_effects_check;
17 + ALTER TABLE completion_effects ADD CONSTRAINT completion_effects_check
18 + CHECK (
19 + (transaction_id IS NOT NULL)::int +
20 + (tip_id IS NOT NULL)::int +
21 + (subscription_id IS NOT NULL)::int +
22 + (fan_plus_subscription_id IS NOT NULL)::int +
23 + (creator_subscription_id IS NOT NULL)::int = 1
24 + );
25 +
26 + CREATE INDEX idx_completion_effects_fan_plus_subscription
27 + ON completion_effects (fan_plus_subscription_id)
28 + WHERE fan_plus_subscription_id IS NOT NULL;
29 +
30 + CREATE INDEX idx_completion_effects_creator_subscription
31 + ON completion_effects (creator_subscription_id)
32 + WHERE creator_subscription_id IS NOT NULL;
33 +
34 + CREATE UNIQUE INDEX uq_completion_effects_fan_plus_kind
35 + ON completion_effects (fan_plus_subscription_id, kind)
36 + WHERE fan_plus_subscription_id IS NOT NULL;
37 +
38 + CREATE UNIQUE INDEX uq_completion_effects_creator_subscription_kind
39 + ON completion_effects (creator_subscription_id, kind)
40 + WHERE creator_subscription_id IS NOT NULL;
@@ -0,0 +1,11 @@
1 + -- Track the byte size of each project's cover image in the database so we
2 + -- can correctly decrement storage on replace/clear without doing an S3 HEAD.
3 + -- The HEAD-based path was racy: between two replace requests, the S3 object
4 + -- size could change underfoot (or the HEAD could fail entirely), producing
5 + -- a wrong decrement and silently corrupting `creator_tiers.storage_used`.
6 + --
7 + -- NULL means "size not recorded" — pre-migration rows fall back to a
8 + -- best-effort S3 HEAD until the next replace stores a real value.
9 +
10 + ALTER TABLE projects
11 + ADD COLUMN cover_image_size_bytes BIGINT;
@@ -0,0 +1,9 @@
1 + -- Composite index for the `list user's media filtered by scan_status` query
2 + -- path. Without it, listing a creator's clean media files scans the full
3 + -- (user_id) partition and filters in memory, which becomes a hotspot on
4 + -- accounts with thousands of media rows.
5 + --
6 + -- Run #2 audit, medium-priority.
7 +
8 + CREATE INDEX IF NOT EXISTS idx_media_files_user_scan_status
9 + ON media_files (user_id, scan_status);
@@ -0,0 +1,24 @@
1 + -- Per-app opt-in for the RFC 8252 loopback redirect wildcard.
2 + --
3 + -- Background: OAuth `redirect_uri` validation accepts ANY http://127.0.0.1:PORT/,
4 + -- [::1]:PORT/, or localhost:PORT/ regardless of an app's registered
5 + -- redirect_uris list. That's correct for native (desktop/mobile) apps that
6 + -- can't reserve known ports, but it widens the attack surface for any
7 + -- web-only app — a phishing link that sends /oauth/authorize to a loopback
8 + -- redirect_uri the attacker controls can hijack the auth flow even with PKCE
9 + -- in play (the attacker initiates the flow and supplies their own code_verifier).
10 + --
11 + -- This column gates the wildcard per app:
12 + -- - existing rows are backfilled with `true` to preserve SyncKit (today's only OAuth
13 + -- consumer, all native).
14 + -- - new rows registered via the developer UI default `false`; the app
15 + -- creator must explicitly opt in by checking "This is a desktop or
16 + -- mobile app."
17 + --
18 + -- See `routes/oauth.rs::is_localhost_redirect` and the Run #2 audit entry.
19 +
20 + ALTER TABLE sync_apps ADD COLUMN allow_loopback BOOLEAN NOT NULL DEFAULT true;
21 +
22 + -- Future-proof: change the column-level default to `false` so newly
23 + -- INSERTed rows that don't explicitly set it land safe.
24 + ALTER TABLE sync_apps ALTER COLUMN allow_loopback SET DEFAULT false;
@@ -0,0 +1,22 @@
1 + -- Migration 129: pending_s3_deletions dead-letter table
2 + --
3 + -- Audit Run #3 (Storage SERIOUS): `pending_s3_deletions.attempts` was
4 + -- incremented forever on a permanently-failing key (malformed s3_key, gone
5 + -- bucket, ACL gap), generating retry noise and crowding out real deletions.
6 + -- The dead-letter table moves rows that exceed MAX_ATTEMPTS off the hot
7 + -- queue into an operator-visible parking lot so the worker can keep making
8 + -- progress on the legitimate backlog. Rows here require manual triage.
9 +
10 + CREATE TABLE IF NOT EXISTS pending_s3_deletions_dead_letter (
11 + id UUID PRIMARY KEY,
12 + s3_key TEXT NOT NULL,
13 + bucket TEXT NOT NULL,
14 + source TEXT NOT NULL,
15 + created_at TIMESTAMPTZ NOT NULL,
16 + attempts INT NOT NULL,
17 + last_attempted_at TIMESTAMPTZ,
18 + dead_lettered_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
19 + );
20 +
21 + CREATE INDEX IF NOT EXISTS idx_pending_s3_deletions_dead_letter_dead_lettered_at
22 + ON pending_s3_deletions_dead_letter(dead_lettered_at);
@@ -0,0 +1,11 @@
1 + -- Migration 130: index user_sessions(ip_address)
2 + --
3 + -- Audit Run #3 (Perf LOW): `count_active_sandboxes_for_ip` (db/mod.rs)
4 + -- filters `user_sessions.ip_address = $1` without an index, so the abuse
5 + -- prevention cap that fires on every sandbox signup table-scans
6 + -- user_sessions. Partial index over the non-null subset since ip_address
7 + -- is nullable (legacy sessions created before IP addresses were recorded can carry NULL).
8 +
9 + CREATE INDEX IF NOT EXISTS idx_user_sessions_ip_address
10 + ON user_sessions(ip_address)
11 + WHERE ip_address IS NOT NULL;
M server/todo.md +298 -1