//! `/robots.txt` and `/sitemap.xml` for crawler discovery.

use std::sync::{Mutex, OnceLock};
use std::time::{Duration, Instant};

use axum::{
    extract::State,
    response::{IntoResponse, Response},
    http::header,
};
use chrono::{DateTime, Utc};

use crate::{error::Result, AppState};

/// In-memory cache for the rendered sitemap XML. Crawlers hit this rarely
/// and the same response is fine for ~10 min; without the cache an attacker
/// hammering `/sitemap.xml` could pin the DB pool on two unbounded
/// `fetch_all` queries per request.
const SITEMAP_CACHE_TTL: Duration = Duration::from_secs(600);

/// Lazily-initialised cache slot: `None` until the first render, afterwards
/// the instant the XML was generated together with the rendered document.
static SITEMAP_CACHE: OnceLock<Mutex<Option<(Instant, String)>>> = OnceLock::new();

/// `/robots.txt`. Permits indexing the public surface; blocks the dashboard,
/// API, admin tooling, and authentication paths where indexed URLs would
/// be noise or actively harmful (login-page leakage, stale checkout URLs).
///
/// The `Sitemap:` line is built from the configured host URL with any
/// trailing `/` removed, matching how `/sitemap.xml` builds its own URLs, so
/// a host configured as `https://example.com/` does not advertise
/// `https://example.com//sitemap.xml`.
pub(super) async fn robots_txt(State(state): State<AppState>) -> impl IntoResponse {
    // Normalise the same way the sitemap handler does.
    let host = state.config.host_url.trim_end_matches('/');
    let body = format!(
        "User-agent: *\n\
         Disallow: /dashboard\n\
         Disallow: /admin\n\
         Disallow: /api/\n\
         Disallow: /auth/\n\
         Disallow: /login\n\
         Disallow: /logout\n\
         Disallow: /join\n\
         Disallow: /stripe/\n\
         Disallow: /oauth/\n\
         Disallow: /checkout/\n\
         Disallow: /cart\n\
         Disallow: /library\n\
         Disallow: /buy/\n\
         Disallow: /purchase/\n\
         Disallow: /download/\n\
         Disallow: /claim\n\
         \n\
         Sitemap: {host}/sitemap.xml\n"
    );
    (
        [(header::CONTENT_TYPE, "text/plain; charset=utf-8")],
        body,
    )
}

/// `/sitemap.xml`. Includes the public top-level pages plus active creator
/// profiles and their public items. Capped to keep the response bounded and
/// cached for SITEMAP_CACHE_TTL to absorb crawler-or-attacker hammering
/// without firing two large `fetch_all` queries per request.
pub(super) async fn sitemap_xml(State(state): State) -> Result { let cache = SITEMAP_CACHE.get_or_init(|| Mutex::new(None)); if let Some((generated_at, cached_xml)) = cache.lock().expect("sitemap cache mutex").as_ref() && generated_at.elapsed() < SITEMAP_CACHE_TTL { return Ok(( [(header::CONTENT_TYPE, "application/xml; charset=utf-8")], cached_xml.clone(), ) .into_response()); } let host = state.config.host_url.trim_end_matches('/').to_string(); // Active creator usernames (anyone with at least one public, listed, // non-deleted item under a public project). let creator_rows: Vec<(String,)> = sqlx::query_as( r#" SELECT DISTINCT u.username FROM users u JOIN projects p ON p.user_id = u.id JOIN items i ON i.project_id = p.id WHERE p.is_public = true AND i.is_public = true AND i.listed = true AND i.deleted_at IS NULL ORDER BY u.username LIMIT 5000 "#, ) .fetch_all(&state.db) .await?; // Public items (id + updated_at for ). let item_rows: Vec<(uuid::Uuid, DateTime)> = sqlx::query_as( r#" SELECT i.id, i.updated_at FROM items i JOIN projects p ON i.project_id = p.id WHERE p.is_public = true AND i.is_public = true AND i.listed = true AND i.deleted_at IS NULL ORDER BY i.updated_at DESC LIMIT 20000 "#, ) .fetch_all(&state.db) .await?; let mut xml = String::with_capacity(64 * 1024); xml.push_str(r#""#); xml.push('\n'); xml.push_str(r#""#); xml.push('\n'); // Top-level public pages for path in [ "/", "/discover", "/creators", "/pricing", "/fan-plus", "/policy", "/changelog", "/docs", "/use-cases", "/team", ] { xml.push_str(&format!( " {host}{path}\n" )); } for (username,) in &creator_rows { xml.push_str(&format!( " {host}/u/{}\n", xml_escape(username), )); } for (id, updated_at) in &item_rows { xml.push_str(&format!( " {host}/i/{}{}\n", id, updated_at.format("%Y-%m-%d"), )); } xml.push_str("\n"); // Cache the rendered XML for the next SITEMAP_CACHE_TTL window. 
Multiple // concurrent requests can each pass the stale check above, run the // queries in parallel, then overwrite each other here — that's a // self-correcting thundering-herd worth at most a few extra queries // every 10 min, not worth a dedicated single-flight primitive. if let Ok(mut guard) = cache.lock() { *guard = Some((Instant::now(), xml.clone())); } Ok(( [(header::CONTENT_TYPE, "application/xml; charset=utf-8")], xml, ) .into_response()) } /// Minimal XML escape for the handful of characters that matter inside /// `` text. Usernames are already alphanumeric+underscore at validation /// time, so this is belt-and-braces — but the encoder must exist if any /// future content gets piped through. fn xml_escape(s: &str) -> String { s.replace('&', "&") .replace('<', "<") .replace('>', ">") .replace('"', """) .replace('\'', "'") }