Skip to main content

max / makenotwork

5.5 KB · 168 lines History Blame Raw
1 //! `/robots.txt` and `/sitemap.xml` for crawler discovery.
2
3 use std::sync::{Mutex, OnceLock};
4 use std::time::{Duration, Instant};
5
6 use axum::{
7 extract::State,
8 response::{IntoResponse, Response},
9 http::header,
10 };
11 use chrono::{DateTime, Utc};
12
13 use crate::{error::Result, AppState};
14
/// How long a rendered sitemap is served from memory before it is rebuilt
/// (ten minutes).
const SITEMAP_CACHE_TTL: Duration = Duration::from_secs(10 * 60);

/// Process-wide cache of the rendered sitemap XML, stored together with the
/// instant it was generated. Crawlers request the sitemap rarely and a
/// slightly stale document is fine; without this cache, anyone hammering
/// `/sitemap.xml` would trigger two large `fetch_all` queries per request
/// and could tie up the DB pool. The slot is initialised lazily on first use.
static SITEMAP_CACHE: OnceLock<Mutex<Option<(Instant, String)>>> = OnceLock::new();
21
22 /// `/robots.txt`. Permits indexing the public surface; blocks the dashboard,
23 /// API, admin tooling, and authentication paths where indexed URLs would
24 /// be noise or actively harmful (login-page leakage, stale checkout URLs).
25 pub(super) async fn robots_txt(State(state): State<AppState>) -> impl IntoResponse {
26 let host = &state.config.host_url;
27 let body = format!(
28 "User-agent: *\n\
29 Disallow: /dashboard\n\
30 Disallow: /admin\n\
31 Disallow: /api/\n\
32 Disallow: /auth/\n\
33 Disallow: /login\n\
34 Disallow: /logout\n\
35 Disallow: /join\n\
36 Disallow: /stripe/\n\
37 Disallow: /oauth/\n\
38 Disallow: /checkout/\n\
39 Disallow: /cart\n\
40 Disallow: /library\n\
41 Disallow: /buy/\n\
42 Disallow: /purchase/\n\
43 Disallow: /download/\n\
44 Disallow: /claim\n\
45 \n\
46 Sitemap: {host}/sitemap.xml\n"
47 );
48 (
49 [(header::CONTENT_TYPE, "text/plain; charset=utf-8")],
50 body,
51 )
52 }
53
54 /// `/sitemap.xml`. Includes the public top-level pages plus active creator
55 /// profiles and their public items. Capped to keep the response bounded and
56 /// cached for SITEMAP_CACHE_TTL to absorb crawler-or-attacker hammering
57 /// without firing two large `fetch_all` queries per request.
58 pub(super) async fn sitemap_xml(State(state): State<AppState>) -> Result<Response> {
59 let cache = SITEMAP_CACHE.get_or_init(|| Mutex::new(None));
60 if let Some((generated_at, cached_xml)) = cache.lock().expect("sitemap cache mutex").as_ref()
61 && generated_at.elapsed() < SITEMAP_CACHE_TTL
62 {
63 return Ok((
64 [(header::CONTENT_TYPE, "application/xml; charset=utf-8")],
65 cached_xml.clone(),
66 )
67 .into_response());
68 }
69
70 let host = state.config.host_url.trim_end_matches('/').to_string();
71
72 // Active creator usernames (anyone with at least one public, listed,
73 // non-deleted item under a public project).
74 let creator_rows: Vec<(String,)> = sqlx::query_as(
75 r#"
76 SELECT DISTINCT u.username
77 FROM users u
78 JOIN projects p ON p.user_id = u.id
79 JOIN items i ON i.project_id = p.id
80 WHERE p.is_public = true
81 AND i.is_public = true
82 AND i.listed = true
83 AND i.deleted_at IS NULL
84 ORDER BY u.username
85 LIMIT 5000
86 "#,
87 )
88 .fetch_all(&state.db)
89 .await?;
90
91 // Public items (id + updated_at for <lastmod>).
92 let item_rows: Vec<(uuid::Uuid, DateTime<Utc>)> = sqlx::query_as(
93 r#"
94 SELECT i.id, i.updated_at
95 FROM items i
96 JOIN projects p ON i.project_id = p.id
97 WHERE p.is_public = true
98 AND i.is_public = true
99 AND i.listed = true
100 AND i.deleted_at IS NULL
101 ORDER BY i.updated_at DESC
102 LIMIT 20000
103 "#,
104 )
105 .fetch_all(&state.db)
106 .await?;
107
108 let mut xml = String::with_capacity(64 * 1024);
109 xml.push_str(r#"<?xml version="1.0" encoding="UTF-8"?>"#);
110 xml.push('\n');
111 xml.push_str(r#"<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">"#);
112 xml.push('\n');
113
114 // Top-level public pages
115 for path in [
116 "/", "/discover", "/creators", "/pricing", "/fan-plus",
117 "/policy", "/changelog", "/docs", "/use-cases", "/team",
118 ] {
119 xml.push_str(&format!(
120 " <url><loc>{host}{path}</loc></url>\n"
121 ));
122 }
123
124 for (username,) in &creator_rows {
125 xml.push_str(&format!(
126 " <url><loc>{host}/u/{}</loc></url>\n",
127 xml_escape(username),
128 ));
129 }
130
131 for (id, updated_at) in &item_rows {
132 xml.push_str(&format!(
133 " <url><loc>{host}/i/{}</loc><lastmod>{}</lastmod></url>\n",
134 id,
135 updated_at.format("%Y-%m-%d"),
136 ));
137 }
138
139 xml.push_str("</urlset>\n");
140
141 // Cache the rendered XML for the next SITEMAP_CACHE_TTL window. Multiple
142 // concurrent requests can each pass the stale check above, run the
143 // queries in parallel, then overwrite each other here — that's a
144 // self-correcting thundering-herd worth at most a few extra queries
145 // every 10 min, not worth a dedicated single-flight primitive.
146 if let Ok(mut guard) = cache.lock() {
147 *guard = Some((Instant::now(), xml.clone()));
148 }
149
150 Ok((
151 [(header::CONTENT_TYPE, "application/xml; charset=utf-8")],
152 xml,
153 )
154 .into_response())
155 }
156
/// Escapes the five XML-reserved characters (`&`, `<`, `>`, `"`, `'`) so the
/// input can be embedded as text inside an element such as `<loc>`.
///
/// Usernames are said to be restricted to alphanumerics and underscore at
/// validation time (not visible in this file), so for them this is
/// belt-and-braces. It exists so any other content routed through the sitemap
/// later is encoded safely.
fn xml_escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}
168