Skip to main content

max / multithreaded

10.2 KB · 337 lines History Blame Raw
1 //! Link preview — server-side OpenGraph metadata fetch for post URLs.
2
3 use pulldown_cmark::{Event, Parser, Tag};
4 use reqwest::header::CONTENT_TYPE;
5
/// Maximum number of URLs to extract per post.
///
/// `extract_urls` stops collecting links once it has gathered this many.
const MAX_URLS: usize = 3;

/// Maximum response body size to read, in bytes (1 MiB = 1_048_576 bytes).
///
/// `fetch_og_metadata` stops buffering the response body once this many
/// bytes have been read; anything beyond the cap is ignored.
const MAX_BODY_SIZE: usize = 1_048_576;
11
/// Validate that a URL is safe to fetch (no SSRF to internal networks).
///
/// Returns `true` only when `url`:
///
/// * uses the `http` or `https` scheme (case-insensitive),
/// * contains no ASCII control characters (URL parsers silently strip tabs and
///   newlines, which would otherwise let `local\thost` slip past this check),
/// * has a non-empty host that is not `localhost` / `*.localhost`,
/// * does not name a loopback, private, link-local, unspecified, multicast,
///   broadcast or shared-address-space IP (IPv4, IPv6, or IPv4 embedded in IPv6),
/// * does not use a numeric host form that URL parsers would reinterpret as an
///   IPv4 address (`2130706433`, `0x7f.0.0.1`, `127.1`, ...).
///
/// The host is taken from the authority component: everything after the scheme
/// up to the first `/`, `\`, `?` or `#`, with any `userinfo@` prefix and `:port`
/// suffix removed.
///
/// This is a purely lexical check. It cannot see what a hostname resolves to,
/// so DNS-based attacks (a public name pointing at a private address) are out
/// of its scope.
fn validate_url(url: &str) -> bool {
    // Tabs/newlines are removed by URL parsers before host parsing; reject them
    // outright so the host we inspect is the host that would be contacted.
    if url.bytes().any(|b| b.is_ascii_control()) {
        return false;
    }

    let lower = url.to_ascii_lowercase();
    let after_scheme = match lower
        .strip_prefix("http://")
        .or_else(|| lower.strip_prefix("https://"))
    {
        Some(rest) => rest,
        None => return false,
    };

    // The authority ends at the first path, query or fragment delimiter.
    // A backslash is treated like `/` by URL parsers for http(s) URLs.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '\\' | '?' | '#'))
        .next()
        .unwrap_or("");

    // Drop `user:pass@` so the credentials are never mistaken for the host.
    let host_and_port = authority.rsplit('@').next().unwrap_or("");

    // Percent-escapes and whitespace have no place in a legitimate host and
    // could decode to something different from what is checked here.
    if host_and_port.contains(|c: char| c == '%' || c.is_whitespace()) {
        return false;
    }

    // Bracketed IPv6 literal: must be well-formed and publicly routable.
    if let Some(bracketed) = host_and_port.strip_prefix('[') {
        return bracketed
            .split_once(']')
            .and_then(|(literal, _port)| literal.parse::<std::net::Ipv6Addr>().ok())
            .is_some_and(|addr| !is_blocked_ipv6(addr));
    }

    // Strip the port and any trailing root-label dots (`localhost.`).
    let host = host_and_port
        .split(':')
        .next()
        .unwrap_or("")
        .trim_end_matches('.');

    if host.is_empty() || host == "localhost" || host.ends_with(".localhost") {
        return false;
    }

    if let Ok(addr) = host.parse::<std::net::Ipv4Addr>() {
        return !is_blocked_ipv4(addr);
    }

    // Not a strict dotted quad. If the last label is a decimal or `0x` hex
    // number, URL parsers will still interpret the host as an IPv4 address
    // (e.g. `2130706433` == 127.0.0.1), so refuse it rather than guess.
    let last_label = host.rsplit('.').next().unwrap_or("");
    let ends_in_number = last_label.bytes().all(|b| b.is_ascii_digit())
        || last_label
            .strip_prefix("0x")
            .is_some_and(|hex| hex.bytes().all(|b| b.is_ascii_hexdigit()));

    !ends_in_number
}

/// Returns `true` when `addr` must never be fetched: "this network" (0.0.0.0/8),
/// loopback, RFC 1918 private ranges, link-local, broadcast, multicast, and the
/// 100.64.0.0/10 shared address space.
fn is_blocked_ipv4(addr: std::net::Ipv4Addr) -> bool {
    let [first, second, ..] = addr.octets();
    first == 0
        || addr.is_loopback()
        || addr.is_private()
        || addr.is_link_local()
        || addr.is_broadcast()
        || addr.is_multicast()
        || (first == 100 && (second & 0xc0) == 64)
}

/// Returns `true` when `addr` must never be fetched: loopback, unspecified,
/// multicast, unique-local (fc00::/7), link-local (fe80::/10), or an
/// IPv4-mapped / IPv4-compatible address whose embedded IPv4 is blocked.
fn is_blocked_ipv6(addr: std::net::Ipv6Addr) -> bool {
    let first_segment = addr.segments()[0];
    addr.is_loopback()
        || addr.is_unspecified()
        || addr.is_multicast()
        || (first_segment & 0xfe00) == 0xfc00
        || (first_segment & 0xffc0) == 0xfe80
        || addr.to_ipv4().is_some_and(is_blocked_ipv4)
}
55
56 /// Extract unique http/https URLs from markdown text via pulldown_cmark link parsing.
57 /// Returns at most `MAX_URLS` URLs.
58 pub fn extract_urls(input: &str) -> Vec<String> {
59 let parser = Parser::new(input);
60 let mut seen = std::collections::HashSet::new();
61 let mut urls = Vec::new();
62
63 for event in parser {
64 if let Event::Start(Tag::Link { dest_url, .. }) = event {
65 let url = dest_url.to_string();
66 if (url.starts_with("http://") || url.starts_with("https://"))
67 && seen.insert(url.clone())
68 {
69 urls.push(url);
70 if urls.len() >= MAX_URLS {
71 break;
72 }
73 }
74 }
75 }
76
77 urls
78 }
79
80 /// Build a reqwest client for link preview fetching with SSRF-safe redirect policy.
81 pub fn build_preview_client() -> reqwest::Client {
82 reqwest::Client::builder()
83 .redirect(reqwest::redirect::Policy::custom(|attempt| {
84 if !validate_url(attempt.url().as_str()) || attempt.previous().len() >= 5 {
85 attempt.stop()
86 } else {
87 attempt.follow()
88 }
89 }))
90 .build()
91 .expect("failed to build preview HTTP client")
92 }
93
94 /// Fetch OpenGraph metadata from a URL. Returns `(og:title, og:description)`.
95 /// Best-effort: returns None on any error (timeout, too large, parse failure).
96 #[tracing::instrument(skip_all)]
97 pub async fn fetch_og_metadata(
98 http: &reqwest::Client,
99 url: &str,
100 ) -> Option<(Option<String>, Option<String>)> {
101 if !validate_url(url) {
102 return None;
103 }
104
105 let resp = http
106 .get(url)
107 .timeout(std::time::Duration::from_secs(5))
108 .header("User-Agent", "Multithreaded/LinkPreview")
109 .send()
110 .await
111 .ok()?;
112
113 if !resp.status().is_success() {
114 return None;
115 }
116
117 // Only fetch HTML content
118 if let Some(ct) = resp.headers().get(CONTENT_TYPE) {
119 let ct_str = ct.to_str().unwrap_or("");
120 if !ct_str.starts_with("text/html") {
121 return None;
122 }
123 }
124
125 // Read body in chunks, capping at MAX_BODY_SIZE
126 let mut body = Vec::new();
127 let mut stream = resp;
128 while body.len() < MAX_BODY_SIZE {
129 let chunk = match stream.chunk().await.ok()? {
130 Some(c) => c,
131 None => break,
132 };
133 let remaining = MAX_BODY_SIZE - body.len();
134 body.extend_from_slice(&chunk[..chunk.len().min(remaining)]);
135 }
136
137 let html = String::from_utf8_lossy(&body);
138
139 let og_title = extract_og_meta(&html, "og:title");
140 let og_desc = extract_og_meta(&html, "og:description");
141
142 // Fall back to <title> tag if no og:title
143 let title = og_title.or_else(|| extract_html_title(&html));
144
145 if title.is_some() || og_desc.is_some() {
146 Some((title, og_desc))
147 } else {
148 None
149 }
150 }
151
152 /// Extract a `<meta property="..." content="...">` value from HTML.
153 fn extract_og_meta(html: &str, property: &str) -> Option<String> {
154 static OG_RE: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| {
155 regex_lite::Regex::new(
156 r#"<meta\s[^>]*?property\s*=\s*"([^"]*)"[^>]*?content\s*=\s*"([^"]*)"[^>]*?>"#,
157 )
158 .unwrap()
159 });
160 static OG_RE_REV: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| {
161 regex_lite::Regex::new(
162 r#"<meta\s[^>]*?content\s*=\s*"([^"]*)"[^>]*?property\s*=\s*"([^"]*)"[^>]*?>"#,
163 )
164 .unwrap()
165 });
166
167 // Try property-first order
168 for caps in OG_RE.captures_iter(html) {
169 if &caps[1] == property {
170 let val = caps[2].trim().to_string();
171 if !val.is_empty() {
172 return Some(val);
173 }
174 }
175 }
176 // Try content-first order (some sites put content before property)
177 for caps in OG_RE_REV.captures_iter(html) {
178 if &caps[2] == property {
179 let val = caps[1].trim().to_string();
180 if !val.is_empty() {
181 return Some(val);
182 }
183 }
184 }
185 None
186 }
187
188 /// Extract the `<title>` tag content from HTML.
189 fn extract_html_title(html: &str) -> Option<String> {
190 static TITLE_RE: std::sync::LazyLock<regex_lite::Regex> =
191 std::sync::LazyLock::new(|| regex_lite::Regex::new(r"<title[^>]*>([^<]+)</title>").unwrap());
192 TITLE_RE.captures(html).map(|c| c[1].trim().to_string())
193 }
194
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `validate_url` accepts `url`; failures point at the caller.
    #[track_caller]
    fn assert_allowed(url: &str) {
        assert!(validate_url(url));
    }

    /// Asserts that `validate_url` rejects `url`; failures point at the caller.
    #[track_caller]
    fn assert_blocked(url: &str) {
        assert!(!validate_url(url));
    }

    // -- extract_urls tests --

    #[test]
    fn extract_urls_from_markdown() {
        let found =
            extract_urls("Check [this](https://example.com) and [that](https://other.com/page).");
        assert_eq!(found, ["https://example.com", "https://other.com/page"]);
    }

    #[test]
    fn extract_urls_skips_non_http() {
        let found = extract_urls("[mail](mailto:a@b.com) [site](https://x.com)");
        assert_eq!(found, ["https://x.com"]);
    }

    #[test]
    fn extract_urls_caps_at_three() {
        let found = extract_urls(
            "[a](https://1.com) [b](https://2.com) [c](https://3.com) [d](https://4.com)",
        );
        assert_eq!(found.len(), 3);
    }

    #[test]
    fn extract_urls_deduplicates() {
        let found = extract_urls("[a](https://same.com) [b](https://same.com)");
        assert_eq!(found, ["https://same.com"]);
    }

    #[test]
    fn extract_urls_no_links() {
        assert!(extract_urls("no links here").is_empty());
    }

    // -- OpenGraph / title extraction tests --

    #[test]
    fn og_meta_property_first() {
        let page = r#"<meta property="og:title" content="My Page">"#;
        assert_eq!(extract_og_meta(page, "og:title").as_deref(), Some("My Page"));
    }

    #[test]
    fn og_meta_content_first() {
        let page = r#"<meta content="Description here" property="og:description">"#;
        assert_eq!(
            extract_og_meta(page, "og:description").as_deref(),
            Some("Description here")
        );
    }

    #[test]
    fn og_meta_missing() {
        let page = r#"<meta property="og:image" content="img.png">"#;
        assert!(extract_og_meta(page, "og:title").is_none());
    }

    #[test]
    fn html_title_fallback() {
        let page = "<html><head><title>Page Title</title></head></html>";
        assert_eq!(extract_html_title(page).as_deref(), Some("Page Title"));
    }

    #[test]
    fn html_title_missing() {
        let page = "<html><head></head></html>";
        assert!(extract_html_title(page).is_none());
    }

    // -- validate_url tests --

    #[test]
    fn validate_url_allows_https() {
        assert_allowed("https://example.com");
        assert_allowed("https://example.com/path?q=1");
    }

    #[test]
    fn validate_url_allows_http() {
        assert_allowed("http://example.com");
    }

    #[test]
    fn validate_url_blocks_non_http_schemes() {
        assert_blocked("ftp://example.com");
        assert_blocked("file:///etc/passwd");
        assert_blocked("javascript:alert(1)");
        assert_blocked("data:text/html,<h1>hi</h1>");
    }

    #[test]
    fn validate_url_blocks_localhost() {
        assert_blocked("http://localhost");
        assert_blocked("http://localhost:8080");
        assert_blocked("http://127.0.0.1");
        assert_blocked("http://127.0.0.1:3000");
        assert_blocked("http://0.0.0.0");
        assert_blocked("http://[::1]");
        assert_blocked("http://[::1]:8080");
    }

    #[test]
    fn validate_url_blocks_private_10() {
        assert_blocked("http://10.0.0.1");
        assert_blocked("http://10.255.255.255");
    }

    #[test]
    fn validate_url_blocks_private_192_168() {
        assert_blocked("http://192.168.0.1");
        assert_blocked("http://192.168.1.100:8080");
    }

    #[test]
    fn validate_url_blocks_private_172_16() {
        assert_blocked("http://172.16.0.1");
        assert_blocked("http://172.31.255.255");
        // The neighbours just outside 172.16.0.0/12 are public.
        assert_allowed("http://172.15.0.1");
        assert_allowed("http://172.32.0.1");
    }

    #[test]
    fn validate_url_blocks_link_local() {
        assert_blocked("http://169.254.0.1");
        assert_blocked("http://169.254.169.254"); // AWS metadata
    }

    #[test]
    fn validate_url_blocks_ipv6_private() {
        assert_blocked("http://[fd00::1]");
        assert_blocked("http://[fe80::1]");
    }

    #[test]
    fn validate_url_allows_public_ips() {
        assert_allowed("http://8.8.8.8");
        assert_allowed("https://93.184.216.34");
    }
}
337