Skip to main content

max / makenotwork

12.0 KB · 385 lines History Blame Raw
1 //! Link preview — server-side OpenGraph metadata fetch for post URLs.
2
3 use pulldown_cmark::{Event, Parser, Tag};
4 use reqwest::header::CONTENT_TYPE;
5
/// Maximum number of URLs to extract per post.
///
/// `extract_urls` stops scanning as soon as it has collected this many
/// unique http/https links.
const MAX_URLS: usize = 3;

/// Maximum response body size to read (1 MB).
///
/// `fetch_og_metadata` truncates the downloaded body at this many bytes
/// before looking for metadata.
const MAX_BODY_SIZE: usize = 1_048_576;
11
/// Check if an IP address is private/reserved (not safe for SSRF).
///
/// Returns `true` for any address that must not be fetched on behalf of a
/// user:
///
/// * IPv4: loopback, RFC 1918 private ranges, link-local, broadcast,
///   unspecified, `0.0.0.0/8`, CGNAT `100.64.0.0/10`, benchmarking
///   `198.18.0.0/15`, multicast `224.0.0.0/4` and reserved `240.0.0.0/4`.
/// * IPv6: loopback, unspecified, unique-local `fc00::/7`, link-local
///   `fe80::/10`, multicast `ff00::/8`, and any IPv4-mapped
///   (`::ffff:a.b.c.d`) or NAT64 (`64:ff9b::/96`) address whose embedded
///   IPv4 address is itself private.
fn is_private_ip(ip: std::net::IpAddr) -> bool {
    match ip {
        std::net::IpAddr::V4(v4) => {
            let [a, b, _, _] = v4.octets();
            v4.is_loopback()
                || v4.is_private()
                || v4.is_link_local()
                || v4.is_broadcast()
                || v4.is_unspecified()
                || v4.is_multicast() // 224.0.0.0/4
                || a == 0 // 0.0.0.0/8 ("this network")
                || a >= 240 // 240.0.0.0/4 (reserved, includes broadcast)
                || (a == 100 && (b & 0xC0) == 64) // 100.64.0.0/10 (CGNAT / Tailscale)
                || (a == 198 && (b & 0xFE) == 18) // 198.18.0.0/15 (benchmarking)
        }
        std::net::IpAddr::V6(v6) => {
            let segments = v6.segments();
            // An IPv4 address can hide inside an IPv6 one; judge it by the
            // IPv4 rules so `::ffff:127.0.0.1` is treated like `127.0.0.1`.
            let embedded_v4 = v6.to_ipv4_mapped().or_else(|| match segments {
                // NAT64 well-known prefix 64:ff9b::/96.
                [0x0064, 0xff9b, 0, 0, 0, 0, hi, lo] => {
                    let [o1, o2] = hi.to_be_bytes();
                    let [o3, o4] = lo.to_be_bytes();
                    Some(std::net::Ipv4Addr::new(o1, o2, o3, o4))
                }
                _ => None,
            });
            v6.is_loopback()
                || v6.is_unspecified()
                || v6.is_multicast() // ff00::/8
                || (segments[0] & 0xfe00) == 0xfc00 // unique-local fc00::/7
                || (segments[0] & 0xffc0) == 0xfe80 // link-local fe80::/10
                || embedded_v4.is_some_and(|v4| is_private_ip(std::net::IpAddr::V4(v4)))
        }
    }
}
32
33 /// Validate that a URL is safe to fetch (no SSRF to internal networks).
34 /// Resolves the hostname to catch alternative IP encodings (octal, hex, decimal, IPv6-mapped).
35 fn validate_url(url: &str) -> bool {
36 let lower = url.to_ascii_lowercase();
37 if !lower.starts_with("http://") && !lower.starts_with("https://") {
38 return false;
39 }
40 let host_part = lower
41 .strip_prefix("http://")
42 .or_else(|| lower.strip_prefix("https://"))
43 .unwrap_or("");
44 let host_and_port = host_part.split('/').next().unwrap_or("");
45 let host = if host_and_port.starts_with('[') {
46 host_and_port
47 .split(']')
48 .next()
49 .map(|s| format!("{}]", s))
50 .unwrap_or_default()
51 } else {
52 host_and_port.split(':').next().unwrap_or("").to_string()
53 };
54 let host = host.as_str();
55
56 // Quick string-based check for common private patterns
57 if host == "localhost" || host == "0.0.0.0" {
58 return false;
59 }
60
61 // Try parsing as a raw IP address (catches octal, hex, decimal encodings)
62 let bare_host = host.trim_start_matches('[').trim_end_matches(']');
63 if let Ok(ip) = bare_host.parse::<std::net::IpAddr>() {
64 return !is_private_ip(ip);
65 }
66
67 // For hostnames, resolve and check all addresses
68 if let Ok(addrs) = std::net::ToSocketAddrs::to_socket_addrs(&(bare_host, 80)) {
69 for addr in addrs {
70 if is_private_ip(addr.ip()) {
71 return false;
72 }
73 }
74 }
75
76 true
77 }
78
79 /// Extract unique http/https URLs from markdown text via pulldown_cmark link parsing.
80 /// Returns at most `MAX_URLS` URLs.
81 pub fn extract_urls(input: &str) -> Vec<String> {
82 let parser = Parser::new(input);
83 let mut seen = std::collections::HashSet::new();
84 let mut urls = Vec::new();
85
86 for event in parser {
87 if let Event::Start(Tag::Link { dest_url, .. }) = event {
88 let url = dest_url.to_string();
89 if (url.starts_with("http://") || url.starts_with("https://"))
90 && seen.insert(url.clone())
91 {
92 urls.push(url);
93 if urls.len() >= MAX_URLS {
94 break;
95 }
96 }
97 }
98 }
99
100 urls
101 }
102
103 /// Build a reqwest client for link preview fetching with SSRF-safe redirect policy.
104 pub fn build_preview_client() -> reqwest::Client {
105 reqwest::Client::builder()
106 .redirect(reqwest::redirect::Policy::custom(|attempt| {
107 if !validate_url(attempt.url().as_str()) || attempt.previous().len() >= 5 {
108 attempt.stop()
109 } else {
110 attempt.follow()
111 }
112 }))
113 .build()
114 .expect("failed to build preview HTTP client")
115 }
116
117 /// Strategy for fetching link previews. The `Noop` variant lets tests skip
118 /// real HTTP without monkey-patching `tokio::spawn`; production constructs
119 /// `Http(build_preview_client())`.
120 #[derive(Clone)]
121 pub enum LinkPreviewFetcher {
122 Http(reqwest::Client),
123 Noop,
124 }
125
126 impl LinkPreviewFetcher {
127 pub async fn fetch(&self, url: &str) -> Option<(Option<String>, Option<String>)> {
128 match self {
129 Self::Http(client) => fetch_og_metadata(client, url).await,
130 Self::Noop => None,
131 }
132 }
133 }
134
135 /// Fetch OpenGraph metadata from a URL. Returns `(og:title, og:description)`.
136 /// Best-effort: returns None on any error (timeout, too large, parse failure).
137 #[tracing::instrument(skip_all)]
138 pub async fn fetch_og_metadata(
139 http: &reqwest::Client,
140 url: &str,
141 ) -> Option<(Option<String>, Option<String>)> {
142 if !validate_url(url) {
143 return None;
144 }
145
146 let resp = http
147 .get(url)
148 .timeout(std::time::Duration::from_secs(5))
149 .header("User-Agent", "Multithreaded/LinkPreview")
150 .send()
151 .await
152 .ok()?;
153
154 if !resp.status().is_success() {
155 return None;
156 }
157
158 // Only fetch HTML content
159 if let Some(ct) = resp.headers().get(CONTENT_TYPE) {
160 let ct_str = ct.to_str().unwrap_or("");
161 if !ct_str.starts_with("text/html") {
162 return None;
163 }
164 }
165
166 // Read body in chunks, capping at MAX_BODY_SIZE
167 let mut body = Vec::new();
168 let mut stream = resp;
169 while body.len() < MAX_BODY_SIZE {
170 let chunk = match stream.chunk().await.ok()? {
171 Some(c) => c,
172 None => break,
173 };
174 let remaining = MAX_BODY_SIZE - body.len();
175 body.extend_from_slice(&chunk[..chunk.len().min(remaining)]);
176 }
177
178 let html = String::from_utf8_lossy(&body);
179
180 let og_title = extract_og_meta(&html, "og:title");
181 let og_desc = extract_og_meta(&html, "og:description");
182
183 // Fall back to <title> tag if no og:title
184 let title = og_title.or_else(|| extract_html_title(&html));
185
186 if title.is_some() || og_desc.is_some() {
187 Some((title, og_desc))
188 } else {
189 None
190 }
191 }
192
193 /// Extract a `<meta property="..." content="...">` value from HTML.
194 fn extract_og_meta(html: &str, property: &str) -> Option<String> {
195 static OG_RE: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| {
196 regex_lite::Regex::new(
197 r#"<meta\s[^>]*?property\s*=\s*"([^"]*)"[^>]*?content\s*=\s*"([^"]*)"[^>]*?>"#,
198 )
199 .unwrap()
200 });
201 static OG_RE_REV: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| {
202 regex_lite::Regex::new(
203 r#"<meta\s[^>]*?content\s*=\s*"([^"]*)"[^>]*?property\s*=\s*"([^"]*)"[^>]*?>"#,
204 )
205 .unwrap()
206 });
207
208 // Try property-first order
209 for caps in OG_RE.captures_iter(html) {
210 if &caps[1] == property {
211 let val = caps[2].trim().to_string();
212 if !val.is_empty() {
213 return Some(val);
214 }
215 }
216 }
217 // Try content-first order (some sites put content before property)
218 for caps in OG_RE_REV.captures_iter(html) {
219 if &caps[2] == property {
220 let val = caps[1].trim().to_string();
221 if !val.is_empty() {
222 return Some(val);
223 }
224 }
225 }
226 None
227 }
228
229 /// Extract the `<title>` tag content from HTML.
230 fn extract_html_title(html: &str) -> Option<String> {
231 static TITLE_RE: std::sync::LazyLock<regex_lite::Regex> =
232 std::sync::LazyLock::new(|| regex_lite::Regex::new(r"<title[^>]*>([^<]+)</title>").unwrap());
233 TITLE_RE.captures(html).map(|c| c[1].trim().to_string())
234 }
235
// Unit tests for URL extraction, metadata parsing, URL validation and the
// no-op fetcher.
#[cfg(test)]
mod tests {
    use super::*;

    // -- extract_urls tests --

    // Inline markdown links are returned in document order.
    #[test]
    fn extract_urls_from_markdown() {
        let input = "Check [this](https://example.com) and [that](https://other.com/page).";
        let urls = extract_urls(input);
        assert_eq!(urls, vec!["https://example.com", "https://other.com/page"]);
    }

    // Links with a scheme other than http/https are ignored.
    #[test]
    fn extract_urls_skips_non_http() {
        let input = "[mail](mailto:a@b.com) [site](https://x.com)";
        let urls = extract_urls(input);
        assert_eq!(urls, vec!["https://x.com"]);
    }

    // No more than MAX_URLS links are returned.
    #[test]
    fn extract_urls_caps_at_three() {
        let input = "[a](https://1.com) [b](https://2.com) [c](https://3.com) [d](https://4.com)";
        let urls = extract_urls(input);
        assert_eq!(urls.len(), 3);
    }

    // The same destination appearing twice is reported once.
    #[test]
    fn extract_urls_deduplicates() {
        let input = "[a](https://same.com) [b](https://same.com)";
        let urls = extract_urls(input);
        assert_eq!(urls, vec!["https://same.com"]);
    }

    // Plain text without links yields an empty list.
    #[test]
    fn extract_urls_no_links() {
        let urls = extract_urls("no links here");
        assert!(urls.is_empty());
    }

    // -- extract_og_meta / extract_html_title tests --

    // `property` attribute written before `content`.
    #[test]
    fn og_meta_property_first() {
        let html = r#"<meta property="og:title" content="My Page">"#;
        assert_eq!(extract_og_meta(html, "og:title"), Some("My Page".to_string()));
    }

    // `content` attribute written before `property`.
    #[test]
    fn og_meta_content_first() {
        let html = r#"<meta content="Description here" property="og:description">"#;
        assert_eq!(
            extract_og_meta(html, "og:description"),
            Some("Description here".to_string())
        );
    }

    // A different property must not satisfy the lookup.
    #[test]
    fn og_meta_missing() {
        let html = r#"<meta property="og:image" content="img.png">"#;
        assert_eq!(extract_og_meta(html, "og:title"), None);
    }

    // The <title> element text is extracted.
    #[test]
    fn html_title_fallback() {
        let html = "<html><head><title>Page Title</title></head></html>";
        assert_eq!(extract_html_title(html), Some("Page Title".to_string()));
    }

    // No <title> element means no title.
    #[test]
    fn html_title_missing() {
        let html = "<html><head></head></html>";
        assert_eq!(extract_html_title(html), None);
    }

    // -- validate_url tests --

    // NOTE(review): the host-name cases below go through name resolution
    // inside validate_url.
    #[test]
    fn validate_url_allows_https() {
        assert!(validate_url("https://example.com"));
        assert!(validate_url("https://example.com/path?q=1"));
    }

    #[test]
    fn validate_url_allows_http() {
        assert!(validate_url("http://example.com"));
    }

    // Only http and https are acceptable schemes.
    #[test]
    fn validate_url_blocks_non_http_schemes() {
        assert!(!validate_url("ftp://example.com"));
        assert!(!validate_url("file:///etc/passwd"));
        assert!(!validate_url("javascript:alert(1)"));
        assert!(!validate_url("data:text/html,<h1>hi</h1>"));
    }

    // Loopback names and addresses, with and without a port.
    #[test]
    fn validate_url_blocks_localhost() {
        assert!(!validate_url("http://localhost"));
        assert!(!validate_url("http://localhost:8080"));
        assert!(!validate_url("http://127.0.0.1"));
        assert!(!validate_url("http://127.0.0.1:3000"));
        assert!(!validate_url("http://0.0.0.0"));
        assert!(!validate_url("http://[::1]"));
        assert!(!validate_url("http://[::1]:8080"));
    }

    // 10.0.0.0/8
    #[test]
    fn validate_url_blocks_private_10() {
        assert!(!validate_url("http://10.0.0.1"));
        assert!(!validate_url("http://10.255.255.255"));
    }

    // 192.168.0.0/16
    #[test]
    fn validate_url_blocks_private_192_168() {
        assert!(!validate_url("http://192.168.0.1"));
        assert!(!validate_url("http://192.168.1.100:8080"));
    }

    // 172.16.0.0/12, including both edges of the range.
    #[test]
    fn validate_url_blocks_private_172_16() {
        assert!(!validate_url("http://172.16.0.1"));
        assert!(!validate_url("http://172.31.255.255"));
        // 172.15 and 172.32 are public
        assert!(validate_url("http://172.15.0.1"));
        assert!(validate_url("http://172.32.0.1"));
    }

    // 169.254.0.0/16
    #[test]
    fn validate_url_blocks_link_local() {
        assert!(!validate_url("http://169.254.0.1"));
        assert!(!validate_url("http://169.254.169.254")); // AWS metadata
    }

    // Unique-local and link-local IPv6 literals.
    #[test]
    fn validate_url_blocks_ipv6_private() {
        assert!(!validate_url("http://[fd00::1]"));
        assert!(!validate_url("http://[fe80::1]"));
    }

    // Public IP literals are accepted.
    #[test]
    fn validate_url_allows_public_ips() {
        assert!(validate_url("http://8.8.8.8"));
        assert!(validate_url("https://93.184.216.34"));
    }

    // -- LinkPreviewFetcher tests --

    #[tokio::test]
    async fn noop_fetcher_returns_none_without_network() {
        let fetcher = LinkPreviewFetcher::Noop;
        // Any URL — would be a public host in production but here we expect no I/O.
        assert!(fetcher.fetch("https://example.com").await.is_none());
    }
}
385