max / multithreaded

10.2 KB · 337 lines History Blame Raw

1	//! Link preview — server-side OpenGraph metadata fetch for post URLs.
2
3	use pulldown_cmark::{Event, Parser, Tag};
4	use reqwest::header::CONTENT_TYPE;
5
/// Maximum number of URLs to extract per post.
///
/// `extract_urls` stops collecting as soon as it has gathered this many
/// unique http/https links.
const MAX_URLS: usize = 3;

/// Maximum response body size to read (1 MB, i.e. 1_048_576 bytes).
///
/// `fetch_og_metadata` buffers at most this many bytes of a response before
/// it starts looking for metadata; anything beyond the cap is not read.
const MAX_BODY_SIZE: usize = 1_048_576;
11
/// Validate that a URL is safe to fetch (no SSRF to internal networks).
///
/// Returns `true` only when `url` uses the `http` or `https` scheme and its
/// host is not recognisably internal. The following are rejected:
///
/// * any scheme other than `http://` / `https://` (case-insensitive);
/// * `localhost` and `*.localhost`, with or without a trailing dot;
/// * IPv4 literals that are loopback, private, link-local, unspecified,
///   broadcast, in `0.0.0.0/8`, or in the shared range `100.64.0.0/10`;
/// * non-canonical numeric hosts (`127.1`, `2130706433`, `0x7f.0.0.1`), which
///   WHATWG-style URL parsers interpret as IPv4 addresses;
/// * IPv6 literals that are loopback, unspecified, unique-local (`fc00::/7`),
///   link-local (`fe80::/10`), or that embed a blocked IPv4 address;
/// * bracketed hosts that are not a valid IPv6 literal, empty hosts, and
///   hosts containing whitespace, control characters or `%`.
///
/// Userinfo (`user:pass@`) is stripped before the host is inspected, and the
/// authority is cut at the first `/`, `\`, `?` or `#`, so those parts cannot
/// be used to hide the real host.
///
/// This is a purely textual check: it performs no DNS resolution, so a public
/// name that resolves to an internal address is not detected here.
fn validate_url(url: &str) -> bool {
    let lower = url.to_ascii_lowercase();
    let after_scheme = match lower
        .strip_prefix("http://")
        .or_else(|| lower.strip_prefix("https://"))
    {
        Some(rest) => rest,
        None => return false,
    };

    // The authority ends at the first path, query or fragment delimiter.
    // A backslash is included because URL parsers treat it like `/` for
    // http(s) URLs.
    let authority = after_scheme
        .split(|c: char| matches!(c, '/' | '\\' | '?' | '#'))
        .next()
        .unwrap_or("");
    // Whitespace/control characters may be stripped by the real URL parser,
    // which would make it see a different host than we do; refuse them.
    if authority
        .chars()
        .any(|c| c.is_ascii_whitespace() || c.is_ascii_control())
    {
        return false;
    }

    // Everything up to the last `@` is userinfo, not the host.
    let host_and_port = authority.rsplit('@').next().unwrap_or(authority);

    // Bracketed IPv6 literal: parse it instead of matching on prefixes.
    if let Some(bracketed) = host_and_port.strip_prefix('[') {
        let literal = bracketed.split(']').next().unwrap_or("");
        return match literal.parse::<std::net::Ipv6Addr>() {
            Ok(addr) => !is_internal_ipv6(&addr),
            Err(_) => false,
        };
    }

    // Drop the port and any trailing root-label dot (`localhost.`).
    let host = host_and_port
        .split(':')
        .next()
        .unwrap_or("")
        .trim_end_matches('.');
    if host.is_empty() || host.contains('%') {
        return false;
    }
    if host == "localhost" || host.ends_with(".localhost") {
        return false;
    }

    if let Ok(addr) = host.parse::<std::net::Ipv4Addr>() {
        return !is_internal_ipv4(addr);
    }

    // A host whose last label is a number (decimal or `0x` hex) is an IPv4
    // shorthand rather than a DNS name. Canonical dotted quads were handled
    // above, so anything numeric that reaches this point is refused.
    let last_label = host.rsplit('.').next().unwrap_or("");
    let all_digits = !last_label.is_empty() && last_label.bytes().all(|b| b.is_ascii_digit());
    let hex_number = last_label
        .strip_prefix("0x")
        .map_or(false, |digits| digits.bytes().all(|b| b.is_ascii_hexdigit()));
    if all_digits || hex_number {
        return false;
    }

    // Host names that merely begin with a private-range prefix are still
    // refused, exactly as the original textual filter did.
    const PRIVATE_PREFIXES: [&str; 4] = ["10.", "127.", "192.168.", "169.254."];
    if PRIVATE_PREFIXES
        .iter()
        .any(|prefix| host.starts_with(*prefix))
    {
        return false;
    }
    // Block names starting with 172.16. through 172.31. (172.16.0.0/12).
    if let Some(rest) = host.strip_prefix("172.") {
        let second = rest.split('.').next().unwrap_or("");
        if matches!(second.parse::<u8>(), Ok(16..=31)) {
            return false;
        }
    }

    true
}

/// Returns `true` when `addr` is an IPv4 address that must not be fetched.
fn is_internal_ipv4(addr: std::net::Ipv4Addr) -> bool {
    let octets = addr.octets();
    addr.is_loopback()
        || addr.is_private()
        || addr.is_link_local()
        || addr.is_unspecified()
        || addr.is_broadcast()
        // 0.0.0.0/8 ("this network").
        || octets[0] == 0
        // 100.64.0.0/10 (shared address space).
        || (octets[0] == 100 && (octets[1] & 0xC0) == 0x40)
}

/// Returns `true` when `addr` is an IPv6 address that must not be fetched,
/// including IPv4-mapped/compatible forms of a blocked IPv4 address.
fn is_internal_ipv6(addr: &std::net::Ipv6Addr) -> bool {
    let first = addr.segments()[0];
    if addr.is_loopback()
        || addr.is_unspecified()
        // fc00::/7 unique-local.
        || (first & 0xfe00) == 0xfc00
        // fe80::/10 link-local.
        || (first & 0xffc0) == 0xfe80
    {
        return true;
    }
    match addr.to_ipv4() {
        Some(v4) => is_internal_ipv4(v4),
        None => false,
    }
}
55
56	/// Extract unique http/https URLs from markdown text via pulldown_cmark link parsing.
57	/// Returns at most `MAX_URLS` URLs.
58	pub fn extract_urls(input: &str) -> Vec<String> {
59	let parser = Parser::new(input);
60	let mut seen = std::collections::HashSet::new();
61	let mut urls = Vec::new();
62
63	for event in parser {
64	if let Event::Start(Tag::Link { dest_url, .. }) = event {
65	let url = dest_url.to_string();
66	if (url.starts_with("http://") \|\| url.starts_with("https://"))
67	&& seen.insert(url.clone())
68	{
69	urls.push(url);
70	if urls.len() >= MAX_URLS {
71	break;
72	}
73	}
74	}
75	}
76
77	urls
78	}
79
80	/// Build a reqwest client for link preview fetching with SSRF-safe redirect policy.
81	pub fn build_preview_client() -> reqwest::Client {
82	reqwest::Client::builder()
83	.redirect(reqwest::redirect::Policy::custom(\|attempt\| {
84	if !validate_url(attempt.url().as_str()) \|\| attempt.previous().len() >= 5 {
85	attempt.stop()
86	} else {
87	attempt.follow()
88	}
89	}))
90	.build()
91	.expect("failed to build preview HTTP client")
92	}
93
94	/// Fetch OpenGraph metadata from a URL. Returns `(og:title, og:description)`.
95	/// Best-effort: returns None on any error (timeout, too large, parse failure).
96	#[tracing::instrument(skip_all)]
97	pub async fn fetch_og_metadata(
98	http: &reqwest::Client,
99	url: &str,
100	) -> Option<(Option<String>, Option<String>)> {
101	if !validate_url(url) {
102	return None;
103	}
104
105	let resp = http
106	.get(url)
107	.timeout(std::time::Duration::from_secs(5))
108	.header("User-Agent", "Multithreaded/LinkPreview")
109	.send()
110	.await
111	.ok()?;
112
113	if !resp.status().is_success() {
114	return None;
115	}
116
117	// Only fetch HTML content
118	if let Some(ct) = resp.headers().get(CONTENT_TYPE) {
119	let ct_str = ct.to_str().unwrap_or("");
120	if !ct_str.starts_with("text/html") {
121	return None;
122	}
123	}
124
125	// Read body in chunks, capping at MAX_BODY_SIZE
126	let mut body = Vec::new();
127	let mut stream = resp;
128	while body.len() < MAX_BODY_SIZE {
129	let chunk = match stream.chunk().await.ok()? {
130	Some(c) => c,
131	None => break,
132	};
133	let remaining = MAX_BODY_SIZE - body.len();
134	body.extend_from_slice(&chunk[..chunk.len().min(remaining)]);
135	}
136
137	let html = String::from_utf8_lossy(&body);
138
139	let og_title = extract_og_meta(&html, "og:title");
140	let og_desc = extract_og_meta(&html, "og:description");
141
142	// Fall back to <title> tag if no og:title
143	let title = og_title.or_else(\|\| extract_html_title(&html));
144
145	if title.is_some() \|\| og_desc.is_some() {
146	Some((title, og_desc))
147	} else {
148	None
149	}
150	}
151
152	/// Extract a `<meta property="..." content="...">` value from HTML.
153	fn extract_og_meta(html: &str, property: &str) -> Option<String> {
154	static OG_RE: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(\|\| {
155	regex_lite::Regex::new(
156	r#"<meta\s[^>]?property\s=\s"([^"])"[^>]?content\s=\s"([^"])"[^>]*?>"#,
157	)
158	.unwrap()
159	});
160	static OG_RE_REV: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(\|\| {
161	regex_lite::Regex::new(
162	r#"<meta\s[^>]?content\s=\s"([^"])"[^>]?property\s=\s"([^"])"[^>]*?>"#,
163	)
164	.unwrap()
165	});
166
167	// Try property-first order
168	for caps in OG_RE.captures_iter(html) {
169	if &caps[1] == property {
170	let val = caps[2].trim().to_string();
171	if !val.is_empty() {
172	return Some(val);
173	}
174	}
175	}
176	// Try content-first order (some sites put content before property)
177	for caps in OG_RE_REV.captures_iter(html) {
178	if &caps[2] == property {
179	let val = caps[1].trim().to_string();
180	if !val.is_empty() {
181	return Some(val);
182	}
183	}
184	}
185	None
186	}
187
188	/// Extract the `<title>` tag content from HTML.
189	fn extract_html_title(html: &str) -> Option<String> {
190	static TITLE_RE: std::sync::LazyLock<regex_lite::Regex> =
191	std::sync::LazyLock::new(\|\| regex_lite::Regex::new(r"<title[^>]*>([^<]+)</title>").unwrap());
192	TITLE_RE.captures(html).map(\|c\| c[1].trim().to_string())
193	}
194
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `validate_url` refuses every URL in `urls`.
    fn assert_rejected(urls: &[&str]) {
        for url in urls {
            assert!(!validate_url(url), "expected `{url}` to be rejected");
        }
    }

    /// Assert that `validate_url` accepts every URL in `urls`.
    fn assert_accepted(urls: &[&str]) {
        for url in urls {
            assert!(validate_url(url), "expected `{url}` to be accepted");
        }
    }

    // -- extract_urls tests --

    #[test]
    fn extract_urls_from_markdown() {
        let found =
            extract_urls("Check [this](https://example.com) and [that](https://other.com/page).");
        assert_eq!(found, ["https://example.com", "https://other.com/page"]);
    }

    #[test]
    fn extract_urls_skips_non_http() {
        let found = extract_urls("[mail](mailto:a@b.com) [site](https://x.com)");
        assert_eq!(found, ["https://x.com"]);
    }

    #[test]
    fn extract_urls_caps_at_three() {
        let found = extract_urls(
            "[a](https://1.com) [b](https://2.com) [c](https://3.com) [d](https://4.com)",
        );
        assert_eq!(found.len(), 3);
    }

    #[test]
    fn extract_urls_deduplicates() {
        let found = extract_urls("[a](https://same.com) [b](https://same.com)");
        assert_eq!(found, ["https://same.com"]);
    }

    #[test]
    fn extract_urls_no_links() {
        assert!(extract_urls("no links here").is_empty());
    }

    // -- metadata extraction tests --

    #[test]
    fn og_meta_property_first() {
        let page = r#"<meta property="og:title" content="My Page">"#;
        let expected = Some("My Page".to_string());
        assert_eq!(extract_og_meta(page, "og:title"), expected);
    }

    #[test]
    fn og_meta_content_first() {
        let page = r#"<meta content="Description here" property="og:description">"#;
        let expected = Some("Description here".to_string());
        assert_eq!(extract_og_meta(page, "og:description"), expected);
    }

    #[test]
    fn og_meta_missing() {
        let page = r#"<meta property="og:image" content="img.png">"#;
        assert_eq!(extract_og_meta(page, "og:title"), None);
    }

    #[test]
    fn html_title_fallback() {
        let page = "<html><head><title>Page Title</title></head></html>";
        assert_eq!(extract_html_title(page), Some("Page Title".to_string()));
    }

    #[test]
    fn html_title_missing() {
        let page = "<html><head></head></html>";
        assert_eq!(extract_html_title(page), None);
    }

    // -- validate_url tests --

    #[test]
    fn validate_url_allows_https() {
        assert_accepted(&["https://example.com", "https://example.com/path?q=1"]);
    }

    #[test]
    fn validate_url_allows_http() {
        assert_accepted(&["http://example.com"]);
    }

    #[test]
    fn validate_url_blocks_non_http_schemes() {
        assert_rejected(&[
            "ftp://example.com",
            "file:///etc/passwd",
            "javascript:alert(1)",
            "data:text/html,<h1>hi</h1>",
        ]);
    }

    #[test]
    fn validate_url_blocks_localhost() {
        assert_rejected(&[
            "http://localhost",
            "http://localhost:8080",
            "http://127.0.0.1",
            "http://127.0.0.1:3000",
            "http://0.0.0.0",
            "http://[::1]",
            "http://[::1]:8080",
        ]);
    }

    #[test]
    fn validate_url_blocks_private_10() {
        assert_rejected(&["http://10.0.0.1", "http://10.255.255.255"]);
    }

    #[test]
    fn validate_url_blocks_private_192_168() {
        assert_rejected(&["http://192.168.0.1", "http://192.168.1.100:8080"]);
    }

    #[test]
    fn validate_url_blocks_private_172_16() {
        assert_rejected(&["http://172.16.0.1", "http://172.31.255.255"]);
        // 172.15 and 172.32 sit just outside 172.16.0.0/12 and are public.
        assert_accepted(&["http://172.15.0.1", "http://172.32.0.1"]);
    }

    #[test]
    fn validate_url_blocks_link_local() {
        assert_rejected(&[
            "http://169.254.0.1",
            "http://169.254.169.254", // AWS metadata
        ]);
    }

    #[test]
    fn validate_url_blocks_ipv6_private() {
        assert_rejected(&["http://[fd00::1]", "http://[fe80::1]"]);
    }

    #[test]
    fn validate_url_allows_public_ips() {
        assert_accepted(&["http://8.8.8.8", "https://93.184.216.34"]);
    }
}
337