max / makenotwork

12.0 KB · 385 lines History Blame Raw

1	//! Link preview — server-side OpenGraph metadata fetch for post URLs.
2
3	use pulldown_cmark::{Event, Parser, Tag};
4	use reqwest::header::CONTENT_TYPE;
5
/// Upper bound on how many distinct URLs are collected from a single post.
const MAX_URLS: usize = 3;

/// Cap, in bytes, on how much of a response body is read (1 MiB).
const MAX_BODY_SIZE: usize = 1024 * 1024;
11
/// Report whether `ip` is private, reserved, or otherwise not a public unicast
/// address, i.e. a destination a server-side fetcher must refuse (SSRF guard).
///
/// IPv4 addresses rejected:
/// loopback, RFC 1918 private, link-local, broadcast, unspecified, multicast,
/// documentation nets, `0.0.0.0/8`, `100.64.0.0/10` (CGNAT / Tailscale),
/// `198.18.0.0/15` (benchmarking) and `240.0.0.0/4` (reserved).
///
/// IPv6 addresses rejected:
/// loopback, unspecified, multicast, the deprecated IPv4-compatible `::/96`
/// block, unique-local `fc00::/7`, link-local `fe80::/10`, deprecated
/// site-local `fec0::/10`, documentation `2001:db8::/32`, and any
/// IPv4-mapped (`::ffff:a.b.c.d`), NAT64 (`64:ff9b::/96`) or 6to4
/// (`2002::/16`) address whose embedded IPv4 address is itself rejected.
///
/// Returns `true` when the address must NOT be contacted.
fn is_private_ip(ip: std::net::IpAddr) -> bool {
    use std::net::{IpAddr, Ipv4Addr};

    match ip {
        IpAddr::V4(v4) => {
            let [a, b, _, _] = v4.octets();
            v4.is_loopback()
                || v4.is_private()
                || v4.is_link_local()
                || v4.is_broadcast()
                || v4.is_unspecified()
                || v4.is_multicast()
                || v4.is_documentation()
                || a == 0 // 0.0.0.0/8 ("this network")
                || (a == 100 && (b & 0xC0) == 64) // 100.64.0.0/10 (CGNAT / Tailscale)
                || (a == 198 && (b & 0xFE) == 18) // 198.18.0.0/15 (benchmarking)
                || a >= 240 // 240.0.0.0/4 (reserved)
        }
        IpAddr::V6(v6) => {
            // An IPv4-mapped address is exactly as safe as the IPv4 address it wraps.
            if let Some(v4) = v6.to_ipv4_mapped() {
                return is_private_ip(IpAddr::V4(v4));
            }

            let seg = v6.segments();
            // Rebuild the IPv4 address carried in two consecutive 16-bit segments.
            let embedded = |hi: u16, lo: u16| {
                let [o1, o2] = hi.to_be_bytes();
                let [o3, o4] = lo.to_be_bytes();
                IpAddr::V4(Ipv4Addr::new(o1, o2, o3, o4))
            };

            v6.is_multicast()
                // ::/96 covers `::`, `::1` and the deprecated IPv4-compatible form.
                || seg[..6] == [0u16; 6]
                || (seg[0] & 0xfe00) == 0xfc00 // unique-local fc00::/7
                || (seg[0] & 0xffc0) == 0xfe80 // link-local fe80::/10
                || (seg[0] & 0xffc0) == 0xfec0 // deprecated site-local fec0::/10
                || (seg[0] == 0x2001 && seg[1] == 0x0db8) // documentation 2001:db8::/32
                // NAT64 well-known prefix: only as safe as the embedded IPv4 target.
                || (seg[0] == 0x0064
                    && seg[1] == 0xff9b
                    && seg[2..6] == [0u16; 4]
                    && is_private_ip(embedded(seg[6], seg[7])))
                // 6to4: the IPv4 address sits right after the 2002::/16 prefix.
                || (seg[0] == 0x2002 && is_private_ip(embedded(seg[1], seg[2])))
        }
    }
}
32
33	/// Validate that a URL is safe to fetch (no SSRF to internal networks).
34	/// Resolves the hostname to catch alternative IP encodings (octal, hex, decimal, IPv6-mapped).
35	fn validate_url(url: &str) -> bool {
36	let lower = url.to_ascii_lowercase();
37	if !lower.starts_with("http://") && !lower.starts_with("https://") {
38	return false;
39	}
40	let host_part = lower
41	.strip_prefix("http://")
42	.or_else(\|\| lower.strip_prefix("https://"))
43	.unwrap_or("");
44	let host_and_port = host_part.split('/').next().unwrap_or("");
45	let host = if host_and_port.starts_with('[') {
46	host_and_port
47	.split(']')
48	.next()
49	.map(\|s\| format!("{}]", s))
50	.unwrap_or_default()
51	} else {
52	host_and_port.split(':').next().unwrap_or("").to_string()
53	};
54	let host = host.as_str();
55
56	// Quick string-based check for common private patterns
57	if host == "localhost" \|\| host == "0.0.0.0" {
58	return false;
59	}
60
61	// Try parsing as a raw IP address (catches octal, hex, decimal encodings)
62	let bare_host = host.trim_start_matches('[').trim_end_matches(']');
63	if let Ok(ip) = bare_host.parse::<std::net::IpAddr>() {
64	return !is_private_ip(ip);
65	}
66
67	// For hostnames, resolve and check all addresses
68	if let Ok(addrs) = std::net::ToSocketAddrs::to_socket_addrs(&(bare_host, 80)) {
69	for addr in addrs {
70	if is_private_ip(addr.ip()) {
71	return false;
72	}
73	}
74	}
75
76	true
77	}
78
79	/// Extract unique http/https URLs from markdown text via pulldown_cmark link parsing.
80	/// Returns at most `MAX_URLS` URLs.
81	pub fn extract_urls(input: &str) -> Vec<String> {
82	let parser = Parser::new(input);
83	let mut seen = std::collections::HashSet::new();
84	let mut urls = Vec::new();
85
86	for event in parser {
87	if let Event::Start(Tag::Link { dest_url, .. }) = event {
88	let url = dest_url.to_string();
89	if (url.starts_with("http://") \|\| url.starts_with("https://"))
90	&& seen.insert(url.clone())
91	{
92	urls.push(url);
93	if urls.len() >= MAX_URLS {
94	break;
95	}
96	}
97	}
98	}
99
100	urls
101	}
102
103	/// Build a reqwest client for link preview fetching with SSRF-safe redirect policy.
104	pub fn build_preview_client() -> reqwest::Client {
105	reqwest::Client::builder()
106	.redirect(reqwest::redirect::Policy::custom(\|attempt\| {
107	if !validate_url(attempt.url().as_str()) \|\| attempt.previous().len() >= 5 {
108	attempt.stop()
109	} else {
110	attempt.follow()
111	}
112	}))
113	.build()
114	.expect("failed to build preview HTTP client")
115	}
116
117	/// Strategy for fetching link previews. The `Noop` variant lets tests skip
118	/// real HTTP without monkey-patching `tokio::spawn`; production constructs
119	/// `Http(build_preview_client())`.
120	#[derive(Clone)]
121	pub enum LinkPreviewFetcher {
122	Http(reqwest::Client),
123	Noop,
124	}
125
126	impl LinkPreviewFetcher {
127	pub async fn fetch(&self, url: &str) -> Option<(Option<String>, Option<String>)> {
128	match self {
129	Self::Http(client) => fetch_og_metadata(client, url).await,
130	Self::Noop => None,
131	}
132	}
133	}
134
135	/// Fetch OpenGraph metadata from a URL. Returns `(og:title, og:description)`.
136	/// Best-effort: returns None on any error (timeout, too large, parse failure).
137	#[tracing::instrument(skip_all)]
138	pub async fn fetch_og_metadata(
139	http: &reqwest::Client,
140	url: &str,
141	) -> Option<(Option<String>, Option<String>)> {
142	if !validate_url(url) {
143	return None;
144	}
145
146	let resp = http
147	.get(url)
148	.timeout(std::time::Duration::from_secs(5))
149	.header("User-Agent", "Multithreaded/LinkPreview")
150	.send()
151	.await
152	.ok()?;
153
154	if !resp.status().is_success() {
155	return None;
156	}
157
158	// Only fetch HTML content
159	if let Some(ct) = resp.headers().get(CONTENT_TYPE) {
160	let ct_str = ct.to_str().unwrap_or("");
161	if !ct_str.starts_with("text/html") {
162	return None;
163	}
164	}
165
166	// Read body in chunks, capping at MAX_BODY_SIZE
167	let mut body = Vec::new();
168	let mut stream = resp;
169	while body.len() < MAX_BODY_SIZE {
170	let chunk = match stream.chunk().await.ok()? {
171	Some(c) => c,
172	None => break,
173	};
174	let remaining = MAX_BODY_SIZE - body.len();
175	body.extend_from_slice(&chunk[..chunk.len().min(remaining)]);
176	}
177
178	let html = String::from_utf8_lossy(&body);
179
180	let og_title = extract_og_meta(&html, "og:title");
181	let og_desc = extract_og_meta(&html, "og:description");
182
183	// Fall back to <title> tag if no og:title
184	let title = og_title.or_else(\|\| extract_html_title(&html));
185
186	if title.is_some() \|\| og_desc.is_some() {
187	Some((title, og_desc))
188	} else {
189	None
190	}
191	}
192
193	/// Extract a `<meta property="..." content="...">` value from HTML.
194	fn extract_og_meta(html: &str, property: &str) -> Option<String> {
195	static OG_RE: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(\|\| {
196	regex_lite::Regex::new(
197	r#"<meta\s[^>]?property\s=\s"([^"])"[^>]?content\s=\s"([^"])"[^>]*?>"#,
198	)
199	.unwrap()
200	});
201	static OG_RE_REV: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(\|\| {
202	regex_lite::Regex::new(
203	r#"<meta\s[^>]?content\s=\s"([^"])"[^>]?property\s=\s"([^"])"[^>]*?>"#,
204	)
205	.unwrap()
206	});
207
208	// Try property-first order
209	for caps in OG_RE.captures_iter(html) {
210	if &caps[1] == property {
211	let val = caps[2].trim().to_string();
212	if !val.is_empty() {
213	return Some(val);
214	}
215	}
216	}
217	// Try content-first order (some sites put content before property)
218	for caps in OG_RE_REV.captures_iter(html) {
219	if &caps[2] == property {
220	let val = caps[1].trim().to_string();
221	if !val.is_empty() {
222	return Some(val);
223	}
224	}
225	}
226	None
227	}
228
229	/// Extract the `<title>` tag content from HTML.
230	fn extract_html_title(html: &str) -> Option<String> {
231	static TITLE_RE: std::sync::LazyLock<regex_lite::Regex> =
232	std::sync::LazyLock::new(\|\| regex_lite::Regex::new(r"<title[^>]*>([^<]+)</title>").unwrap());
233	TITLE_RE.captures(html).map(\|c\| c[1].trim().to_string())
234	}
235
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `validate_url` rejects every URL in `urls`.
    fn assert_blocked(urls: &[&str]) {
        for url in urls {
            assert!(!validate_url(url), "expected {url} to be blocked");
        }
    }

    /// Assert that `validate_url` accepts every URL in `urls`.
    fn assert_allowed(urls: &[&str]) {
        for url in urls {
            assert!(validate_url(url), "expected {url} to be allowed");
        }
    }

    // -- extract_urls tests --

    #[test]
    fn extract_urls_from_markdown() {
        let found =
            extract_urls("Check [this](https://example.com) and [that](https://other.com/page).");
        assert_eq!(found, vec!["https://example.com", "https://other.com/page"]);
    }

    #[test]
    fn extract_urls_skips_non_http() {
        let found = extract_urls("[mail](mailto:a@b.com) [site](https://x.com)");
        assert_eq!(found, vec!["https://x.com"]);
    }

    #[test]
    fn extract_urls_caps_at_three() {
        let found = extract_urls(
            "[a](https://1.com) [b](https://2.com) [c](https://3.com) [d](https://4.com)",
        );
        assert_eq!(found.len(), 3);
    }

    #[test]
    fn extract_urls_deduplicates() {
        let found = extract_urls("[a](https://same.com) [b](https://same.com)");
        assert_eq!(found, vec!["https://same.com"]);
    }

    #[test]
    fn extract_urls_no_links() {
        assert!(extract_urls("no links here").is_empty());
    }

    // -- metadata extraction tests --

    #[test]
    fn og_meta_property_first() {
        let page = r#"<meta property="og:title" content="My Page">"#;
        assert_eq!(extract_og_meta(page, "og:title").as_deref(), Some("My Page"));
    }

    #[test]
    fn og_meta_content_first() {
        let page = r#"<meta content="Description here" property="og:description">"#;
        assert_eq!(
            extract_og_meta(page, "og:description").as_deref(),
            Some("Description here")
        );
    }

    #[test]
    fn og_meta_missing() {
        let page = r#"<meta property="og:image" content="img.png">"#;
        assert!(extract_og_meta(page, "og:title").is_none());
    }

    #[test]
    fn html_title_fallback() {
        let page = "<html><head><title>Page Title</title></head></html>";
        assert_eq!(extract_html_title(page).as_deref(), Some("Page Title"));
    }

    #[test]
    fn html_title_missing() {
        let page = "<html><head></head></html>";
        assert!(extract_html_title(page).is_none());
    }

    // -- validate_url tests --

    #[test]
    fn validate_url_allows_https() {
        assert_allowed(&["https://example.com", "https://example.com/path?q=1"]);
    }

    #[test]
    fn validate_url_allows_http() {
        assert_allowed(&["http://example.com"]);
    }

    #[test]
    fn validate_url_blocks_non_http_schemes() {
        assert_blocked(&[
            "ftp://example.com",
            "file:///etc/passwd",
            "javascript:alert(1)",
            "data:text/html,<h1>hi</h1>",
        ]);
    }

    #[test]
    fn validate_url_blocks_localhost() {
        assert_blocked(&[
            "http://localhost",
            "http://localhost:8080",
            "http://127.0.0.1",
            "http://127.0.0.1:3000",
            "http://0.0.0.0",
            "http://[::1]",
            "http://[::1]:8080",
        ]);
    }

    #[test]
    fn validate_url_blocks_private_10() {
        assert_blocked(&["http://10.0.0.1", "http://10.255.255.255"]);
    }

    #[test]
    fn validate_url_blocks_private_192_168() {
        assert_blocked(&["http://192.168.0.1", "http://192.168.1.100:8080"]);
    }

    #[test]
    fn validate_url_blocks_private_172_16() {
        assert_blocked(&["http://172.16.0.1", "http://172.31.255.255"]);
        // 172.15 and 172.32 sit just outside the private /12 and are public.
        assert_allowed(&["http://172.15.0.1", "http://172.32.0.1"]);
    }

    #[test]
    fn validate_url_blocks_link_local() {
        assert_blocked(&[
            "http://169.254.0.1",
            "http://169.254.169.254", // AWS metadata
        ]);
    }

    #[test]
    fn validate_url_blocks_ipv6_private() {
        assert_blocked(&["http://[fd00::1]", "http://[fe80::1]"]);
    }

    #[test]
    fn validate_url_allows_public_ips() {
        assert_allowed(&["http://8.8.8.8", "https://93.184.216.34"]);
    }

    // -- LinkPreviewFetcher tests --

    #[tokio::test]
    async fn noop_fetcher_returns_none_without_network() {
        // Any URL — would be a public host in production but here we expect no I/O.
        let preview = LinkPreviewFetcher::Noop.fetch("https://example.com").await;
        assert!(preview.is_none());
    }
}
385