//! Link preview — server-side OpenGraph metadata fetch for post URLs. use pulldown_cmark::{Event, Parser, Tag}; use reqwest::header::CONTENT_TYPE; /// Maximum number of URLs to extract per post. const MAX_URLS: usize = 3; /// Maximum response body size to read (1 MB). const MAX_BODY_SIZE: usize = 1_048_576; /// Validate that a URL is safe to fetch (no SSRF to internal networks). fn validate_url(url: &str) -> bool { let lower = url.to_ascii_lowercase(); if !lower.starts_with("http://") && !lower.starts_with("https://") { return false; } let host_part = lower .strip_prefix("http://") .or_else(|| lower.strip_prefix("https://")) .unwrap_or(""); let host_and_port = host_part.split('/').next().unwrap_or(""); let host = if host_and_port.starts_with('[') { host_and_port .split(']') .next() .map(|s| format!("{}]", s)) .unwrap_or_default() } else { host_and_port.split(':').next().unwrap_or("").to_string() }; let host = host.as_str(); if host == "localhost" || host == "127.0.0.1" || host == "[::1]" || host == "0.0.0.0" || host.starts_with("10.") || host.starts_with("192.168.") || host.starts_with("169.254.") || host.starts_with("[fd") || host.starts_with("[fe80:") { return false; } // Block 172.16.0.0/12 if let Some(rest) = host.strip_prefix("172.") && let Some(second) = rest.split('.').next() && let Ok(n) = second.parse::() && (16..=31).contains(&n) { return false; } true } /// Extract unique http/https URLs from markdown text via pulldown_cmark link parsing. /// Returns at most `MAX_URLS` URLs. pub fn extract_urls(input: &str) -> Vec { let parser = Parser::new(input); let mut seen = std::collections::HashSet::new(); let mut urls = Vec::new(); for event in parser { if let Event::Start(Tag::Link { dest_url, .. }) = event { let url = dest_url.to_string(); if (url.starts_with("http://") || url.starts_with("https://")) && seen.insert(url.clone()) { urls.push(url); if urls.len() >= MAX_URLS { break; } } } } urls } /// Build a reqwest client for link preview fetching with SSRF-safe redirect policy. pub fn build_preview_client() -> reqwest::Client { reqwest::Client::builder() .redirect(reqwest::redirect::Policy::custom(|attempt| { if !validate_url(attempt.url().as_str()) || attempt.previous().len() >= 5 { attempt.stop() } else { attempt.follow() } })) .build() .expect("failed to build preview HTTP client") } /// Fetch OpenGraph metadata from a URL. Returns `(og:title, og:description)`. /// Best-effort: returns None on any error (timeout, too large, parse failure). #[tracing::instrument(skip_all)] pub async fn fetch_og_metadata( http: &reqwest::Client, url: &str, ) -> Option<(Option, Option)> { if !validate_url(url) { return None; } let resp = http .get(url) .timeout(std::time::Duration::from_secs(5)) .header("User-Agent", "Multithreaded/LinkPreview") .send() .await .ok()?; if !resp.status().is_success() { return None; } // Only fetch HTML content if let Some(ct) = resp.headers().get(CONTENT_TYPE) { let ct_str = ct.to_str().unwrap_or(""); if !ct_str.starts_with("text/html") { return None; } } // Read body in chunks, capping at MAX_BODY_SIZE let mut body = Vec::new(); let mut stream = resp; while body.len() < MAX_BODY_SIZE { let chunk = match stream.chunk().await.ok()? { Some(c) => c, None => break, }; let remaining = MAX_BODY_SIZE - body.len(); body.extend_from_slice(&chunk[..chunk.len().min(remaining)]); } let html = String::from_utf8_lossy(&body); let og_title = extract_og_meta(&html, "og:title"); let og_desc = extract_og_meta(&html, "og:description"); // Fall back to tag if no og:title let title = og_title.or_else(|| extract_html_title(&html)); if title.is_some() || og_desc.is_some() { Some((title, og_desc)) } else { None } } /// Extract a `<meta property="..." content="...">` value from HTML. fn extract_og_meta(html: &str, property: &str) -> Option<String> { static OG_RE: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| { regex_lite::Regex::new( r#"<meta\s[^>]*?property\s*=\s*"([^"]*)"[^>]*?content\s*=\s*"([^"]*)"[^>]*?>"#, ) .unwrap() }); static OG_RE_REV: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| { regex_lite::Regex::new( r#"<meta\s[^>]*?content\s*=\s*"([^"]*)"[^>]*?property\s*=\s*"([^"]*)"[^>]*?>"#, ) .unwrap() }); // Try property-first order for caps in OG_RE.captures_iter(html) { if &caps[1] == property { let val = caps[2].trim().to_string(); if !val.is_empty() { return Some(val); } } } // Try content-first order (some sites put content before property) for caps in OG_RE_REV.captures_iter(html) { if &caps[2] == property { let val = caps[1].trim().to_string(); if !val.is_empty() { return Some(val); } } } None } /// Extract the `<title>` tag content from HTML. fn extract_html_title(html: &str) -> Option<String> { static TITLE_RE: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| regex_lite::Regex::new(r"<title[^>]*>([^<]+)").unwrap()); TITLE_RE.captures(html).map(|c| c[1].trim().to_string()) } #[cfg(test)] mod tests { use super::*; #[test] fn extract_urls_from_markdown() { let input = "Check [this](https://example.com) and [that](https://other.com/page)."; let urls = extract_urls(input); assert_eq!(urls, vec!["https://example.com", "https://other.com/page"]); } #[test] fn extract_urls_skips_non_http() { let input = "[mail](mailto:a@b.com) [site](https://x.com)"; let urls = extract_urls(input); assert_eq!(urls, vec!["https://x.com"]); } #[test] fn extract_urls_caps_at_three() { let input = "[a](https://1.com) [b](https://2.com) [c](https://3.com) [d](https://4.com)"; let urls = extract_urls(input); assert_eq!(urls.len(), 3); } #[test] fn extract_urls_deduplicates() { let input = "[a](https://same.com) [b](https://same.com)"; let urls = extract_urls(input); assert_eq!(urls, vec!["https://same.com"]); } #[test] fn extract_urls_no_links() { let urls = extract_urls("no links here"); assert!(urls.is_empty()); } #[test] fn og_meta_property_first() { let html = r#""#; assert_eq!(extract_og_meta(html, "og:title"), Some("My Page".to_string())); } #[test] fn og_meta_content_first() { let html = r#""#; assert_eq!( extract_og_meta(html, "og:description"), Some("Description here".to_string()) ); } #[test] fn og_meta_missing() { let html = r#""#; assert_eq!(extract_og_meta(html, "og:title"), None); } #[test] fn html_title_fallback() { let html = "Page Title"; assert_eq!(extract_html_title(html), Some("Page Title".to_string())); } #[test] fn html_title_missing() { let html = ""; assert_eq!(extract_html_title(html), None); } // -- validate_url tests -- #[test] fn validate_url_allows_https() { assert!(validate_url("https://example.com")); assert!(validate_url("https://example.com/path?q=1")); } #[test] fn validate_url_allows_http() { assert!(validate_url("http://example.com")); } #[test] fn validate_url_blocks_non_http_schemes() { assert!(!validate_url("ftp://example.com")); assert!(!validate_url("file:///etc/passwd")); assert!(!validate_url("javascript:alert(1)")); assert!(!validate_url("data:text/html,

hi

")); } #[test] fn validate_url_blocks_localhost() { assert!(!validate_url("http://localhost")); assert!(!validate_url("http://localhost:8080")); assert!(!validate_url("http://127.0.0.1")); assert!(!validate_url("http://127.0.0.1:3000")); assert!(!validate_url("http://0.0.0.0")); assert!(!validate_url("http://[::1]")); assert!(!validate_url("http://[::1]:8080")); } #[test] fn validate_url_blocks_private_10() { assert!(!validate_url("http://10.0.0.1")); assert!(!validate_url("http://10.255.255.255")); } #[test] fn validate_url_blocks_private_192_168() { assert!(!validate_url("http://192.168.0.1")); assert!(!validate_url("http://192.168.1.100:8080")); } #[test] fn validate_url_blocks_private_172_16() { assert!(!validate_url("http://172.16.0.1")); assert!(!validate_url("http://172.31.255.255")); // 172.15 and 172.32 are public assert!(validate_url("http://172.15.0.1")); assert!(validate_url("http://172.32.0.1")); } #[test] fn validate_url_blocks_link_local() { assert!(!validate_url("http://169.254.0.1")); assert!(!validate_url("http://169.254.169.254")); // AWS metadata } #[test] fn validate_url_blocks_ipv6_private() { assert!(!validate_url("http://[fd00::1]")); assert!(!validate_url("http://[fe80::1]")); } #[test] fn validate_url_allows_public_ips() { assert!(validate_url("http://8.8.8.8")); assert!(validate_url("https://93.184.216.34")); } }