//! Link preview — server-side OpenGraph metadata fetch for post URLs. use pulldown_cmark::{Event, Parser, Tag}; use reqwest::header::CONTENT_TYPE; /// Maximum number of URLs to extract per post. const MAX_URLS: usize = 3; /// Maximum response body size to read (1 MB). const MAX_BODY_SIZE: usize = 1_048_576; /// Check if an IP address is private/reserved (not safe for SSRF). fn is_private_ip(ip: std::net::IpAddr) -> bool { match ip { std::net::IpAddr::V4(v4) => { v4.is_loopback() || v4.is_private() || v4.is_link_local() || v4.is_broadcast() || v4.is_unspecified() || v4.octets()[0] == 100 && (v4.octets()[1] & 0xC0) == 64 // 100.64.0.0/10 (CGNAT / Tailscale) } std::net::IpAddr::V6(v6) => { v6.is_loopback() || v6.is_unspecified() || (v6.segments()[0] & 0xfe00) == 0xfc00 // ULA fd00::/7 || (v6.segments()[0] & 0xffc0) == 0xfe80 // link-local || matches!(v6.to_ipv4_mapped(), Some(v4) if is_private_ip(std::net::IpAddr::V4(v4))) } } } /// Validate that a URL is safe to fetch (no SSRF to internal networks). /// Resolves the hostname to catch alternative IP encodings (octal, hex, decimal, IPv6-mapped). fn validate_url(url: &str) -> bool { let lower = url.to_ascii_lowercase(); if !lower.starts_with("http://") && !lower.starts_with("https://") { return false; } let host_part = lower .strip_prefix("http://") .or_else(|| lower.strip_prefix("https://")) .unwrap_or(""); let host_and_port = host_part.split('/').next().unwrap_or(""); let host = if host_and_port.starts_with('[') { host_and_port .split(']') .next() .map(|s| format!("{}]", s)) .unwrap_or_default() } else { host_and_port.split(':').next().unwrap_or("").to_string() }; let host = host.as_str(); // Quick string-based check for common private patterns if host == "localhost" || host == "0.0.0.0" { return false; } // Try parsing as a raw IP address (catches octal, hex, decimal encodings) let bare_host = host.trim_start_matches('[').trim_end_matches(']'); if let Ok(ip) = bare_host.parse::() { return !is_private_ip(ip); } // For hostnames, resolve and check all addresses if let Ok(addrs) = std::net::ToSocketAddrs::to_socket_addrs(&(bare_host, 80)) { for addr in addrs { if is_private_ip(addr.ip()) { return false; } } } true } /// Extract unique http/https URLs from markdown text via pulldown_cmark link parsing. /// Returns at most `MAX_URLS` URLs. pub fn extract_urls(input: &str) -> Vec { let parser = Parser::new(input); let mut seen = std::collections::HashSet::new(); let mut urls = Vec::new(); for event in parser { if let Event::Start(Tag::Link { dest_url, .. }) = event { let url = dest_url.to_string(); if (url.starts_with("http://") || url.starts_with("https://")) && seen.insert(url.clone()) { urls.push(url); if urls.len() >= MAX_URLS { break; } } } } urls } /// Build a reqwest client for link preview fetching with SSRF-safe redirect policy. pub fn build_preview_client() -> reqwest::Client { reqwest::Client::builder() .redirect(reqwest::redirect::Policy::custom(|attempt| { if !validate_url(attempt.url().as_str()) || attempt.previous().len() >= 5 { attempt.stop() } else { attempt.follow() } })) .build() .expect("failed to build preview HTTP client") } /// Strategy for fetching link previews. The `Noop` variant lets tests skip /// real HTTP without monkey-patching `tokio::spawn`; production constructs /// `Http(build_preview_client())`. #[derive(Clone)] pub enum LinkPreviewFetcher { Http(reqwest::Client), Noop, } impl LinkPreviewFetcher { pub async fn fetch(&self, url: &str) -> Option<(Option, Option)> { match self { Self::Http(client) => fetch_og_metadata(client, url).await, Self::Noop => None, } } } /// Fetch OpenGraph metadata from a URL. Returns `(og:title, og:description)`. /// Best-effort: returns None on any error (timeout, too large, parse failure). #[tracing::instrument(skip_all)] pub async fn fetch_og_metadata( http: &reqwest::Client, url: &str, ) -> Option<(Option, Option)> { if !validate_url(url) { return None; } let resp = http .get(url) .timeout(std::time::Duration::from_secs(5)) .header("User-Agent", "Multithreaded/LinkPreview") .send() .await .ok()?; if !resp.status().is_success() { return None; } // Only fetch HTML content if let Some(ct) = resp.headers().get(CONTENT_TYPE) { let ct_str = ct.to_str().unwrap_or(""); if !ct_str.starts_with("text/html") { return None; } } // Read body in chunks, capping at MAX_BODY_SIZE let mut body = Vec::new(); let mut stream = resp; while body.len() < MAX_BODY_SIZE { let chunk = match stream.chunk().await.ok()? { Some(c) => c, None => break, }; let remaining = MAX_BODY_SIZE - body.len(); body.extend_from_slice(&chunk[..chunk.len().min(remaining)]); } let html = String::from_utf8_lossy(&body); let og_title = extract_og_meta(&html, "og:title"); let og_desc = extract_og_meta(&html, "og:description"); // Fall back to tag if no og:title let title = og_title.or_else(|| extract_html_title(&html)); if title.is_some() || og_desc.is_some() { Some((title, og_desc)) } else { None } } /// Extract a `<meta property="..." content="...">` value from HTML. fn extract_og_meta(html: &str, property: &str) -> Option<String> { static OG_RE: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| { regex_lite::Regex::new( r#"<meta\s[^>]*?property\s*=\s*"([^"]*)"[^>]*?content\s*=\s*"([^"]*)"[^>]*?>"#, ) .unwrap() }); static OG_RE_REV: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| { regex_lite::Regex::new( r#"<meta\s[^>]*?content\s*=\s*"([^"]*)"[^>]*?property\s*=\s*"([^"]*)"[^>]*?>"#, ) .unwrap() }); // Try property-first order for caps in OG_RE.captures_iter(html) { if &caps[1] == property { let val = caps[2].trim().to_string(); if !val.is_empty() { return Some(val); } } } // Try content-first order (some sites put content before property) for caps in OG_RE_REV.captures_iter(html) { if &caps[2] == property { let val = caps[1].trim().to_string(); if !val.is_empty() { return Some(val); } } } None } /// Extract the `<title>` tag content from HTML. fn extract_html_title(html: &str) -> Option<String> { static TITLE_RE: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| regex_lite::Regex::new(r"<title[^>]*>([^<]+)").unwrap()); TITLE_RE.captures(html).map(|c| c[1].trim().to_string()) } #[cfg(test)] mod tests { use super::*; #[test] fn extract_urls_from_markdown() { let input = "Check [this](https://example.com) and [that](https://other.com/page)."; let urls = extract_urls(input); assert_eq!(urls, vec!["https://example.com", "https://other.com/page"]); } #[test] fn extract_urls_skips_non_http() { let input = "[mail](mailto:a@b.com) [site](https://x.com)"; let urls = extract_urls(input); assert_eq!(urls, vec!["https://x.com"]); } #[test] fn extract_urls_caps_at_three() { let input = "[a](https://1.com) [b](https://2.com) [c](https://3.com) [d](https://4.com)"; let urls = extract_urls(input); assert_eq!(urls.len(), 3); } #[test] fn extract_urls_deduplicates() { let input = "[a](https://same.com) [b](https://same.com)"; let urls = extract_urls(input); assert_eq!(urls, vec!["https://same.com"]); } #[test] fn extract_urls_no_links() { let urls = extract_urls("no links here"); assert!(urls.is_empty()); } #[test] fn og_meta_property_first() { let html = r#""#; assert_eq!(extract_og_meta(html, "og:title"), Some("My Page".to_string())); } #[test] fn og_meta_content_first() { let html = r#""#; assert_eq!( extract_og_meta(html, "og:description"), Some("Description here".to_string()) ); } #[test] fn og_meta_missing() { let html = r#""#; assert_eq!(extract_og_meta(html, "og:title"), None); } #[test] fn html_title_fallback() { let html = "Page Title"; assert_eq!(extract_html_title(html), Some("Page Title".to_string())); } #[test] fn html_title_missing() { let html = ""; assert_eq!(extract_html_title(html), None); } // -- validate_url tests -- #[test] fn validate_url_allows_https() { assert!(validate_url("https://example.com")); assert!(validate_url("https://example.com/path?q=1")); } #[test] fn validate_url_allows_http() { assert!(validate_url("http://example.com")); } #[test] fn validate_url_blocks_non_http_schemes() { assert!(!validate_url("ftp://example.com")); assert!(!validate_url("file:///etc/passwd")); assert!(!validate_url("javascript:alert(1)")); assert!(!validate_url("data:text/html,

hi

")); } #[test] fn validate_url_blocks_localhost() { assert!(!validate_url("http://localhost")); assert!(!validate_url("http://localhost:8080")); assert!(!validate_url("http://127.0.0.1")); assert!(!validate_url("http://127.0.0.1:3000")); assert!(!validate_url("http://0.0.0.0")); assert!(!validate_url("http://[::1]")); assert!(!validate_url("http://[::1]:8080")); } #[test] fn validate_url_blocks_private_10() { assert!(!validate_url("http://10.0.0.1")); assert!(!validate_url("http://10.255.255.255")); } #[test] fn validate_url_blocks_private_192_168() { assert!(!validate_url("http://192.168.0.1")); assert!(!validate_url("http://192.168.1.100:8080")); } #[test] fn validate_url_blocks_private_172_16() { assert!(!validate_url("http://172.16.0.1")); assert!(!validate_url("http://172.31.255.255")); // 172.15 and 172.32 are public assert!(validate_url("http://172.15.0.1")); assert!(validate_url("http://172.32.0.1")); } #[test] fn validate_url_blocks_link_local() { assert!(!validate_url("http://169.254.0.1")); assert!(!validate_url("http://169.254.169.254")); // AWS metadata } #[test] fn validate_url_blocks_ipv6_private() { assert!(!validate_url("http://[fd00::1]")); assert!(!validate_url("http://[fe80::1]")); } #[test] fn validate_url_allows_public_ips() { assert!(validate_url("http://8.8.8.8")); assert!(validate_url("https://93.184.216.34")); } #[tokio::test] async fn noop_fetcher_returns_none_without_network() { let fetcher = LinkPreviewFetcher::Noop; // Any URL — would be a public host in production but here we expect no I/O. assert!(fetcher.fetch("https://example.com").await.is_none()); } }