//! Link preview — server-side OpenGraph metadata fetch for post URLs.

use pulldown_cmark::{Event, Parser, Tag};
use reqwest::header::CONTENT_TYPE;

/// Upper bound on how many distinct links are collected from a single post.
const MAX_URLS: usize = 3;

/// Cap on how many response-body bytes are buffered per fetch (1 MiB).
const MAX_BODY_SIZE: usize = 1024 * 1024;

/// Report whether `ip` belongs to a range that must never be fetched server-side.
///
/// IPv4: loopback, RFC 1918 private, link-local, broadcast, unspecified, and the
/// 100.64.0.0/10 shared range (CGNAT / Tailscale).
/// IPv6: loopback, unspecified, unique-local (fc00::/7), link-local (fe80::/10), and
/// IPv4-mapped addresses whose embedded IPv4 address is itself private.
fn is_private_ip(ip: std::net::IpAddr) -> bool {
    use std::net::IpAddr;

    match ip {
        IpAddr::V4(addr) => {
            let [first, second, ..] = addr.octets();
            // 100.64.0.0/10: first octet 100, top two bits of the second octet are `01`.
            let in_shared_range = first == 100 && (second & 0xC0) == 64;

            addr.is_loopback()
                || addr.is_private()
                || addr.is_link_local()
                || addr.is_broadcast()
                || addr.is_unspecified()
                || in_shared_range
        }
        IpAddr::V6(addr) => {
            if addr.is_loopback() || addr.is_unspecified() {
                return true;
            }

            let leading = addr.segments()[0];
            let unique_local = (leading & 0xfe00) == 0xfc00; // fc00::/7
            let link_local = (leading & 0xffc0) == 0xfe80; // fe80::/10
            if unique_local || link_local {
                return true;
            }

            // ::ffff:a.b.c.d — judge the embedded IPv4 address by the IPv4 rules.
            addr.to_ipv4_mapped()
                .is_some_and(|v4| is_private_ip(IpAddr::V4(v4)))
        }
    }
}

/// Validate that a URL is safe to fetch (no SSRF to internal networks).
///
/// Returns `true` only for `http://` / `https://` URLs whose host is either a public
/// IP literal or a hostname that does not resolve to any private/reserved address
/// (as judged by `is_private_ip`).
///
/// The authority is parsed strictly and anything unusual is rejected (fail closed),
/// so that this check cannot disagree with a more lenient URL parser in the HTTP
/// client about which host is actually contacted:
///
/// * the authority ends at the first `/`, `?`, `#` or `\`;
/// * an empty authority or one carrying userinfo (`user@host`) is rejected;
/// * bracketed hosts must be well-formed IPv6 literals;
/// * other hosts may only contain ASCII letters, digits, `.`, `-` and `_`, which
///   rejects percent-encoded and non-ASCII hosts (IDNs must be given in `xn--` form);
/// * hosts whose last label is numeric (`127.1`, `2130706433`, `0x7f.0.0.1`) must be
///   a canonical dotted quad, otherwise they are rejected;
/// * ports must be all digits.
///
/// A hostname that fails to resolve is still accepted: after the syntax checks above
/// it can only be an ordinary DNS name, and the fetch itself will fail if it does not
/// exist.
///
/// NOTE(review): the lookup is a blocking `to_socket_addrs` call, and the HTTP client
/// presumably resolves the name again when connecting, so this check alone does not
/// defend against DNS rebinding — confirm whether a pinned resolver is needed.
fn validate_url(url: &str) -> bool {
    let lower = url.to_ascii_lowercase();
    let Some(rest) = lower
        .strip_prefix("http://")
        .or_else(|| lower.strip_prefix("https://"))
    else {
        return false;
    };

    // Backslash is a delimiter too: WHATWG-style parsers treat it like `/` here.
    let authority = rest
        .split(|c: char| matches!(c, '/' | '?' | '#' | '\\'))
        .next()
        .unwrap_or("");

    // Previews never need credentials, and `user@host` is a classic way to hide the
    // real host from naive parsers.
    if authority.is_empty() || authority.contains('@') {
        return false;
    }

    let port_is_valid = |port: &str| port.bytes().all(|b| b.is_ascii_digit());

    // Bracketed form: must be exactly `[<ipv6>]` optionally followed by `:<port>`.
    if let Some(bracketed) = authority.strip_prefix('[') {
        let Some((literal, tail)) = bracketed.split_once(']') else {
            return false;
        };
        let port_ok = tail.is_empty() || tail.strip_prefix(':').is_some_and(port_is_valid);
        return match literal.parse::<std::net::Ipv6Addr>() {
            Ok(v6) if port_ok => !is_private_ip(std::net::IpAddr::V6(v6)),
            _ => false,
        };
    }

    let (host, port) = authority.split_once(':').unwrap_or((authority, ""));
    if !port_is_valid(port) {
        return false;
    }

    // `localhost.` and `127.0.0.1.` name the same hosts as their dot-less forms.
    let host = host.trim_end_matches('.');
    let only_hostname_chars = host
        .bytes()
        .all(|b| b.is_ascii_alphanumeric() || matches!(b, b'.' | b'-' | b'_'));
    if host.is_empty() || !only_hostname_chars {
        return false;
    }

    if host == "localhost" || host.ends_with(".localhost") {
        return false;
    }

    if let Ok(v4) = host.parse::<std::net::Ipv4Addr>() {
        return !is_private_ip(std::net::IpAddr::V4(v4));
    }

    // A numeric final label means "IPv4 literal" to URL parsers and resolvers that
    // accept shorthand/octal/hex/decimal forms. It was not a canonical dotted quad
    // (checked above), so refuse it rather than rely on the resolver to decode it.
    let last_label = host.rsplit('.').next().unwrap_or(host);
    if last_label.starts_with("0x") || last_label.bytes().all(|b| b.is_ascii_digit()) {
        return false;
    }

    // Ordinary hostname: every resolved address must be public.
    match std::net::ToSocketAddrs::to_socket_addrs(&(host, 80)) {
        Ok(mut addrs) => !addrs.any(|addr| is_private_ip(addr.ip())),
        Err(_) => true,
    }
}

/// Collect the distinct `http://` / `https://` link destinations found in `input`.
///
/// `input` is parsed as markdown with `pulldown_cmark`; only link start tags are
/// considered. Destinations are returned in document order, duplicates are dropped,
/// and collection stops once `MAX_URLS` entries have been gathered.
pub fn extract_urls(input: &str) -> Vec<String> {
    let mut already_seen = std::collections::HashSet::new();

    Parser::new(input)
        .filter_map(|event| match event {
            Event::Start(Tag::Link { dest_url, .. }) => Some(dest_url.to_string()),
            _ => None,
        })
        .filter(|target| target.starts_with("http://") || target.starts_with("https://"))
        // `insert` returns false for a repeat, which filters the duplicate out.
        .filter(|target| already_seen.insert(target.clone()))
        .take(MAX_URLS)
        .collect()
}

/// Construct the `reqwest::Client` used for link-preview fetches.
///
/// The client carries a custom redirect policy: a redirect is followed only when its
/// target passes `validate_url` and `attempt.previous()` holds fewer than five
/// entries; otherwise the redirect chain is stopped.
///
/// # Panics
///
/// Panics if the underlying client builder fails.
pub fn build_preview_client() -> reqwest::Client {
    let ssrf_safe_redirects = reqwest::redirect::Policy::custom(|attempt| {
        let target_is_safe = validate_url(attempt.url().as_str());
        if target_is_safe && attempt.previous().len() < 5 {
            attempt.follow()
        } else {
            attempt.stop()
        }
    });

    let client = reqwest::Client::builder()
        .redirect(ssrf_safe_redirects)
        .build();
    client.expect("failed to build preview HTTP client")
}

/// Strategy for fetching link previews. The `Noop` variant lets tests skip
/// real HTTP without monkey-patching `tokio::spawn`; production constructs
/// `Http(build_preview_client())`.
#[derive(Clone)]
pub enum LinkPreviewFetcher {
    /// Fetch previews over HTTP using the wrapped `reqwest::Client`.
    Http(reqwest::Client),
    /// Perform no I/O; `fetch` always returns `None` for this variant.
    Noop,
}

impl LinkPreviewFetcher {
    /// Fetch preview metadata for `url` using this strategy.
    ///
    /// For `Http`, delegates to `fetch_og_metadata` with the wrapped client and
    /// returns its result unchanged. For `Noop`, returns `None` without doing any work.
    pub async fn fetch(&self, url: &str) -> Option<(Option<String>, Option<String>)> {
        let Self::Http(client) = self else {
            return None;
        };
        fetch_og_metadata(client, url).await
    }
}

/// Fetch OpenGraph metadata from a URL. Returns `(title, og:description)`.
///
/// The title is the page's `og:title`, falling back to its `<title>` element when no
/// `og:title` is present. Best-effort: returns `None` when
///
/// * `url` fails `validate_url`,
/// * the request fails or exceeds the 5-second timeout,
/// * the response status is not a success,
/// * a `Content-Type` header is present and is not `text/html`,
/// * reading the body fails, or
/// * neither a title nor a description could be extracted.
///
/// At most `MAX_BODY_SIZE` bytes of the body are buffered; anything beyond that is
/// ignored. The body is decoded as UTF-8 with invalid sequences replaced.
#[tracing::instrument(skip_all)]
pub async fn fetch_og_metadata(
    http: &reqwest::Client,
    url: &str,
) -> Option<(Option<String>, Option<String>)> {
    const HTML_MEDIA_TYPE: &str = "text/html";

    if !validate_url(url) {
        return None;
    }

    let mut resp = http
        .get(url)
        .timeout(std::time::Duration::from_secs(5))
        .header("User-Agent", "Multithreaded/LinkPreview")
        .send()
        .await
        .ok()?;

    if !resp.status().is_success() {
        return None;
    }

    // Only fetch HTML content. Media types are case-insensitive, so compare the
    // prefix without regard to ASCII case (`Text/HTML; charset=UTF-8` is valid).
    // A response without a Content-Type header is still attempted.
    if let Some(ct) = resp.headers().get(CONTENT_TYPE) {
        let ct_str = ct.to_str().unwrap_or("").trim_start();
        let is_html = ct_str
            .get(..HTML_MEDIA_TYPE.len())
            .is_some_and(|prefix| prefix.eq_ignore_ascii_case(HTML_MEDIA_TYPE));
        if !is_html {
            return None;
        }
    }

    // Read body in chunks, capping at MAX_BODY_SIZE. The final chunk is truncated so
    // the buffer never grows past the cap.
    let mut body = Vec::new();
    while body.len() < MAX_BODY_SIZE {
        let Some(chunk) = resp.chunk().await.ok()? else {
            break;
        };
        let remaining = MAX_BODY_SIZE - body.len();
        body.extend_from_slice(&chunk[..chunk.len().min(remaining)]);
    }

    let html = String::from_utf8_lossy(&body);

    let og_desc = extract_og_meta(&html, "og:description");
    // Fall back to the <title> tag if there is no og:title.
    let title = extract_og_meta(&html, "og:title").or_else(|| extract_html_title(&html));

    if title.is_none() && og_desc.is_none() {
        return None;
    }
    Some((title, og_desc))
}

/// Extract a `<meta property="..." content="...">` value from HTML.
fn extract_og_meta(html: &str, property: &str) -> Option<String> {
    static OG_RE: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| {
        regex_lite::Regex::new(
            r#"<meta\s[^>]*?property\s*=\s*"([^"]*)"[^>]*?content\s*=\s*"([^"]*)"[^>]*?>"#,
        )
        .unwrap()
    });
    static OG_RE_REV: std::sync::LazyLock<regex_lite::Regex> = std::sync::LazyLock::new(|| {
        regex_lite::Regex::new(
            r#"<meta\s[^>]*?content\s*=\s*"([^"]*)"[^>]*?property\s*=\s*"([^"]*)"[^>]*?>"#,
        )
        .unwrap()
    });

    // Try property-first order
    for caps in OG_RE.captures_iter(html) {
        if &caps[1] == property {
            let val = caps[2].trim().to_string();
            if !val.is_empty() {
                return Some(val);
            }
        }
    }
    // Try content-first order (some sites put content before property)
    for caps in OG_RE_REV.captures_iter(html) {
        if &caps[2] == property {
            let val = caps[1].trim().to_string();
            if !val.is_empty() {
                return Some(val);
            }
        }
    }
    None
}

/// Extract the `<title>` tag content from HTML.
///
/// Returns the trimmed text of the first `<title>…</title>` match, or `None` when
/// there is no match or the title is whitespace-only. Dropping empty titles keeps
/// this consistent with `extract_og_meta`, so callers never see `Some("")`.
fn extract_html_title(html: &str) -> Option<String> {
    static TITLE_RE: std::sync::LazyLock<regex_lite::Regex> =
        std::sync::LazyLock::new(|| regex_lite::Regex::new(r"<title[^>]*>([^<]+)</title>").unwrap());
    TITLE_RE
        .captures(html)
        .map(|caps| caps[1].trim().to_string())
        // `[^<]+` happily matches pure whitespace; that is not a usable title.
        .filter(|title| !title.is_empty())
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `validate_url` rejects every URL in `urls`.
    fn assert_blocked(urls: &[&str]) {
        for url in urls {
            assert!(!validate_url(url), "expected {url} to be blocked");
        }
    }

    /// Assert that `validate_url` accepts every URL in `urls`.
    fn assert_allowed(urls: &[&str]) {
        for url in urls {
            assert!(validate_url(url), "expected {url} to be allowed");
        }
    }

    // -- extract_urls tests --

    #[test]
    fn extract_urls_from_markdown() {
        let found =
            extract_urls("Check [this](https://example.com) and [that](https://other.com/page).");
        assert_eq!(found, vec!["https://example.com", "https://other.com/page"]);
    }

    #[test]
    fn extract_urls_skips_non_http() {
        let found = extract_urls("[mail](mailto:a@b.com) [site](https://x.com)");
        assert_eq!(found, vec!["https://x.com"]);
    }

    #[test]
    fn extract_urls_caps_at_three() {
        let found = extract_urls(
            "[a](https://1.com) [b](https://2.com) [c](https://3.com) [d](https://4.com)",
        );
        assert_eq!(found.len(), 3);
    }

    #[test]
    fn extract_urls_deduplicates() {
        let found = extract_urls("[a](https://same.com) [b](https://same.com)");
        assert_eq!(found, vec!["https://same.com"]);
    }

    #[test]
    fn extract_urls_no_links() {
        assert!(extract_urls("no links here").is_empty());
    }

    // -- metadata extraction tests --

    #[test]
    fn og_meta_property_first() {
        let page = r#"<meta property="og:title" content="My Page">"#;
        assert_eq!(extract_og_meta(page, "og:title").as_deref(), Some("My Page"));
    }

    #[test]
    fn og_meta_content_first() {
        let page = r#"<meta content="Description here" property="og:description">"#;
        assert_eq!(
            extract_og_meta(page, "og:description").as_deref(),
            Some("Description here")
        );
    }

    #[test]
    fn og_meta_missing() {
        let page = r#"<meta property="og:image" content="img.png">"#;
        assert!(extract_og_meta(page, "og:title").is_none());
    }

    #[test]
    fn html_title_fallback() {
        let page = "<html><head><title>Page Title</title></head></html>";
        assert_eq!(extract_html_title(page).as_deref(), Some("Page Title"));
    }

    #[test]
    fn html_title_missing() {
        assert!(extract_html_title("<html><head></head></html>").is_none());
    }

    // -- validate_url tests --
    // Hostname cases go through the resolver path of `validate_url`; IP-literal
    // cases are decided without any lookup.

    #[test]
    fn validate_url_allows_https() {
        assert_allowed(&["https://example.com", "https://example.com/path?q=1"]);
    }

    #[test]
    fn validate_url_allows_http() {
        assert_allowed(&["http://example.com"]);
    }

    #[test]
    fn validate_url_blocks_non_http_schemes() {
        assert_blocked(&[
            "ftp://example.com",
            "file:///etc/passwd",
            "javascript:alert(1)",
            "data:text/html,<h1>hi</h1>",
        ]);
    }

    #[test]
    fn validate_url_blocks_localhost() {
        assert_blocked(&[
            "http://localhost",
            "http://localhost:8080",
            "http://127.0.0.1",
            "http://127.0.0.1:3000",
            "http://0.0.0.0",
            "http://[::1]",
            "http://[::1]:8080",
        ]);
    }

    #[test]
    fn validate_url_blocks_private_10() {
        assert_blocked(&["http://10.0.0.1", "http://10.255.255.255"]);
    }

    #[test]
    fn validate_url_blocks_private_192_168() {
        assert_blocked(&["http://192.168.0.1", "http://192.168.1.100:8080"]);
    }

    #[test]
    fn validate_url_blocks_private_172_16() {
        assert_blocked(&["http://172.16.0.1", "http://172.31.255.255"]);
        // 172.15 and 172.32 sit just outside 172.16.0.0/12 and are public.
        assert_allowed(&["http://172.15.0.1", "http://172.32.0.1"]);
    }

    #[test]
    fn validate_url_blocks_link_local() {
        assert_blocked(&[
            "http://169.254.0.1",
            "http://169.254.169.254", // AWS metadata
        ]);
    }

    #[test]
    fn validate_url_blocks_ipv6_private() {
        assert_blocked(&["http://[fd00::1]", "http://[fe80::1]"]);
    }

    #[test]
    fn validate_url_allows_public_ips() {
        assert_allowed(&["http://8.8.8.8", "https://93.184.216.34"]);
    }

    // -- LinkPreviewFetcher tests --

    #[tokio::test]
    async fn noop_fetcher_returns_none_without_network() {
        // Any URL would do: the Noop strategy returns before touching the network.
        let preview = LinkPreviewFetcher::Noop.fetch("https://example.com").await;
        assert!(preview.is_none());
    }
}