//! Layer 7: URLhaus URL-reputation lookups against strings extracted from
//! the uploaded file.
//!
//! Pulls printable-ASCII URL substrings out of the binary, dedupes by host,
//! and queries abuse.ch's URLhaus `host` endpoint for the first N hosts
//! (capped to keep us within free-tier limits and to bound per-upload work).
//! Any known-malicious host triggers `Fail`; clean lookups or no extracted
//! URLs trigger `Pass`; auth or network errors fail-open by policy.
//!
//! Like MalwareBazaar, URLhaus requires the abuse.ch `Auth-Key` header.
//! Same key powers both services — register at <https://auth.abuse.ch>.

use std::collections::HashSet;
use std::time::Duration;

use crate::constants;

use super::{ErrorPolicy, LayerResult, LayerVerdict};

/// External third-party network layer. Same reasoning as MalwareBazaar: an
/// outage at abuse.ch must not block every upload across the platform.
pub const ERROR_POLICY: ErrorPolicy = ErrorPolicy::FailOpen;

/// URLhaus host-lookup endpoint. Each extracted host is sent here as a
/// form-encoded POST carrying a single `host` field.
const URLHAUS_API_URL: &str = "https://urlhaus-api.abuse.ch/v1/host/";
/// Cap per-upload host lookups to keep within free-tier quotas and bound work.
const MAX_HOSTS_PER_FILE: usize = 5;
/// Only the first `MAX_SCAN_BYTES` bytes of an upload are scanned for URLs;
/// anything past this window is ignored to bound per-upload work.
const MAX_SCAN_BYTES: usize = 4 * 1024 * 1024;
/// Minimum length of a run of URL-legal bytes (see `is_url_char`) before the
/// run is inspected for a `://` separator. Shorter runs are discarded.
const MIN_RUN_LEN: usize = 6;

/// Check the file for URLs that appear in URLhaus's known-bad index.
pub async fn check_urlhaus(data: &[u8], auth_key: Option<&str>) -> LayerResult {
    let Some(key) = auth_key else {
        return LayerResult {
            layer: "urlhaus",
            verdict: LayerVerdict::Skip,
            detail: Some("No abuse.ch Auth-Key configured".to_string()),
        };
    };

    let hosts = extract_unique_hosts(data, MAX_HOSTS_PER_FILE);
    if hosts.is_empty() {
        return LayerResult {
            layer: "urlhaus",
            verdict: LayerVerdict::Pass,
            detail: Some("No URLs extracted".to_string()),
        };
    }

    let timeout = Duration::from_secs(constants::SCAN_MALWAREBAZAAR_TIMEOUT_SECS);
    for host in &hosts {
        match tokio::time::timeout(timeout, query_host(host, key)).await {
            Ok(Ok(verdict)) => match verdict {
                HostVerdict::Clean => continue,
                HostVerdict::Bad(reason) => {
                    return LayerResult {
                        layer: "urlhaus",
                        verdict: LayerVerdict::Fail,
                        detail: Some(format!("Known-malicious host {host}: {reason}")),
                    };
                }
                HostVerdict::AuthError(detail) => {
                    return LayerResult {
                        layer: "urlhaus",
                        verdict: LayerVerdict::Error,
                        detail: Some(detail),
                    };
                }
            },
            Ok(Err(e)) => {
                return LayerResult {
                    layer: "urlhaus",
                    verdict: LayerVerdict::Error,
                    detail: Some(format!("URLhaus error: {e}")),
                };
            }
            Err(_) => {
                return LayerResult {
                    layer: "urlhaus",
                    verdict: LayerVerdict::Error,
                    detail: Some("URLhaus lookup timed out".to_string()),
                };
            }
        }
    }

    LayerResult {
        layer: "urlhaus",
        verdict: LayerVerdict::Pass,
        detail: Some(format!("Checked {} host(s); none known-bad", hosts.len())),
    }
}

/// Outcome of a single URLhaus host lookup, as produced by `query_host` and
/// `classify_urlhaus_response`.
enum HostVerdict {
    /// URLhaus has no record of the host. Also used when URLhaus rejects the
    /// host as malformed or returns a status this layer does not recognise.
    Clean,
    /// The host is present in the URLhaus index. The payload is the `threat`
    /// label of the first listed URL, or `"malicious"` when none is available.
    Bad(String),
    /// The Auth-Key was rejected, either via HTTP 401/403 or via an
    /// auth-related `query_status`. The payload describes which.
    AuthError(String),
}

/// Look up one host in URLhaus.
///
/// Sends a form-encoded POST with a single `host` field and the `Auth-Key`
/// header to `URLHAUS_API_URL`, using one process-wide `reqwest::Client`.
///
/// Returns `Ok(HostVerdict::AuthError)` for HTTP 401/403, `Err` for any other
/// non-success status, transport failure, or unparseable body, and otherwise
/// the classification of the JSON body.
async fn query_host(host: &str, auth_key: &str) -> Result<HostVerdict, String> {
    // Created lazily on first use and shared by every subsequent lookup.
    static CLIENT: std::sync::LazyLock<reqwest::Client> =
        std::sync::LazyLock::new(reqwest::Client::new);

    let request = CLIENT
        .post(URLHAUS_API_URL)
        .header("Auth-Key", auth_key)
        .form(&[("host", host)]);
    let response = request
        .send()
        .await
        .map_err(|e| format!("HTTP request failed: {e}"))?;

    let status = response.status();
    let code = status.as_u16();
    match code {
        // Credential problems are reported as a verdict rather than an error
        // so the caller can surface them distinctly.
        401 | 403 => return Ok(HostVerdict::AuthError(format!("HTTP {}", code))),
        _ if !status.is_success() => return Err(format!("HTTP {} from URLhaus", code)),
        _ => {}
    }

    response
        .json::<serde_json::Value>()
        .await
        .map(|body| classify_urlhaus_response(&body))
        .map_err(|e| format!("Failed to parse response: {e}"))
}

/// Map a URLhaus host-lookup JSON body onto a `HostVerdict`.
///
/// The decision is driven solely by the top-level `query_status` string; a
/// missing or non-string status is treated like an unrecognised one.
fn classify_urlhaus_response(body: &serde_json::Value) -> HostVerdict {
    // Indexing a `Value` with a missing key or out-of-range index yields
    // `Null` rather than panicking, so absent fields fall through to defaults.
    let query_status = body["query_status"].as_str().unwrap_or("unknown");

    // Host found in the URLhaus index: report the threat label of the first
    // listed URL, falling back to a generic label when it is absent.
    if query_status == "ok" {
        let label = body["urls"][0]["threat"].as_str().unwrap_or("malicious");
        return HostVerdict::Bad(label.to_string());
    }

    if matches!(query_status, "unauthorized" | "key_required" | "key_invalid") {
        return HostVerdict::AuthError(format!("abuse.ch auth: {query_status}"));
    }

    // Everything else is clean for this layer's purposes:
    //   * "no_results"                         — no record for this host;
    //   * "invalid_host" / "no_host_provided"  — the host was malformed, not
    //     malicious;
    //   * any unrecognised status              — the layer is fail-open, so an
    //     unexpected response must not fail the upload.
    HostVerdict::Clean
}

/// Pull printable-ASCII URL hosts out of the byte buffer. Cap at `max` unique
/// hosts to bound per-upload work and free-tier quota use.
fn extract_unique_hosts(data: &[u8], max: usize) -> Vec<String> {
    let scan = if data.len() > MAX_SCAN_BYTES { &data[..MAX_SCAN_BYTES] } else { data };

    let mut hosts: HashSet<String> = HashSet::new();
    let mut out: Vec<String> = Vec::new();

    let mut start = 0usize;
    while start < scan.len() {
        // Find next printable run of length >= MIN_RUN_LEN.
        while start < scan.len() && !is_url_char(scan[start]) {
            start += 1;
        }
        let mut end = start;
        while end < scan.len() && is_url_char(scan[end]) {
            end += 1;
        }
        if end - start >= MIN_RUN_LEN {
            // Cheap heuristic: only attempt URL parse if the run contains "://".
            let bytes = &scan[start..end];
            if let Some(idx) = find_scheme(bytes) {
                let run = &bytes[idx..];
                if let Ok(s) = std::str::from_utf8(run)
                    && let Some(host) = extract_host(s)
                {
                    let host = host.to_ascii_lowercase();
                    if !hosts.contains(&host) {
                        hosts.insert(host.clone());
                        out.push(host);
                        if out.len() >= max {
                            return out;
                        }
                    }
                }
            }
        }
        start = end + 1;
    }
    out
}

/// Whether `b` may appear inside a URL run.
///
/// Accepts ASCII letters and digits plus a fixed set of punctuation.
/// Whitespace, quotes, brackets and other common delimiters are rejected so
/// that they terminate a run: permissive enough to catch real URLs, strict
/// enough to exclude noise.
fn is_url_char(b: u8) -> bool {
    const URL_PUNCTUATION: &[u8] = b"-.:/?&=#_%+,;~!*$@";
    b.is_ascii_alphanumeric() || URL_PUNCTUATION.contains(&b)
}

/// Locate the first `://` in `bytes` and return the index at which the scheme
/// in front of it begins.
///
/// The scheme is taken to be the longest stretch of ASCII alphanumerics, `+`,
/// `-` and `.` immediately preceding the separator; it may be empty, in which
/// case the index of the `://` itself is returned. Returns `None` when there
/// is no `://` at all.
fn find_scheme(bytes: &[u8]) -> Option<usize> {
    let separator_at = bytes.windows(3).position(|window| window == b"://")?;
    let scheme_len = bytes[..separator_at]
        .iter()
        .rev()
        .take_while(|&&b| b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.'))
        .count();
    Some(separator_at - scheme_len)
}

/// Extract the host from a URL-shaped string.
///
/// Takes everything after the first `://`, cuts the authority at the first
/// `/`, `?` or `#`, then removes any `userinfo@` prefix, any `:port` suffix
/// and any trailing root dot(s). The host is returned as a borrowed slice of
/// `url_run`, with its original letter case.
///
/// Returns `None` when there is no `://`, when the host is empty, or when the
/// host contains no `.` (single-label names such as `localhost` are not
/// worth a reputation lookup).
fn extract_host(url_run: &str) -> Option<&str> {
    let (_, after_scheme) = url_run.split_once("://")?;

    // The authority component ends at the first path, query or fragment
    // delimiter. `split` always yields at least one piece.
    let authority = after_scheme.split(['/', '?', '#']).next()?;

    // Strip `user[:password]@`. Without this, `http://x@evil.example/` would
    // yield `x@evil.example`, which never matches a reputation entry. The
    // last `@` is used because that is where the real host begins.
    let host_and_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, rest)| rest);

    // Strip `:port`, then the trailing root dot of a fully-qualified name so
    // `evil.example.` and `evil.example` resolve to the same lookup key.
    let host = host_and_port.split(':').next()?.trim_end_matches('.');

    if host.is_empty() || !host.contains('.') {
        return None;
    }
    Some(host)
}

// Unit tests. None of these reach the network: the two async tests exercise
// only the early-return paths of `check_urlhaus`, and the rest call the pure
// extraction and classification helpers directly.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    #[tokio::test]
    async fn no_auth_key_returns_skip() {
        // A missing key short-circuits before any extraction or lookup.
        let result = check_urlhaus(b"http://example.com/", None).await;
        assert_eq!(result.verdict, LayerVerdict::Skip);
    }

    #[tokio::test]
    async fn no_urls_in_data_passes() {
        // With an auth key, a buffer that contains no URLs should pass
        // without any lookup being attempted.
        let result = check_urlhaus(b"plain binary blob no urls here", Some("test")).await;
        assert_eq!(result.verdict, LayerVerdict::Pass);
        assert!(result.detail.unwrap().contains("No URLs"));
    }

    #[test]
    fn extracts_http_url_host() {
        // NUL bytes and spaces terminate runs, isolating the URL.
        let data = b"\x00\x00\x00 random  https://malicious.example.com/payload.bin trailing junk";
        let hosts = extract_unique_hosts(data, 5);
        assert_eq!(hosts, vec!["malicious.example.com"]);
    }

    #[test]
    fn deduplicates_hosts_across_urls() {
        let data = b"https://a.example/x https://a.example/y http://b.example/z";
        let hosts = extract_unique_hosts(data, 5);
        assert_eq!(hosts.len(), 2);
        assert!(hosts.iter().any(|h| h == "a.example"));
        assert!(hosts.iter().any(|h| h == "b.example"));
    }

    #[test]
    fn respects_max_cap() {
        // Six distinct hosts in the input, but only `max` may be returned.
        let data = b"https://a.example/ https://b.example/ https://c.example/ https://d.example/ https://e.example/ https://f.example/";
        let hosts = extract_unique_hosts(data, 3);
        assert_eq!(hosts.len(), 3);
    }

    #[test]
    fn ignores_runs_without_scheme() {
        let data = b"justastring not a url at all 1234567890";
        let hosts = extract_unique_hosts(data, 5);
        assert!(hosts.is_empty());
    }

    #[test]
    fn rejects_hosts_without_dot() {
        // Single-label hosts such as `localhost` are dropped by `extract_host`.
        let data = b"http://localhost/payload";
        let hosts = extract_unique_hosts(data, 5);
        assert!(hosts.is_empty());
    }

    #[test]
    fn ipv4_literal_host_extracted() {
        let data = b"http://192.168.1.1/get";
        let hosts = extract_unique_hosts(data, 5);
        assert_eq!(hosts, vec!["192.168.1.1"]);
    }

    #[test]
    fn lowercases_host() {
        let data = b"https://EvIl.ExAMPle.com/x";
        let hosts = extract_unique_hosts(data, 5);
        assert_eq!(hosts, vec!["evil.example.com"]);
    }

    #[test]
    fn no_results_is_clean() {
        let body = json!({"query_status": "no_results"});
        assert!(matches!(classify_urlhaus_response(&body), HostVerdict::Clean));
    }

    #[test]
    fn ok_response_is_bad() {
        // The reason string comes from the first URL entry's `threat` field.
        let body = json!({
            "query_status": "ok",
            "urls": [{"threat": "malware_download", "url": "http://bad.example/x"}]
        });
        match classify_urlhaus_response(&body) {
            HostVerdict::Bad(reason) => assert!(reason.contains("malware_download")),
            _ => panic!("expected Bad"),
        }
    }

    #[test]
    fn unauthorized_is_auth_error() {
        let body = json!({"query_status": "unauthorized"});
        match classify_urlhaus_response(&body) {
            HostVerdict::AuthError(d) => assert!(d.contains("unauthorized")),
            _ => panic!("expected AuthError"),
        }
    }

    #[test]
    fn invalid_host_treated_as_clean() {
        let body = json!({"query_status": "invalid_host"});
        assert!(matches!(classify_urlhaus_response(&body), HostVerdict::Clean));
    }

    #[test]
    fn unknown_status_defaults_to_clean() {
        // URLhaus is fail-open by policy. A response shape we don't recognize
        // shouldn't fail the upload — the layer aggregator's error counts
        // will surface this via the dashboard health panel separately.
        let body = json!({"query_status": "totally_new_thing"});
        assert!(matches!(classify_urlhaus_response(&body), HostVerdict::Clean));
    }
}