//! Layer 7: URLhaus URL-reputation lookups against strings extracted from //! the uploaded file. //! //! Pulls printable-ASCII URL substrings out of the binary, dedupes by host, //! and queries abuse.ch's URLhaus `host` endpoint for the first N hosts //! (capped to keep us within free-tier limits and to bound per-upload work). //! Any known-malicious host triggers `Fail`; clean lookups or no extracted //! URLs trigger `Pass`; auth or network errors fail-open by policy. //! //! Like MalwareBazaar, URLhaus requires the abuse.ch `Auth-Key` header. //! Same key powers both services — register at . use std::collections::HashSet; use std::time::Duration; use crate::constants; use super::{ErrorPolicy, LayerResult, LayerVerdict}; /// External third-party network layer. Same reasoning as MalwareBazaar: an /// outage at abuse.ch must not block every upload across the platform. pub const ERROR_POLICY: ErrorPolicy = ErrorPolicy::FailOpen; const URLHAUS_API_URL: &str = "https://urlhaus-api.abuse.ch/v1/host/"; /// Cap per-upload host lookups to keep within free-tier quotas and bound work. const MAX_HOSTS_PER_FILE: usize = 5; /// Cap the byte window we scan for URLs; URLs in the wild fit easily here. const MAX_SCAN_BYTES: usize = 4 * 1024 * 1024; /// Minimum printable-ASCII run length to consider as a potential string. const MIN_RUN_LEN: usize = 6; /// Check the file for URLs that appear in URLhaus's known-bad index. pub async fn check_urlhaus(data: &[u8], auth_key: Option<&str>) -> LayerResult { let Some(key) = auth_key else { return LayerResult { layer: "urlhaus", verdict: LayerVerdict::Skip, detail: Some("No abuse.ch Auth-Key configured".to_string()), }; }; let hosts = extract_unique_hosts(data, MAX_HOSTS_PER_FILE); if hosts.is_empty() { return LayerResult { layer: "urlhaus", verdict: LayerVerdict::Pass, detail: Some("No URLs extracted".to_string()), }; } let timeout = Duration::from_secs(constants::SCAN_MALWAREBAZAAR_TIMEOUT_SECS); for host in &hosts { match tokio::time::timeout(timeout, query_host(host, key)).await { Ok(Ok(verdict)) => match verdict { HostVerdict::Clean => continue, HostVerdict::Bad(reason) => { return LayerResult { layer: "urlhaus", verdict: LayerVerdict::Fail, detail: Some(format!("Known-malicious host {host}: {reason}")), }; } HostVerdict::AuthError(detail) => { return LayerResult { layer: "urlhaus", verdict: LayerVerdict::Error, detail: Some(detail), }; } }, Ok(Err(e)) => { return LayerResult { layer: "urlhaus", verdict: LayerVerdict::Error, detail: Some(format!("URLhaus error: {e}")), }; } Err(_) => { return LayerResult { layer: "urlhaus", verdict: LayerVerdict::Error, detail: Some("URLhaus lookup timed out".to_string()), }; } } } LayerResult { layer: "urlhaus", verdict: LayerVerdict::Pass, detail: Some(format!("Checked {} host(s); none known-bad", hosts.len())), } } enum HostVerdict { Clean, Bad(String), AuthError(String), } async fn query_host(host: &str, auth_key: &str) -> Result { static CLIENT: std::sync::LazyLock = std::sync::LazyLock::new(reqwest::Client::new); let client = &*CLIENT; let params = [("host", host)]; let response = client .post(URLHAUS_API_URL) .header("Auth-Key", auth_key) .form(¶ms) .send() .await .map_err(|e| format!("HTTP request failed: {e}"))?; let status = response.status(); if status.as_u16() == 401 || status.as_u16() == 403 { return Ok(HostVerdict::AuthError(format!("HTTP {}", status.as_u16()))); } if !status.is_success() { return Err(format!("HTTP {} from URLhaus", status.as_u16())); } let body: serde_json::Value = response .json() .await .map_err(|e| format!("Failed to parse response: {e}"))?; Ok(classify_urlhaus_response(&body)) } fn classify_urlhaus_response(body: &serde_json::Value) -> HostVerdict { let query_status = body .get("query_status") .and_then(|v| v.as_str()) .unwrap_or("unknown"); match query_status { // No record for this host. "no_results" => HostVerdict::Clean, // Host found in the URLhaus index. "ok" => { // Look at the urls array; report the threat label of the first // entry if available. URLhaus puts the actual classification in // `threat` / `tags` per-URL. let threat = body .get("urls") .and_then(|u| u.get(0)) .and_then(|entry| entry.get("threat")) .and_then(|t| t.as_str()) .unwrap_or("malicious"); HostVerdict::Bad(threat.to_string()) } "unauthorized" | "key_required" | "key_invalid" => { HostVerdict::AuthError(format!("abuse.ch auth: {query_status}")) } // URLhaus also returns "invalid_host" / "no_host_provided" — treat as // clean for this layer's purposes (the host was malformed, not malicious). "invalid_host" | "no_host_provided" => HostVerdict::Clean, _ => HostVerdict::Clean, // unknown status from a known-degraded API: don't fail closed } } /// Pull printable-ASCII URL hosts out of the byte buffer. Cap at `max` unique /// hosts to bound per-upload work and free-tier quota use. fn extract_unique_hosts(data: &[u8], max: usize) -> Vec { let scan = if data.len() > MAX_SCAN_BYTES { &data[..MAX_SCAN_BYTES] } else { data }; let mut hosts: HashSet = HashSet::new(); let mut out: Vec = Vec::new(); let mut start = 0usize; while start < scan.len() { // Find next printable run of length >= MIN_RUN_LEN. while start < scan.len() && !is_url_char(scan[start]) { start += 1; } let mut end = start; while end < scan.len() && is_url_char(scan[end]) { end += 1; } if end - start >= MIN_RUN_LEN { // Cheap heuristic: only attempt URL parse if the run contains "://". let bytes = &scan[start..end]; if let Some(idx) = find_scheme(bytes) { let run = &bytes[idx..]; if let Ok(s) = std::str::from_utf8(run) && let Some(host) = extract_host(s) { let host = host.to_ascii_lowercase(); if !hosts.contains(&host) { hosts.insert(host.clone()); out.push(host); if out.len() >= max { return out; } } } } } start = end + 1; } out } fn is_url_char(b: u8) -> bool { // Printable ASCII excluding whitespace and common delimiters that would // break a URL run. Permissive enough to catch real URLs, strict enough to // exclude noise. matches!( b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'-' | b'.' | b':' | b'/' | b'?' | b'&' | b'=' | b'#' | b'_' | b'%' | b'+' | b',' | b';' | b'~' | b'!' | b'*' | b'$' | b'@' ) } fn find_scheme(bytes: &[u8]) -> Option { // Find ":/" position then back up to the scheme start. bytes.windows(3).position(|w| w == b"://").map(|colon_at| { let mut start = colon_at; while start > 0 && (bytes[start - 1].is_ascii_alphanumeric() || bytes[start - 1] == b'+' || bytes[start - 1] == b'-' || bytes[start - 1] == b'.') { start -= 1; } start }) } fn extract_host(url_run: &str) -> Option<&str> { let after_scheme = url_run.split_once("://")?.1; let host = after_scheme .split(['/', '?', '#', ':']) .next()?; if host.is_empty() || !host.contains('.') { return None; } Some(host) } #[cfg(test)] mod tests { use super::*; use serde_json::json; #[tokio::test] async fn no_auth_key_returns_skip() { let result = check_urlhaus(b"http://example.com/", None).await; assert_eq!(result.verdict, LayerVerdict::Skip); } #[tokio::test] async fn no_urls_in_data_passes() { // With an auth key, an empty buffer should still pass (no URLs to check). let result = check_urlhaus(b"plain binary blob no urls here", Some("test")).await; assert_eq!(result.verdict, LayerVerdict::Pass); assert!(result.detail.unwrap().contains("No URLs")); } #[test] fn extracts_http_url_host() { let data = b"\x00\x00\x00 random https://malicious.example.com/payload.bin trailing junk"; let hosts = extract_unique_hosts(data, 5); assert_eq!(hosts, vec!["malicious.example.com"]); } #[test] fn deduplicates_hosts_across_urls() { let data = b"https://a.example/x https://a.example/y http://b.example/z"; let hosts = extract_unique_hosts(data, 5); assert_eq!(hosts.len(), 2); assert!(hosts.iter().any(|h| h == "a.example")); assert!(hosts.iter().any(|h| h == "b.example")); } #[test] fn respects_max_cap() { let data = b"https://a.example/ https://b.example/ https://c.example/ https://d.example/ https://e.example/ https://f.example/"; let hosts = extract_unique_hosts(data, 3); assert_eq!(hosts.len(), 3); } #[test] fn ignores_runs_without_scheme() { let data = b"justastring not a url at all 1234567890"; let hosts = extract_unique_hosts(data, 5); assert!(hosts.is_empty()); } #[test] fn rejects_hosts_without_dot() { let data = b"http://localhost/payload"; let hosts = extract_unique_hosts(data, 5); assert!(hosts.is_empty()); } #[test] fn ipv4_literal_host_extracted() { let data = b"http://192.168.1.1/get"; let hosts = extract_unique_hosts(data, 5); assert_eq!(hosts, vec!["192.168.1.1"]); } #[test] fn lowercases_host() { let data = b"https://EvIl.ExAMPle.com/x"; let hosts = extract_unique_hosts(data, 5); assert_eq!(hosts, vec!["evil.example.com"]); } #[test] fn no_results_is_clean() { let body = json!({"query_status": "no_results"}); assert!(matches!(classify_urlhaus_response(&body), HostVerdict::Clean)); } #[test] fn ok_response_is_bad() { let body = json!({ "query_status": "ok", "urls": [{"threat": "malware_download", "url": "http://bad.example/x"}] }); match classify_urlhaus_response(&body) { HostVerdict::Bad(reason) => assert!(reason.contains("malware_download")), _ => panic!("expected Bad"), } } #[test] fn unauthorized_is_auth_error() { let body = json!({"query_status": "unauthorized"}); match classify_urlhaus_response(&body) { HostVerdict::AuthError(d) => assert!(d.contains("unauthorized")), _ => panic!("expected AuthError"), } } #[test] fn invalid_host_treated_as_clean() { let body = json!({"query_status": "invalid_host"}); assert!(matches!(classify_urlhaus_response(&body), HostVerdict::Clean)); } #[test] fn unknown_status_defaults_to_clean() { // URLhaus is fail-open by policy. A response shape we don't recognize // shouldn't fail the upload — the layer aggregator's error counts // will surface this via the dashboard health panel separately. let body = json!({"query_status": "totally_new_thing"}); assert!(matches!(classify_urlhaus_response(&body), HostVerdict::Clean)); } }