Skip to main content

max / makenotwork

12.2 KB · 349 lines History Blame Raw
1 //! Layer 7: URLhaus URL-reputation lookups against strings extracted from
2 //! the uploaded file.
3 //!
4 //! Pulls printable-ASCII URL substrings out of the binary, dedupes by host,
5 //! and queries abuse.ch's URLhaus `host` endpoint for the first N hosts
6 //! (capped to keep us within free-tier limits and to bound per-upload work).
7 //! Any known-malicious host triggers `Fail`; clean lookups or no extracted
8 //! URLs trigger `Pass`; auth or network errors fail-open by policy.
9 //!
10 //! Like MalwareBazaar, URLhaus requires the abuse.ch `Auth-Key` header.
11 //! Same key powers both services — register at <https://auth.abuse.ch>.
12
13 use std::collections::HashSet;
14 use std::time::Duration;
15
16 use crate::constants;
17
18 use super::{ErrorPolicy, LayerResult, LayerVerdict};
19
20 /// External third-party network layer. Same reasoning as MalwareBazaar: an
21 /// outage at abuse.ch must not block every upload across the platform.
22 pub const ERROR_POLICY: ErrorPolicy = ErrorPolicy::FailOpen;
23
/// URLhaus host-lookup endpoint that each extracted host is POSTed to.
const URLHAUS_API_URL: &str = "https://urlhaus-api.abuse.ch/v1/host/";
/// Upper bound on hosts looked up per upload; keeps us inside free-tier
/// quotas and bounds the work done for a single file.
const MAX_HOSTS_PER_FILE: usize = 5;
/// Only the first 4 MiB of a file are scanned for URLs; URLs in the wild fit
/// easily inside this window.
const MAX_SCAN_BYTES: usize = 4 << 20;
/// Shortest run of URL-safe bytes that is considered a candidate string.
const MIN_RUN_LEN: usize = 6;
31
32 /// Check the file for URLs that appear in URLhaus's known-bad index.
33 pub async fn check_urlhaus(data: &[u8], auth_key: Option<&str>) -> LayerResult {
34 let Some(key) = auth_key else {
35 return LayerResult {
36 layer: "urlhaus",
37 verdict: LayerVerdict::Skip,
38 detail: Some("No abuse.ch Auth-Key configured".to_string()),
39 };
40 };
41
42 let hosts = extract_unique_hosts(data, MAX_HOSTS_PER_FILE);
43 if hosts.is_empty() {
44 return LayerResult {
45 layer: "urlhaus",
46 verdict: LayerVerdict::Pass,
47 detail: Some("No URLs extracted".to_string()),
48 };
49 }
50
51 let timeout = Duration::from_secs(constants::SCAN_MALWAREBAZAAR_TIMEOUT_SECS);
52 for host in &hosts {
53 match tokio::time::timeout(timeout, query_host(host, key)).await {
54 Ok(Ok(verdict)) => match verdict {
55 HostVerdict::Clean => continue,
56 HostVerdict::Bad(reason) => {
57 return LayerResult {
58 layer: "urlhaus",
59 verdict: LayerVerdict::Fail,
60 detail: Some(format!("Known-malicious host {host}: {reason}")),
61 };
62 }
63 HostVerdict::AuthError(detail) => {
64 return LayerResult {
65 layer: "urlhaus",
66 verdict: LayerVerdict::Error,
67 detail: Some(detail),
68 };
69 }
70 },
71 Ok(Err(e)) => {
72 return LayerResult {
73 layer: "urlhaus",
74 verdict: LayerVerdict::Error,
75 detail: Some(format!("URLhaus error: {e}")),
76 };
77 }
78 Err(_) => {
79 return LayerResult {
80 layer: "urlhaus",
81 verdict: LayerVerdict::Error,
82 detail: Some("URLhaus lookup timed out".to_string()),
83 };
84 }
85 }
86 }
87
88 LayerResult {
89 layer: "urlhaus",
90 verdict: LayerVerdict::Pass,
91 detail: Some(format!("Checked {} host(s); none known-bad", hosts.len())),
92 }
93 }
94
/// Outcome of a single URLhaus host lookup.
///
/// Derives `Debug`/`PartialEq` so verdicts can be logged and compared
/// directly (e.g. with `assert_eq!`) instead of only via `matches!`.
#[derive(Debug, Clone, PartialEq, Eq)]
enum HostVerdict {
    /// URLhaus has no usable record for the host.
    Clean,
    /// The host is listed; the payload is the reported threat label.
    Bad(String),
    /// abuse.ch rejected the Auth-Key; the payload describes the rejection.
    AuthError(String),
}
100
101 async fn query_host(host: &str, auth_key: &str) -> Result<HostVerdict, String> {
102 static CLIENT: std::sync::LazyLock<reqwest::Client> =
103 std::sync::LazyLock::new(reqwest::Client::new);
104 let client = &*CLIENT;
105
106 let params = [("host", host)];
107 let response = client
108 .post(URLHAUS_API_URL)
109 .header("Auth-Key", auth_key)
110 .form(&params)
111 .send()
112 .await
113 .map_err(|e| format!("HTTP request failed: {e}"))?;
114
115 let status = response.status();
116 if status.as_u16() == 401 || status.as_u16() == 403 {
117 return Ok(HostVerdict::AuthError(format!("HTTP {}", status.as_u16())));
118 }
119 if !status.is_success() {
120 return Err(format!("HTTP {} from URLhaus", status.as_u16()));
121 }
122 let body: serde_json::Value = response
123 .json()
124 .await
125 .map_err(|e| format!("Failed to parse response: {e}"))?;
126 Ok(classify_urlhaus_response(&body))
127 }
128
129 fn classify_urlhaus_response(body: &serde_json::Value) -> HostVerdict {
130 let query_status = body
131 .get("query_status")
132 .and_then(|v| v.as_str())
133 .unwrap_or("unknown");
134
135 match query_status {
136 // No record for this host.
137 "no_results" => HostVerdict::Clean,
138 // Host found in the URLhaus index.
139 "ok" => {
140 // Look at the urls array; report the threat label of the first
141 // entry if available. URLhaus puts the actual classification in
142 // `threat` / `tags` per-URL.
143 let threat = body
144 .get("urls")
145 .and_then(|u| u.get(0))
146 .and_then(|entry| entry.get("threat"))
147 .and_then(|t| t.as_str())
148 .unwrap_or("malicious");
149 HostVerdict::Bad(threat.to_string())
150 }
151 "unauthorized" | "key_required" | "key_invalid" => {
152 HostVerdict::AuthError(format!("abuse.ch auth: {query_status}"))
153 }
154 // URLhaus also returns "invalid_host" / "no_host_provided" — treat as
155 // clean for this layer's purposes (the host was malformed, not malicious).
156 "invalid_host" | "no_host_provided" => HostVerdict::Clean,
157 _ => HostVerdict::Clean, // unknown status from a known-degraded API: don't fail closed
158 }
159 }
160
161 /// Pull printable-ASCII URL hosts out of the byte buffer. Cap at `max` unique
162 /// hosts to bound per-upload work and free-tier quota use.
163 fn extract_unique_hosts(data: &[u8], max: usize) -> Vec<String> {
164 let scan = if data.len() > MAX_SCAN_BYTES { &data[..MAX_SCAN_BYTES] } else { data };
165
166 let mut hosts: HashSet<String> = HashSet::new();
167 let mut out: Vec<String> = Vec::new();
168
169 let mut start = 0usize;
170 while start < scan.len() {
171 // Find next printable run of length >= MIN_RUN_LEN.
172 while start < scan.len() && !is_url_char(scan[start]) {
173 start += 1;
174 }
175 let mut end = start;
176 while end < scan.len() && is_url_char(scan[end]) {
177 end += 1;
178 }
179 if end - start >= MIN_RUN_LEN {
180 // Cheap heuristic: only attempt URL parse if the run contains "://".
181 let bytes = &scan[start..end];
182 if let Some(idx) = find_scheme(bytes) {
183 let run = &bytes[idx..];
184 if let Ok(s) = std::str::from_utf8(run)
185 && let Some(host) = extract_host(s)
186 {
187 let host = host.to_ascii_lowercase();
188 if !hosts.contains(&host) {
189 hosts.insert(host.clone());
190 out.push(host);
191 if out.len() >= max {
192 return out;
193 }
194 }
195 }
196 }
197 }
198 start = end + 1;
199 }
200 out
201 }
202
/// Whether `b` may appear inside a URL run: ASCII letters and digits plus a
/// fixed set of URL punctuation.
///
/// Whitespace, quotes, brackets and every non-ASCII byte are excluded, so
/// they terminate a run. The set is permissive enough to catch real URLs and
/// strict enough to keep binary noise out.
fn is_url_char(b: u8) -> bool {
    const URL_PUNCTUATION: &[u8] = b"-.:/?&=#_%+,;~!*$@";
    b.is_ascii_alphanumeric() || URL_PUNCTUATION.contains(&b)
}
214
/// Locate the first `://` in `bytes` and return the index at which the scheme
/// in front of it begins.
///
/// The scheme is taken to be the longest stretch of ASCII alphanumerics, `+`,
/// `-` and `.` ending directly before the `://`; it may be empty, in which
/// case the index of the `://` itself is returned. Yields `None` when
/// `bytes` contains no `://`.
fn find_scheme(bytes: &[u8]) -> Option<usize> {
    let separator_at = bytes.windows(3).position(|w| w == b"://")?;

    // The scheme starts right after the last byte before the separator that
    // cannot belong to a scheme, or at 0 if every preceding byte can.
    let scheme_start = bytes[..separator_at]
        .iter()
        .rposition(|&b| !(b.is_ascii_alphanumeric() || matches!(b, b'+' | b'-' | b'.')))
        .map_or(0, |last_other| last_other + 1);

    Some(scheme_start)
}
225
/// Extract the host from a string that contains `scheme://authority…`.
///
/// The authority ends at the first `/`, `?` or `#`. Any `userinfo@` prefix
/// and `:port` suffix are removed, so `http://user:pw@evil.example:8080/x`
/// yields `evil.example`; without this a known-bad host could evade lookup
/// simply by adding a userinfo part. The host is then cut at the first
/// character that cannot occur in a hostname or IPv4 literal (anything other
/// than ASCII alphanumerics, `-`, `.`, `_`) and a trailing root dot is
/// dropped, so punctuation that merely followed the URL in the scanned text
/// (`https://evil.example,`) does not end up in the lookup.
///
/// Returns `None` when there is no `://`, or when the remaining host is empty
/// or has no dot (bare names such as `localhost` are not looked up).
fn extract_host(url_run: &str) -> Option<&str> {
    let (_, after_scheme) = url_run.split_once("://")?;
    let authority = after_scheme.split(['/', '?', '#']).next()?;

    // Userinfo is everything up to the LAST '@' of the authority.
    let host_port = authority
        .rsplit_once('@')
        .map_or(authority, |(_, rest)| rest);
    let host = host_port.split(':').next()?;

    let is_host_char = |c: char| c.is_ascii_alphanumeric() || matches!(c, '-' | '.' | '_');
    let end = host.find(|c: char| !is_host_char(c)).unwrap_or(host.len());
    let host = host[..end].trim_end_matches('.');

    if host.is_empty() || !host.contains('.') {
        return None;
    }
    Some(host)
}
236
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Host cap used by the extraction tests that are not about the cap.
    const CAP: usize = 5;

    // --- check_urlhaus: paths that never reach the network ---------------

    #[tokio::test]
    async fn no_auth_key_returns_skip() {
        let outcome = check_urlhaus(b"http://example.com/", None).await;
        assert_eq!(outcome.verdict, LayerVerdict::Skip);
    }

    #[tokio::test]
    async fn no_urls_in_data_passes() {
        // With an auth key, a buffer that contains no URLs should still pass
        // (there is nothing to look up).
        let outcome = check_urlhaus(b"plain binary blob no urls here", Some("test")).await;
        assert_eq!(outcome.verdict, LayerVerdict::Pass);
        let detail = outcome.detail.unwrap();
        assert!(detail.contains("No URLs"));
    }

    // --- extract_unique_hosts ---------------------------------------------

    #[test]
    fn extracts_http_url_host() {
        let found = extract_unique_hosts(
            b"\x00\x00\x00 random https://malicious.example.com/payload.bin trailing junk",
            CAP,
        );
        assert_eq!(found, vec!["malicious.example.com"]);
    }

    #[test]
    fn deduplicates_hosts_across_urls() {
        let found = extract_unique_hosts(
            b"https://a.example/x https://a.example/y http://b.example/z",
            CAP,
        );
        assert_eq!(found.len(), 2);
        for expected in ["a.example", "b.example"] {
            assert!(found.iter().any(|h| h == expected));
        }
    }

    #[test]
    fn respects_max_cap() {
        let six_hosts = b"https://a.example/ https://b.example/ https://c.example/ https://d.example/ https://e.example/ https://f.example/";
        assert_eq!(extract_unique_hosts(six_hosts, 3).len(), 3);
    }

    #[test]
    fn ignores_runs_without_scheme() {
        let found = extract_unique_hosts(b"justastring not a url at all 1234567890", CAP);
        assert!(found.is_empty());
    }

    #[test]
    fn rejects_hosts_without_dot() {
        let found = extract_unique_hosts(b"http://localhost/payload", CAP);
        assert!(found.is_empty());
    }

    #[test]
    fn ipv4_literal_host_extracted() {
        let found = extract_unique_hosts(b"http://192.168.1.1/get", CAP);
        assert_eq!(found, vec!["192.168.1.1"]);
    }

    #[test]
    fn lowercases_host() {
        let found = extract_unique_hosts(b"https://EvIl.ExAMPle.com/x", CAP);
        assert_eq!(found, vec!["evil.example.com"]);
    }

    // --- classify_urlhaus_response ----------------------------------------

    #[test]
    fn no_results_is_clean() {
        let verdict = classify_urlhaus_response(&json!({"query_status": "no_results"}));
        assert!(matches!(verdict, HostVerdict::Clean));
    }

    #[test]
    fn ok_response_is_bad() {
        let reply = json!({
            "query_status": "ok",
            "urls": [{"threat": "malware_download", "url": "http://bad.example/x"}]
        });
        let HostVerdict::Bad(reason) = classify_urlhaus_response(&reply) else {
            panic!("expected Bad");
        };
        assert!(reason.contains("malware_download"));
    }

    #[test]
    fn unauthorized_is_auth_error() {
        let reply = json!({"query_status": "unauthorized"});
        let HostVerdict::AuthError(detail) = classify_urlhaus_response(&reply) else {
            panic!("expected AuthError");
        };
        assert!(detail.contains("unauthorized"));
    }

    #[test]
    fn invalid_host_treated_as_clean() {
        let verdict = classify_urlhaus_response(&json!({"query_status": "invalid_host"}));
        assert!(matches!(verdict, HostVerdict::Clean));
    }

    #[test]
    fn unknown_status_defaults_to_clean() {
        // URLhaus is fail-open by policy. A response shape we don't recognize
        // shouldn't fail the upload — the layer aggregator's error counts
        // will surface this via the dashboard health panel separately.
        let verdict = classify_urlhaus_response(&json!({"query_status": "totally_new_thing"}));
        assert!(matches!(verdict, HostVerdict::Clean));
    }
}
349