//! URL tracker parameter stripping. //! //! Removes known tracking parameters (utm_*, fbclid, gclid, etc.) from URLs //! to improve privacy and reduce link clutter. Also provides an HTML rewriter //! that cleans URLs inside `href` and `src` attributes. use std::sync::LazyLock; use regex::Regex; use url::Url; /// Tracking parameter prefixes — any query parameter whose name starts with one /// of these (case-insensitive) is stripped. const TRACKING_PREFIXES: &[&str] = &["utm_"]; /// Tracking parameter exact names (case-insensitive). const TRACKING_PARAMS: &[&str] = &[ "fbclid", "gclid", "msclkid", "twclid", "dclid", "mc_cid", "mc_eid", "oly_anon_id", "oly_enc_id", "_openstat", "vero_id", "wickedid", "yclid", "zanpid", "_hsenc", "_hsmi", "hsa_cam", "hsa_grp", "hsa_mt", "hsa_src", "hsa_ad", "hsa_acc", "hsa_net", "hsa_ver", "hsa_la", "hsa_ol", "hsa_kw", "hsa_tgt", ]; /// Returns `true` if a query parameter name is a known tracker. fn is_tracking_param(name: &str) -> bool { let lower = name.to_ascii_lowercase(); if TRACKING_PARAMS.iter().any(|&p| lower == p) { return true; } TRACKING_PREFIXES .iter() .any(|&prefix| lower.starts_with(prefix)) } #[tracing::instrument(skip_all)] /// Strip known tracking query parameters from a URL string pub fn strip_tracking_params(url_str: &str) -> String { let mut parsed = match Url::parse(url_str) { Ok(u) => u, Err(_) => return url_str.to_string(), }; let clean_pairs: Vec<(String, String)> = parsed .query_pairs() .filter(|(name, _)| !is_tracking_param(name)) .map(|(k, v)| (k.into_owned(), v.into_owned())) .collect(); // Clear and rebuild query string if clean_pairs.is_empty() { parsed.set_query(None); } else { parsed .query_pairs_mut() .clear() .extend_pairs(clean_pairs); } parsed.to_string() } /// Regex matching `href="..."` / `href='...'` and `src="..."` / `src='...'` /// attribute values in HTML (both double- and single-quoted). static ATTR_URL_RE: LazyLock = LazyLock::new(|| { Regex::new(r#"(href|src)\s*=\s*(?:"([^"]+)"|'([^']+)')"#).expect("invalid regex") }); #[tracing::instrument(skip_all)] /// Strip tracking parameters from all `href` and `src` URLs in an HTML string pub fn strip_tracking_from_html(html: &str) -> String { ATTR_URL_RE .replace_all(html, |caps: ®ex::Captures| { let attr = &caps[1]; // Group 2 = double-quoted value, group 3 = single-quoted value let (url, quote) = if let Some(m) = caps.get(2) { (m.as_str(), '"') } else { (caps.get(3).unwrap().as_str(), '\'') }; let cleaned = strip_tracking_params(url); format!("{}={}{}{}", attr, quote, cleaned, quote) }) .into_owned() } #[cfg(test)] mod tests { use super::*; #[test] fn strips_utm_params() { let url = "https://example.com/page?utm_source=twitter&utm_medium=social&id=42"; let cleaned = strip_tracking_params(url); assert_eq!(cleaned, "https://example.com/page?id=42"); } #[test] fn strips_fbclid() { let url = "https://example.com/?fbclid=abc123&page=1"; let cleaned = strip_tracking_params(url); assert_eq!(cleaned, "https://example.com/?page=1"); } #[test] fn strips_gclid() { let url = "https://example.com/?gclid=xyz&ref=home"; let cleaned = strip_tracking_params(url); assert_eq!(cleaned, "https://example.com/?ref=home"); } #[test] fn preserves_clean_params() { let url = "https://example.com/search?q=rust&page=2"; let cleaned = strip_tracking_params(url); assert_eq!(cleaned, "https://example.com/search?q=rust&page=2"); } #[test] fn handles_no_query() { let url = "https://example.com/page"; let cleaned = strip_tracking_params(url); assert_eq!(cleaned, "https://example.com/page"); } #[test] fn handles_all_tracking_removed() { let url = "https://example.com/?utm_source=x&fbclid=y"; let cleaned = strip_tracking_params(url); assert_eq!(cleaned, "https://example.com/"); } #[test] fn handles_invalid_url() { let url = "not a url at all"; let cleaned = strip_tracking_params(url); assert_eq!(cleaned, "not a url at all"); } #[test] fn strips_from_html_body() { let html = r#"click and "#; let cleaned = strip_tracking_from_html(html); assert_eq!( cleaned, r#"click and "# ); } #[test] fn html_preserves_clean_urls() { let html = r#"link"#; let cleaned = strip_tracking_from_html(html); assert_eq!(cleaned, html); } #[test] fn case_insensitive_param_match() { let url = "https://example.com/?UTM_SOURCE=x&FBCLID=y&keep=z"; let cleaned = strip_tracking_params(url); assert_eq!(cleaned, "https://example.com/?keep=z"); } }