| 1 |
|
| 2 |
|
| 3 |
|
| 4 |
|
| 5 |
|
| 6 |
|
| 7 |
use std::sync::LazyLock; |
| 8 |
|
| 9 |
use regex::Regex; |
| 10 |
use url::Url; |
| 11 |
|
| 12 |
|
| 13 |
|
| 14 |
/// Lowercase prefixes that mark a query-parameter name as tracking-related.
///
/// Any parameter whose name begins with one of these entries is treated as a
/// tracking parameter, whatever follows the prefix.
const TRACKING_PREFIXES: &[&str] = &[
    "utm_",
];
| 15 |
|
| 16 |
|
| 17 |
/// Lowercase query-parameter names that are treated as tracking parameters
/// when the whole name matches.
///
/// Entries are kept in lowercase; grouping below is by naming pattern only.
const TRACKING_PARAMS: &[&str] = &[
    // `*clid`-style click identifiers.
    "fbclid", "gclid", "msclkid", "twclid", "dclid",
    // `mc_*` pair.
    "mc_cid", "mc_eid",
    // `oly_*` pair.
    "oly_anon_id", "oly_enc_id",
    // Miscellaneous single identifiers.
    "_openstat", "vero_id", "wickedid", "yclid", "zanpid",
    // `_hs*` pair.
    "_hsenc", "_hsmi",
    // `hsa_*` family.
    "hsa_cam", "hsa_grp", "hsa_mt", "hsa_src", "hsa_ad", "hsa_acc",
    "hsa_net", "hsa_ver", "hsa_la", "hsa_ol", "hsa_kw", "hsa_tgt",
];
| 47 |
|
| 48 |
|
| 49 |
fn is_tracking_param(name: &str) -> bool { |
| 50 |
let lower = name.to_ascii_lowercase(); |
| 51 |
if TRACKING_PARAMS.iter().any(|&p| lower == p) { |
| 52 |
return true; |
| 53 |
} |
| 54 |
TRACKING_PREFIXES |
| 55 |
.iter() |
| 56 |
.any(|&prefix| lower.starts_with(prefix)) |
| 57 |
} |
| 58 |
|
| 59 |
#[tracing::instrument(skip_all)] |
| 60 |
|
| 61 |
pub fn strip_tracking_params(url_str: &str) -> String { |
| 62 |
let mut parsed = match Url::parse(url_str) { |
| 63 |
Ok(u) => u, |
| 64 |
Err(_) => return url_str.to_string(), |
| 65 |
}; |
| 66 |
|
| 67 |
let clean_pairs: Vec<(String, String)> = parsed |
| 68 |
.query_pairs() |
| 69 |
.filter(|(name, _)| !is_tracking_param(name)) |
| 70 |
.map(|(k, v)| (k.into_owned(), v.into_owned())) |
| 71 |
.collect(); |
| 72 |
|
| 73 |
|
| 74 |
if clean_pairs.is_empty() { |
| 75 |
parsed.set_query(None); |
| 76 |
} else { |
| 77 |
parsed |
| 78 |
.query_pairs_mut() |
| 79 |
.clear() |
| 80 |
.extend_pairs(clean_pairs); |
| 81 |
} |
| 82 |
|
| 83 |
parsed.to_string() |
| 84 |
} |
| 85 |
|
| 86 |
|
| 87 |
|
| 88 |
static ATTR_URL_RE: LazyLock<Regex> = LazyLock::new(|| { |
| 89 |
Regex::new(r#"(href|src)\s*=\s*(?:"([^"]+)"|'([^']+)')"#).expect("invalid regex") |
| 90 |
}); |
| 91 |
|
| 92 |
#[tracing::instrument(skip_all)] |
| 93 |
|
| 94 |
pub fn strip_tracking_from_html(html: &str) -> String { |
| 95 |
ATTR_URL_RE |
| 96 |
.replace_all(html, |caps: ®ex::Captures| { |
| 97 |
let attr = &caps[1]; |
| 98 |
|
| 99 |
let (url, quote) = if let Some(m) = caps.get(2) { |
| 100 |
(m.as_str(), '"') |
| 101 |
} else { |
| 102 |
(caps.get(3).unwrap().as_str(), '\'') |
| 103 |
}; |
| 104 |
let cleaned = strip_tracking_params(url); |
| 105 |
format!("{}={}{}{}", attr, quote, cleaned, quote) |
| 106 |
}) |
| 107 |
.into_owned() |
| 108 |
} |
| 109 |
|
| 110 |
/// Unit tests for URL and HTML tracking-parameter stripping.
#[cfg(test)]
mod tests {
    use super::*;

    // `utm_`-prefixed parameters are dropped, unrelated ones survive.
    #[test]
    fn strips_utm_params() {
        assert_eq!(
            strip_tracking_params(
                "https://example.com/page?utm_source=twitter&utm_medium=social&id=42"
            ),
            "https://example.com/page?id=42"
        );
    }

    #[test]
    fn strips_fbclid() {
        assert_eq!(
            strip_tracking_params("https://example.com/?fbclid=abc123&page=1"),
            "https://example.com/?page=1"
        );
    }

    #[test]
    fn strips_gclid() {
        assert_eq!(
            strip_tracking_params("https://example.com/?gclid=xyz&ref=home"),
            "https://example.com/?ref=home"
        );
    }

    // A query with no tracking parameters comes back unchanged.
    #[test]
    fn preserves_clean_params() {
        assert_eq!(
            strip_tracking_params("https://example.com/search?q=rust&page=2"),
            "https://example.com/search?q=rust&page=2"
        );
    }

    #[test]
    fn handles_no_query() {
        assert_eq!(
            strip_tracking_params("https://example.com/page"),
            "https://example.com/page"
        );
    }

    // When every parameter is tracking-related the `?` disappears as well.
    #[test]
    fn handles_all_tracking_removed() {
        assert_eq!(
            strip_tracking_params("https://example.com/?utm_source=x&fbclid=y"),
            "https://example.com/"
        );
    }

    // Input that does not parse as a URL is returned verbatim.
    #[test]
    fn handles_invalid_url() {
        assert_eq!(strip_tracking_params("not a url at all"), "not a url at all");
    }

    #[test]
    fn strips_from_html_body() {
        let input = r#"<a href="https://example.com/?utm_source=rss&id=1">click</a> and <img src="https://img.example.com/pic.jpg?fbclid=abc">"#;
        let expected = r#"<a href="https://example.com/?id=1">click</a> and <img src="https://img.example.com/pic.jpg">"#;
        assert_eq!(strip_tracking_from_html(input), expected);
    }

    #[test]
    fn html_preserves_clean_urls() {
        let input = r#"<a href="https://example.com/page">link</a>"#;
        assert_eq!(strip_tracking_from_html(input), input);
    }

    // Parameter names are matched without regard to ASCII case.
    #[test]
    fn case_insensitive_param_match() {
        assert_eq!(
            strip_tracking_params("https://example.com/?UTM_SOURCE=x&FBCLID=y&keep=z"),
            "https://example.com/?keep=z"
        );
    }
}
| 187 |
|