Skip to main content

max / balanced_breakfast

5.3 KB · 187 lines History Blame Raw
1 //! URL tracker parameter stripping.
2 //!
3 //! Removes known tracking parameters (utm_*, fbclid, gclid, etc.) from URLs
4 //! to improve privacy and reduce link clutter. Also provides an HTML rewriter
5 //! that cleans URLs inside `href` and `src` attributes.
6
7 use std::sync::LazyLock;
8
9 use regex::Regex;
10 use url::Url;
11
/// Name prefixes that mark a query parameter as a tracker.
///
/// A parameter is stripped when its name begins with any entry here; the
/// comparison ignores ASCII case. Entries must be written in lowercase.
const TRACKING_PREFIXES: &[&str] = &[
    "utm_",
];
15
/// Full query parameter names that are always treated as trackers.
///
/// Names are matched in their entirety, ignoring ASCII case. Entries must be
/// written in lowercase. They are grouped below by name family purely for
/// readability; the order has no effect on matching.
const TRACKING_PARAMS: &[&str] = &[
    // Click-identifier style names (`*clid` / `*clkid`).
    "fbclid",
    "gclid",
    "msclkid",
    "twclid",
    "dclid",
    "yclid",
    // `mc_*` family.
    "mc_cid",
    "mc_eid",
    // `oly_*` family.
    "oly_anon_id",
    "oly_enc_id",
    // `_hs*` family.
    "_hsenc",
    "_hsmi",
    // `hsa_*` family.
    "hsa_cam",
    "hsa_grp",
    "hsa_mt",
    "hsa_src",
    "hsa_ad",
    "hsa_acc",
    "hsa_net",
    "hsa_ver",
    "hsa_la",
    "hsa_ol",
    "hsa_kw",
    "hsa_tgt",
    // Stand-alone names.
    "_openstat",
    "vero_id",
    "wickedid",
    "zanpid",
];
47
48 /// Returns `true` if a query parameter name is a known tracker.
49 fn is_tracking_param(name: &str) -> bool {
50 let lower = name.to_ascii_lowercase();
51 if TRACKING_PARAMS.iter().any(|&p| lower == p) {
52 return true;
53 }
54 TRACKING_PREFIXES
55 .iter()
56 .any(|&prefix| lower.starts_with(prefix))
57 }
58
59 #[tracing::instrument(skip_all)]
60 /// Strip known tracking query parameters from a URL string
61 pub fn strip_tracking_params(url_str: &str) -> String {
62 let mut parsed = match Url::parse(url_str) {
63 Ok(u) => u,
64 Err(_) => return url_str.to_string(),
65 };
66
67 let clean_pairs: Vec<(String, String)> = parsed
68 .query_pairs()
69 .filter(|(name, _)| !is_tracking_param(name))
70 .map(|(k, v)| (k.into_owned(), v.into_owned()))
71 .collect();
72
73 // Clear and rebuild query string
74 if clean_pairs.is_empty() {
75 parsed.set_query(None);
76 } else {
77 parsed
78 .query_pairs_mut()
79 .clear()
80 .extend_pairs(clean_pairs);
81 }
82
83 parsed.to_string()
84 }
85
86 /// Regex matching `href="..."` / `href='...'` and `src="..."` / `src='...'`
87 /// attribute values in HTML (both double- and single-quoted).
88 static ATTR_URL_RE: LazyLock<Regex> = LazyLock::new(|| {
89 Regex::new(r#"(href|src)\s*=\s*(?:"([^"]+)"|'([^']+)')"#).expect("invalid regex")
90 });
91
92 #[tracing::instrument(skip_all)]
93 /// Strip tracking parameters from all `href` and `src` URLs in an HTML string
94 pub fn strip_tracking_from_html(html: &str) -> String {
95 ATTR_URL_RE
96 .replace_all(html, |caps: &regex::Captures| {
97 let attr = &caps[1];
98 // Group 2 = double-quoted value, group 3 = single-quoted value
99 let (url, quote) = if let Some(m) = caps.get(2) {
100 (m.as_str(), '"')
101 } else {
102 (caps.get(3).unwrap().as_str(), '\'')
103 };
104 let cleaned = strip_tracking_params(url);
105 format!("{}={}{}{}", attr, quote, cleaned, quote)
106 })
107 .into_owned()
108 }
109
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that cleaning the URL `input` produces exactly `expected`.
    fn check_url(input: &str, expected: &str) {
        assert_eq!(strip_tracking_params(input), expected);
    }

    /// Asserts that cleaning the HTML fragment `input` produces exactly `expected`.
    fn check_html(input: &str, expected: &str) {
        assert_eq!(strip_tracking_from_html(input), expected);
    }

    #[test]
    fn strips_utm_params() {
        check_url(
            "https://example.com/page?utm_source=twitter&utm_medium=social&id=42",
            "https://example.com/page?id=42",
        );
    }

    #[test]
    fn strips_fbclid() {
        check_url(
            "https://example.com/?fbclid=abc123&page=1",
            "https://example.com/?page=1",
        );
    }

    #[test]
    fn strips_gclid() {
        check_url(
            "https://example.com/?gclid=xyz&ref=home",
            "https://example.com/?ref=home",
        );
    }

    #[test]
    fn preserves_clean_params() {
        check_url(
            "https://example.com/search?q=rust&page=2",
            "https://example.com/search?q=rust&page=2",
        );
    }

    #[test]
    fn handles_no_query() {
        check_url("https://example.com/page", "https://example.com/page");
    }

    #[test]
    fn handles_all_tracking_removed() {
        check_url(
            "https://example.com/?utm_source=x&fbclid=y",
            "https://example.com/",
        );
    }

    #[test]
    fn handles_invalid_url() {
        // Unparseable input must come back untouched.
        check_url("not a url at all", "not a url at all");
    }

    #[test]
    fn strips_from_html_body() {
        check_html(
            r#"<a href="https://example.com/?utm_source=rss&id=1">click</a> and <img src="https://img.example.com/pic.jpg?fbclid=abc">"#,
            r#"<a href="https://example.com/?id=1">click</a> and <img src="https://img.example.com/pic.jpg">"#,
        );
    }

    #[test]
    fn html_preserves_clean_urls() {
        let html = r#"<a href="https://example.com/page">link</a>"#;
        check_html(html, html);
    }

    #[test]
    fn case_insensitive_param_match() {
        check_url(
            "https://example.com/?UTM_SOURCE=x&FBCLID=y&keep=z",
            "https://example.com/?keep=z",
        );
    }
}
187