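//! Allow-list HTML sanitization built on `ammonia`.
//!
//! [`sanitize_html`] keeps only the tags and attributes listed in this module,
//! passes every URL-bearing attribute through the `url_filter` helpers, and
//! reports each URL it refuses as a [`Rejection`].
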
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};

use super::url_filter::{UrlPolicy, resolve_internal_url, resolve_srcset};
use super::Rejection;

/// Element names that are allowed to remain in sanitized output.
///
/// `sanitize_html` hands this list to `ammonia::Builder::tags`. Which attributes
/// each element may carry is decided separately.
const ALLOWED_TAGS: &[&str] = &[
    "a", "abbr", "article", "aside", "b", "blockquote", "br", "caption", "cite",
    "code", "col", "colgroup", "dd", "details", "div", "dl", "dt", "em",
    "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5", "h6",
    "header", "hr", "i", "img", "kbd", "li", "main", "mark", "nav", "ol", "p",
    "picture", "pre", "q", "s", "samp", "section", "small", "source", "span",
    "strong", "sub", "summary", "sup", "table", "tbody", "td", "tfoot", "th",
    "thead", "time", "tr", "u", "ul", "video", "audio", "track",
];

/// Attributes accepted on every allowed element.
///
/// `sanitize_html` passes this list to `ammonia::Builder::generic_attributes`.
const GENERIC_ATTRS: &[&str] = &["class", "id", "title", "lang", "dir"];

/// Elements whose content is removed together with the element itself.
///
/// `sanitize_html` passes this list to `ammonia::Builder::clean_content_tags`.
///
/// NOTE(review): ammonia's documentation says this set must not overlap with the
/// allowed tags — keep the two lists disjoint when editing either one.
const CLEAN_CONTENT_TAGS: &[&str] = &[
    "script", "style", "iframe", "object", "embed", "noscript", "template",
    "svg", "math", "frame", "frameset", "head", "title", "base", "meta",
    "link", "applet", "param", "canvas", "form", "input", "button", "select",
    "textarea",
];

/// Builds the per-element attribute allow-list, keyed by tag name.
///
/// Each entry lists the attributes permitted on that element beyond the
/// generic ones. Elements without an entry get no element-specific attributes.
fn tag_attributes() -> HashMap<&'static str, HashSet<&'static str>> {
    /// Collects a slice of attribute names into an owned set.
    fn allow(names: &[&'static str]) -> HashSet<&'static str> {
        names.iter().copied().collect()
    }

    HashMap::from([
        ("a", allow(&["href"])),
        ("img", allow(&["src", "alt", "width", "height", "loading", "srcset"])),
        ("source", allow(&["src", "srcset", "type", "media", "width", "height"])),
        // `autoplay` is absent from both media elements; `loop` and `muted`
        // are listed for `video` only, so they are not allowed on `audio`.
        (
            "video",
            allow(&["src", "controls", "loop", "muted", "poster", "preload", "width", "height"]),
        ),
        ("audio", allow(&["src", "controls", "preload"])),
        ("track", allow(&["src", "kind", "srclang", "label", "default"])),
        ("time", allow(&["datetime"])),
        ("th", allow(&["colspan", "rowspan", "scope"])),
        ("td", allow(&["colspan", "rowspan", "scope"])),
        ("col", allow(&["span"])),
        ("colgroup", allow(&["span"])),
        ("details", allow(&["open"])),
        ("ol", allow(&["start", "reversed"])),
    ])
}

/// Shape of the value held by a URL-bearing attribute.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum UrlAttr {
    /// The value is a single URL (`href`, `src`, `poster`).
    Single,
    /// The value is a `srcset` candidate list.
    SrcSet,
}

/// Classifies `attribute` on `element` as URL-bearing.
///
/// Returns the [`UrlAttr`] kind for the element/attribute pairs listed below and
/// `None` for every other pair. Matching is exact, so both names must be given
/// in lowercase.
fn url_attribute(element: &str, attribute: &str) -> Option<UrlAttr> {
    match (element, attribute) {
        ("a", "href") | ("video", "poster") => Some(UrlAttr::Single),
        ("img" | "source" | "video" | "audio" | "track", "src") => Some(UrlAttr::Single),
        ("img" | "source", "srcset") => Some(UrlAttr::SrcSet),
        _ => None,
    }
}

pub fn sanitize_html(input: &str, policy: &UrlPolicy) -> (String, Vec<Rejection>) { |
| 86 |
let rejections: Arc<Mutex<Vec<Rejection>>> = Arc::new(Mutex::new(Vec::new())); |
| 87 |
|
| 88 |
let tags: HashSet<&str> = ALLOWED_TAGS.iter().copied().collect(); |
| 89 |
let generic: HashSet<&str> = GENERIC_ATTRS.iter().copied().collect(); |
| 90 |
let clean_content: HashSet<&str> = CLEAN_CONTENT_TAGS.iter().copied().collect(); |
| 91 |
let per_tag = tag_attributes(); |
| 92 |
|
| 93 |
let filter_policy = policy.clone(); |
| 94 |
let filter_sink = Arc::clone(&rejections); |
| 95 |
|
| 96 |
let mut builder = ammonia::Builder::default(); |
| 97 |
builder |
| 98 |
.tags(tags) |
| 99 |
.generic_attributes(generic) |
| 100 |
.tag_attributes(per_tag) |
| 101 |
.clean_content_tags(clean_content) |
| 102 |
|
| 103 |
|
| 104 |
|
| 105 |
|
| 106 |
|
| 107 |
.url_schemes(HashSet::from([ |
| 108 |
"https", "http", "data", "javascript", "mailto", "ftp", "blob", "file", "vbscript", |
| 109 |
])) |
| 110 |
.url_relative(ammonia::UrlRelative::PassThrough) |
| 111 |
|
| 112 |
|
| 113 |
.link_rel(Some("nofollow ugc")) |
| 114 |
.strip_comments(true) |
| 115 |
.attribute_filter(move |element, attribute, value| { |
| 116 |
match url_attribute(element, attribute) { |
| 117 |
None => Some(Cow::Borrowed(value)), |
| 118 |
Some(kind) => { |
| 119 |
let location = format!("{element} {attribute}"); |
| 120 |
let result = match kind { |
| 121 |
UrlAttr::Single => resolve_internal_url(value, &filter_policy, &location), |
| 122 |
UrlAttr::SrcSet => resolve_srcset(value, &filter_policy, &location), |
| 123 |
}; |
| 124 |
match result { |
| 125 |
Ok(v) => Some(Cow::Owned(v)), |
| 126 |
Err(rej) => { |
| 127 |
filter_sink.lock().expect("rejection sink poisoned").push(rej); |
| 128 |
None |
| 129 |
} |
| 130 |
} |
| 131 |
} |
| 132 |
} |
| 133 |
}); |
| 134 |
|
| 135 |
let cleaned = builder.clean(input).to_string(); |
| 136 |
let collected = std::mem::take(&mut *rejections.lock().expect("rejection sink poisoned")); |
| 137 |
(cleaned, collected) |
| 138 |
} |
| 139 |
|
| 140 |
#[cfg(test)]
mod tests {
    //! Example-based tests for `sanitize_html`.

    use super::super::RejectionKind;
    use super::*;

    /// Policy shared by all tests in this module.
    fn policy() -> UrlPolicy {
        UrlPolicy::new(
            "https://u.makenot.work/alice/proj",
            ["makenot.work".to_string(), "u.makenot.work".to_string(), "cdn.makenot.work".to_string()],
        )
        .unwrap()
    }

    /// Sanitizes `html` with the test policy and returns only the cleaned markup.
    fn clean(html: &str) -> String {
        sanitize_html(html, &policy()).0
    }

    #[test]
    fn keeps_allowed_structure() {
        let out = clean("<section><h1 class=\"t\">Hi</h1><p>Hello <strong>world</strong></p></section>");
        assert!(out.contains("<section>"));
        assert!(out.contains("<h1 class=\"t\">"));
        assert!(out.contains("<strong>world</strong>"));
    }

    #[test]
    fn strips_script_and_its_content() {
        let out = clean("<p>ok</p><script>alert(1)</script>");
        assert!(out.contains("ok"));
        assert!(!out.contains("alert"));
        assert!(!out.contains("<script"));
    }

    #[test]
    fn strips_style_tag_and_content() {
        let out = clean("<style>body{display:none}</style><p>hi</p>");
        assert!(!out.to_lowercase().contains("display"));
        assert!(out.contains("hi"));
    }

    #[test]
    fn strips_inline_style_attribute() {
        let out = clean("<p style=\"color:red\">x</p>");
        assert!(!out.contains("style"));
        assert!(out.contains("<p>x</p>"));
    }

    #[test]
    fn strips_event_handlers() {
        let out = clean("<div onclick=\"steal()\">x</div>");
        assert!(!out.to_lowercase().contains("onclick"));
        assert!(!out.contains("steal"));
    }

    #[test]
    fn strips_iframe_object_embed_form() {
        for tag in ["iframe", "object", "embed", "form"] {
            let out = clean(&format!("<{tag}>x</{tag}><p>keep</p>"));
            assert!(!out.contains(&format!("<{tag}")), "{tag} must be stripped");
            assert!(out.contains("keep"));
        }
    }

    #[test]
    fn rejects_external_image_src_and_records_it() {
        let (out, rej) = sanitize_html("<img src=\"https://evil.com/x.png\" alt=\"a\">", &policy());
        assert!(!out.contains("evil.com"));
        assert_eq!(rej.len(), 1);
        assert!(matches!(rej[0].kind, RejectionKind::ExternalUrl));
        assert_eq!(rej[0].location, "img src");
    }

    #[test]
    fn keeps_internal_and_relative_media() {
        let out = clean("<img src=\"/static/p.png\" alt=\"a\"><img src=\"https://cdn.makenot.work/b\" alt=\"b\">");
        assert!(out.contains("/static/p.png"));
        assert!(out.contains("cdn.makenot.work/b"));
    }

    #[test]
    fn drops_javascript_href_and_records() {
        let (out, rej) = sanitize_html("<a href=\"javascript:alert(1)\">x</a>", &policy());
        assert!(!out.to_lowercase().contains("javascript"));
        assert!(rej.iter().any(|r| matches!(r.kind, RejectionKind::DisallowedScheme)));
    }

    #[test]
    fn anchors_get_nofollow_ugc() {
        let out = clean("<a href=\"/alice\">me</a>");
        assert!(out.contains("rel=\"nofollow ugc\""));
    }

    #[test]
    fn drops_autoplay_and_loop_audio_attrs() {
        let out = clean("<audio src=\"/a.mp3\" controls loop autoplay muted></audio>");
        assert!(out.contains("controls"));
        assert!(!out.contains("autoplay"));
        assert!(!out.contains("loop"));
        assert!(!out.contains("muted"));
    }

    #[test]
    fn drops_video_autoplay_keeps_loop() {
        let out = clean("<video src=\"/v.mp4\" controls loop autoplay></video>");
        assert!(out.contains("loop"));
        assert!(!out.contains("autoplay"));
    }

    #[test]
    fn srcset_with_external_candidate_is_dropped() {
        let (out, rej) = sanitize_html(
            "<img srcset=\"/a.png 1x, https://evil.com/b.png 2x\" alt=\"a\">",
            &policy(),
        );
        assert!(!out.contains("evil.com"));
        assert!(!out.contains("srcset"));
        assert!(!rej.is_empty());
    }

    #[test]
    fn comments_stripped() {
        let out = clean("<p>a</p><!-- secret -->");
        assert!(!out.contains("secret"));
    }

    #[test]
    fn idempotent() {
        let input = "<section><a href=\"/x\">l</a><img src=\"https://evil.com/y\"><script>z</script></section>";
        let once = clean(input);
        let twice = clean(&once);
        assert_eq!(once, twice);
    }
}

#[cfg(test)]
mod proptests {
    //! Property-based tests for `sanitize_html`.

    use super::*;
    use proptest::prelude::*;

    /// Policy shared by all properties in this module.
    fn policy() -> UrlPolicy {
        UrlPolicy::new(
            "https://u.makenot.work/a/p",
            ["makenot.work".to_string(), "u.makenot.work".to_string(), "cdn.makenot.work".to_string()],
        )
        .unwrap()
    }

    proptest! {
        // For any generated input, sanitizing completes and the output
        // contains none of the listed opening tags.
        #[test]
        fn never_panics_no_dangerous_tags(input in "\\PC{0,400}") {
            let (out, _rej) = sanitize_html(&input, &policy());
            let low = out.to_lowercase();
            for tag in ["<script", "<iframe", "<object", "<embed", "<form",
                        "<style", "<svg", "<math", "<link", "<meta", "<base"] {
                prop_assert!(!low.contains(tag), "leaked {tag}: {out}");
            }
        }

        // Sanitizing already-sanitized output leaves it unchanged.
        #[test]
        fn idempotent_fuzz(input in "\\PC{0,400}") {
            let once = sanitize_html(&input, &policy()).0;
            let twice = sanitize_html(&once, &policy()).0;
            prop_assert_eq!(once, twice);
        }

        // An image pointing at a generated external domain never keeps that
        // domain in the output.
        #[test]
        fn external_img_always_stripped(host in "[a-z]{3,10}", tld in "(com|net|io|xyz)") {
            let domain = format!("{host}.{tld}");
            let html = format!("<img src=\"https://{domain}/p.png\" alt=\"x\">");
            let out = sanitize_html(&html, &policy()).0;
            prop_assert!(!out.contains(&domain));
        }
    }
}