Skip to main content

max / makenotwork

12.3 KB · 319 lines History Blame Raw
1 //! HTML sanitization for custom pages, built on [`ammonia`].
2 //!
3 //! The policy is an explicit allowlist (see `plans/custom-pages.md`): structural
4 //! and text elements plus media, no scripting, no embeds, no forms, no inline
5 //! `style` attribute (all CSS goes in the dedicated CSS field, for a single
6 //! sanitization path and better caching). Every URL-bearing attribute is routed
7 //! through [`resolve_internal_url`], so a custom page can reference only MNW.
8 //!
9 //! Anything outside the allowlist is dropped by ammonia. Dropped *URLs* are
10 //! additionally recorded as [`Rejection`]s for the editor's blocked-references
11 //! panel -- the primary teaching surface for the closed-system rule.
12
13 use std::borrow::Cow;
14 use std::collections::{HashMap, HashSet};
15 use std::sync::{Arc, Mutex};
16
17 use super::url_filter::{UrlPolicy, resolve_internal_url, resolve_srcset};
18 use super::Rejection;
19
/// Element names that survive sanitization. This list is handed to the
/// sanitizer as its tag allowlist; structural, text and media elements only.
const ALLOWED_TAGS: &[&str] = &[
    "a", "abbr", "article", "aside", "b", "blockquote", "br", "caption",
    "cite", "code", "col", "colgroup", "dd", "details", "div", "dl",
    "dt", "em", "figcaption", "figure", "footer", "h1", "h2", "h3",
    "h4", "h5", "h6", "header", "hr", "i", "img", "kbd",
    "li", "main", "mark", "nav", "ol", "p", "picture", "pre",
    "q", "s", "samp", "section", "small", "source", "span", "strong",
    "sub", "summary", "sup", "table", "tbody", "td", "tfoot", "th",
    "thead", "time", "tr", "u", "ul", "video", "audio", "track",
];
30
/// Attribute names accepted on every allowed element, independent of tag.
const GENERIC_ATTRS: &[&str] = &[
    "class",
    "id",
    "title",
    "lang",
    "dir",
];
33
/// Tags that are removed together with everything inside them, rather than
/// being unwrapped -- the bodies of script/style elements and document
/// metadata must never reappear as visible text.
const CLEAN_CONTENT_TAGS: &[&str] = &[
    "script", "style", "iframe", "object", "embed", "noscript",
    "template", "svg", "math", "frame", "frameset", "head",
    "title", "base", "meta", "link", "applet", "param",
    "canvas", "form", "input", "button", "select", "textarea",
];
42
/// Build the per-tag attribute allowlist: tag name -> attributes accepted on
/// that tag in addition to the generic ones.
///
/// Appearing here only makes a URL-bearing attribute *eligible*; its value is
/// still checked by the attribute filter.
fn tag_attributes() -> HashMap<&'static str, HashSet<&'static str>> {
    const PER_TAG: &[(&str, &[&str])] = &[
        ("a", &["href"]),
        ("img", &["src", "alt", "width", "height", "loading", "srcset"]),
        ("source", &["src", "srcset", "type", "media", "width", "height"]),
        // autoplay dropped; looping silent video allowed (decision #4).
        ("video", &["src", "controls", "loop", "muted", "poster", "preload", "width", "height"]),
        // loop/muted/autoplay dropped -- no surprise / looping audio (decision #4).
        ("audio", &["src", "controls", "preload"]),
        ("track", &["src", "kind", "srclang", "label", "default"]),
        ("time", &["datetime"]),
        ("th", &["colspan", "rowspan", "scope"]),
        ("td", &["colspan", "rowspan", "scope"]),
        ("col", &["span"]),
        ("colgroup", &["span"]),
        ("details", &["open"]),
        ("ol", &["start", "reversed"]),
    ];

    PER_TAG
        .iter()
        .map(|&(tag, attrs)| (tag, attrs.iter().copied().collect::<HashSet<_>>()))
        .collect()
}
65
/// How the value of a URL-bearing attribute has to be parsed before it can be
/// validated.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum UrlAttr {
    /// The attribute value is one URL (`href`, `src`, `poster`).
    Single,
    /// The attribute value is a `srcset` candidate list.
    SrcSet,
}

/// Classify an `(element, attribute)` pair.
///
/// Returns `Some(kind)` when the attribute carries URLs on that element and
/// `None` for every other pair, including URL-like attribute names on
/// elements where they are not listed (e.g. `srcset` on `video`).
fn url_attribute(element: &str, attribute: &str) -> Option<UrlAttr> {
    match (element, attribute) {
        ("a", "href") => Some(UrlAttr::Single),
        ("img" | "source" | "video" | "audio" | "track", "src") => Some(UrlAttr::Single),
        ("video", "poster") => Some(UrlAttr::Single),
        ("img" | "source", "srcset") => Some(UrlAttr::SrcSet),
        _ => None,
    }
}
81
82 /// Sanitize user HTML. Returns the cleaned markup plus every URL the sanitizer
83 /// stripped (for the blocked-references panel). The output references only MNW
84 /// and contains no scripting, embeds, forms, or inline styles.
85 pub fn sanitize_html(input: &str, policy: &UrlPolicy) -> (String, Vec<Rejection>) {
86 let rejections: Arc<Mutex<Vec<Rejection>>> = Arc::new(Mutex::new(Vec::new()));
87
88 let tags: HashSet<&str> = ALLOWED_TAGS.iter().copied().collect();
89 let generic: HashSet<&str> = GENERIC_ATTRS.iter().copied().collect();
90 let clean_content: HashSet<&str> = CLEAN_CONTENT_TAGS.iter().copied().collect();
91 let per_tag = tag_attributes();
92
93 let filter_policy = policy.clone();
94 let filter_sink = Arc::clone(&rejections);
95
96 let mut builder = ammonia::Builder::default();
97 builder
98 .tags(tags)
99 .generic_attributes(generic)
100 .tag_attributes(per_tag)
101 .clean_content_tags(clean_content)
102 // Let candidate schemes through ammonia's built-in check so our
103 // attribute filter is the single authority -- it rejects everything
104 // that does not resolve to on-platform https, and recording happens
105 // there (ammonia's own scheme drop is silent). Safe because the filter
106 // covers every URL-bearing attribute on every allowed tag.
107 .url_schemes(HashSet::from([
108 "https", "http", "data", "javascript", "mailto", "ftp", "blob", "file", "vbscript",
109 ]))
110 .url_relative(ammonia::UrlRelative::PassThrough)
111 // User-authored anchors never influence ranking and are flagged as
112 // user-generated content (decision: per-page link rel).
113 .link_rel(Some("nofollow ugc"))
114 .strip_comments(true)
115 .attribute_filter(move |element, attribute, value| {
116 match url_attribute(element, attribute) {
117 None => Some(Cow::Borrowed(value)),
118 Some(kind) => {
119 let location = format!("{element} {attribute}");
120 let result = match kind {
121 UrlAttr::Single => resolve_internal_url(value, &filter_policy, &location),
122 UrlAttr::SrcSet => resolve_srcset(value, &filter_policy, &location),
123 };
124 match result {
125 Ok(v) => Some(Cow::Owned(v)),
126 Err(rej) => {
127 filter_sink.lock().expect("rejection sink poisoned").push(rej);
128 None
129 }
130 }
131 }
132 }
133 });
134
135 let cleaned = builder.clean(input).to_string();
136 let collected = std::mem::take(&mut *rejections.lock().expect("rejection sink poisoned"));
137 (cleaned, collected)
138 }
139
#[cfg(test)]
mod tests {
    use super::super::RejectionKind;
    use super::*;

    /// Fixture policy shared by every test in this module.
    fn policy() -> UrlPolicy {
        let hosts = ["makenot.work", "u.makenot.work", "cdn.makenot.work"].map(String::from);
        UrlPolicy::new("https://u.makenot.work/alice/proj", hosts).unwrap()
    }

    /// Sanitize `html` under the fixture policy and keep only the markup.
    fn clean(html: &str) -> String {
        let (markup, _rejections) = sanitize_html(html, &policy());
        markup
    }

    // Allowed elements and the generic `class` attribute pass through.
    #[test]
    fn keeps_allowed_structure() {
        let markup =
            clean(r#"<section><h1 class="t">Hi</h1><p>Hello <strong>world</strong></p></section>"#);
        for fragment in ["<section>", r#"<h1 class="t">"#, "<strong>world</strong>"] {
            assert!(markup.contains(fragment));
        }
    }

    // A script element disappears together with its body.
    #[test]
    fn strips_script_and_its_content() {
        let markup = clean("<p>ok</p><script>alert(1)</script>");
        assert!(markup.contains("ok"));
        for banned in ["alert", "<script"] {
            assert!(!markup.contains(banned));
        }
    }

    // A style element disappears together with its CSS.
    #[test]
    fn strips_style_tag_and_content() {
        let markup = clean("<style>body{display:none}</style><p>hi</p>");
        assert!(!markup.to_lowercase().contains("display"));
        assert!(markup.contains("hi"));
    }

    // The inline `style` attribute is removed; the element itself stays.
    #[test]
    fn strips_inline_style_attribute() {
        let markup = clean(r#"<p style="color:red">x</p>"#);
        assert!(!markup.contains("style"));
        assert!(markup.contains("<p>x</p>"));
    }

    // Event-handler attributes and their code are removed.
    #[test]
    fn strips_event_handlers() {
        let markup = clean(r#"<div onclick="steal()">x</div>"#);
        assert!(!markup.to_lowercase().contains("onclick"));
        assert!(!markup.contains("steal"));
    }

    // Embedding and form elements are removed; sibling content survives.
    #[test]
    fn strips_iframe_object_embed_form() {
        for tag in ["iframe", "object", "embed", "form"] {
            let opener = format!("<{tag}");
            let markup = clean(&format!("{opener}>x</{tag}><p>keep</p>"));
            assert!(!markup.contains(opener.as_str()), "{tag} must be stripped");
            assert!(markup.contains("keep"));
        }
    }

    // An off-platform image URL is dropped and reported exactly once.
    #[test]
    fn rejects_external_image_src_and_records_it() {
        let (markup, rejections) =
            sanitize_html(r#"<img src="https://evil.com/x.png" alt="a">"#, &policy());
        assert!(!markup.contains("evil.com"));
        assert_eq!(rejections.len(), 1);

        let only = &rejections[0];
        assert!(matches!(only.kind, RejectionKind::ExternalUrl));
        assert_eq!(only.location, "img src");
    }

    // Relative URLs and URLs on an allowed host are kept.
    #[test]
    fn keeps_internal_and_relative_media() {
        let markup = clean(
            r#"<img src="/static/p.png" alt="a"><img src="https://cdn.makenot.work/b" alt="b">"#,
        );
        assert!(markup.contains("/static/p.png"));
        assert!(markup.contains("cdn.makenot.work/b"));
    }

    // A `javascript:` link is dropped and reported as a disallowed scheme.
    #[test]
    fn drops_javascript_href_and_records() {
        let (markup, rejections) =
            sanitize_html(r#"<a href="javascript:alert(1)">x</a>"#, &policy());
        assert!(!markup.to_lowercase().contains("javascript"));

        let reported = rejections
            .iter()
            .any(|r| matches!(r.kind, RejectionKind::DisallowedScheme));
        assert!(reported);
    }

    // Every surviving anchor carries the user-generated-content rel value.
    #[test]
    fn anchors_get_nofollow_ugc() {
        let markup = clean(r#"<a href="/alice">me</a>"#);
        assert!(markup.contains(r#"rel="nofollow ugc""#));
    }

    // Audio keeps `controls` but loses autoplay, loop and muted.
    #[test]
    fn drops_autoplay_and_loop_audio_attrs() {
        let markup = clean(r#"<audio src="/a.mp3" controls loop autoplay muted></audio>"#);
        assert!(markup.contains("controls"));
        for dropped in ["autoplay", "loop", "muted"] {
            assert!(!markup.contains(dropped));
        }
    }

    // Video loses autoplay but may loop.
    #[test]
    fn drops_video_autoplay_keeps_loop() {
        let markup = clean(r#"<video src="/v.mp4" controls loop autoplay></video>"#);
        assert!(markup.contains("loop"));
        assert!(!markup.contains("autoplay"));
    }

    // One external candidate removes the whole `srcset` attribute.
    #[test]
    fn srcset_with_external_candidate_is_dropped() {
        let source = r#"<img srcset="/a.png 1x, https://evil.com/b.png 2x" alt="a">"#;
        let (markup, rejections) = sanitize_html(source, &policy());
        assert!(!markup.contains("evil.com"));
        assert!(!markup.contains("srcset"));
        assert!(!rejections.is_empty());
    }

    // HTML comments do not survive.
    #[test]
    fn comments_stripped() {
        let markup = clean("<p>a</p><!-- secret -->");
        assert!(!markup.contains("secret"));
    }

    // Sanitizing already-sanitized output changes nothing.
    #[test]
    fn idempotent() {
        let input = r#"<section><a href="/x">l</a><img src="https://evil.com/y"><script>z</script></section>"#;
        let first_pass = clean(input);
        let second_pass = clean(&first_pass);
        assert_eq!(first_pass, second_pass);
    }
}
273
#[cfg(test)]
mod proptests {
    use super::*;
    use proptest::prelude::*;

    /// Lower-case opening-tag prefixes that must never show up in output.
    const FORBIDDEN_OPENERS: [&str; 11] = [
        "<script", "<iframe", "<object", "<embed", "<form", "<style", "<svg", "<math", "<link",
        "<meta", "<base",
    ];

    /// Fixture policy shared by every property in this module.
    fn policy() -> UrlPolicy {
        let hosts = ["makenot.work", "u.makenot.work", "cdn.makenot.work"].map(String::from);
        UrlPolicy::new("https://u.makenot.work/a/p", hosts).unwrap()
    }

    /// Sanitize `html` under the fixture policy and keep only the markup.
    fn sanitized(html: &str) -> String {
        let (markup, _rejections) = sanitize_html(html, &policy());
        markup
    }

    proptest! {
        // Arbitrary input must never panic, and no dangerous *element* may
        // survive. Only tag openers are checked, not arbitrary substrings: a
        // real `<script` can only appear as an element, because angle
        // brackets in text are escaped.
        #[test]
        fn never_panics_no_dangerous_tags(input in "\\PC{0,400}") {
            let out = sanitized(&input);
            let low = out.to_lowercase();
            for tag in FORBIDDEN_OPENERS {
                prop_assert!(!low.contains(tag), "leaked {tag}: {out}");
            }
        }

        // A second sanitization pass leaves the first pass's output unchanged.
        #[test]
        fn idempotent_fuzz(input in "\\PC{0,400}") {
            let first_pass = sanitized(&input);
            let second_pass = sanitized(&first_pass);
            prop_assert_eq!(first_pass, second_pass);
        }

        // An image pointing at a randomly generated external host never keeps
        // that host in the output.
        #[test]
        fn external_img_always_stripped(host in "[a-z]{3,10}", tld in "(com|net|io|xyz)") {
            let domain = [host.as_str(), tld.as_str()].join(".");
            let out = sanitized(&format!("<img src=\"https://{domain}/p.png\" alt=\"x\">"));
            prop_assert!(!out.contains(&domain));
        }
    }
}
319