//! HTML sanitization for custom pages, built on [`ammonia`]. //! //! The policy is an explicit allowlist (see `plans/custom-pages.md`): structural //! and text elements plus media, no scripting, no embeds, no forms, no inline //! `style` attribute (all CSS goes in the dedicated CSS field, for a single //! sanitization path and better caching). Every URL-bearing attribute is routed //! through [`resolve_internal_url`], so a custom page can reference only MNW. //! //! Anything outside the allowlist is dropped by ammonia. Dropped *URLs* are //! additionally recorded as [`Rejection`]s for the editor's blocked-references //! panel -- the primary teaching surface for the closed-system rule. use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::sync::{Arc, Mutex}; use super::url_filter::{UrlPolicy, resolve_internal_url, resolve_srcset}; use super::Rejection; /// Allowed element names. const ALLOWED_TAGS: &[&str] = &[ "a", "abbr", "article", "aside", "b", "blockquote", "br", "caption", "cite", "code", "col", "colgroup", "dd", "details", "div", "dl", "dt", "em", "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "i", "img", "kbd", "li", "main", "mark", "nav", "ol", "p", "picture", "pre", "q", "s", "samp", "section", "small", "source", "span", "strong", "sub", "summary", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "time", "tr", "u", "ul", "video", "audio", "track", ]; /// Attributes allowed on any element. const GENERIC_ATTRS: &[&str] = &["class", "id", "title", "lang", "dir"]; /// Tags whose entire content is discarded (not unwrapped) when the tag is /// stripped -- script/style bodies and metadata must never resurface as text. 
const CLEAN_CONTENT_TAGS: &[&str] = &[
    "script", "style", "iframe", "object", "embed", "noscript", "template", "svg", "math",
    "frame", "frameset", "head", "title", "base", "meta", "link", "applet", "param",
    "canvas", "form", "input", "button", "select", "textarea",
];

/// Per-tag attribute allowlist. URL-bearing attributes here are still validated
/// by the attribute filter; listing them only makes them *eligible*.
fn tag_attributes() -> HashMap<&'static str, HashSet<&'static str>> {
    // Small helper so each entry reads as `("tag", set(&[...]))`.
    let set = |attrs: &[&'static str]| attrs.iter().copied().collect::<HashSet<_>>();
    HashMap::from([
        ("a", set(&["href"])),
        ("img", set(&["src", "alt", "width", "height", "loading", "srcset"])),
        ("source", set(&["src", "srcset", "type", "media", "width", "height"])),
        // autoplay dropped; looping silent video allowed (decision #4).
        (
            "video",
            set(&["src", "controls", "loop", "muted", "poster", "preload", "width", "height"]),
        ),
        // loop/muted/autoplay dropped -- no surprise / looping audio (decision #4).
        ("audio", set(&["src", "controls", "preload"])),
        ("track", set(&["src", "kind", "srclang", "label", "default"])),
        ("time", set(&["datetime"])),
        ("th", set(&["colspan", "rowspan", "scope"])),
        ("td", set(&["colspan", "rowspan", "scope"])),
        ("col", set(&["span"])),
        ("colgroup", set(&["span"])),
        ("details", set(&["open"])),
        ("ol", set(&["start", "reversed"])),
    ])
}

/// Which attributes carry URLs, and how to parse them.
enum UrlAttr {
    /// The attribute value is a single URL.
    Single,
    /// The attribute value is a `srcset` candidate list.
    SrcSet,
}

/// Classify an `(element, attribute)` pair as URL-bearing.
///
/// Returns `None` for every pair that does not carry a URL; those attributes
/// need no URL validation.
fn url_attribute(element: &str, attribute: &str) -> Option<UrlAttr> {
    match (element, attribute) {
        ("a", "href") => Some(UrlAttr::Single),
        ("img" | "source" | "video" | "audio" | "track", "src") => Some(UrlAttr::Single),
        ("video", "poster") => Some(UrlAttr::Single),
        ("img" | "source", "srcset") => Some(UrlAttr::SrcSet),
        _ => None,
    }
}

/// Sanitize user HTML. Returns the cleaned markup plus every URL the sanitizer
/// stripped (for the blocked-references panel). The output references only MNW
/// and contains no scripting, embeds, forms, or inline styles.
pub fn sanitize_html(input: &str, policy: &UrlPolicy) -> (String, Vec<Rejection>) {
    // Shared sink for rejected URLs: the attribute filter closure is moved into
    // the builder, so it gets its own handle to the same vector.
    let rejections: Arc<Mutex<Vec<Rejection>>> = Arc::new(Mutex::new(Vec::new()));

    let tags: HashSet<&str> = ALLOWED_TAGS.iter().copied().collect();
    let generic: HashSet<&str> = GENERIC_ATTRS.iter().copied().collect();
    let clean_content: HashSet<&str> = CLEAN_CONTENT_TAGS.iter().copied().collect();
    let per_tag = tag_attributes();

    // Owned copies captured by the `move` closure below.
    let filter_policy = policy.clone();
    let filter_sink = Arc::clone(&rejections);

    let mut builder = ammonia::Builder::default();
    builder
        .tags(tags)
        .generic_attributes(generic)
        .tag_attributes(per_tag)
        .clean_content_tags(clean_content)
        // Let candidate schemes through ammonia's built-in check so our
        // attribute filter is the single authority -- it rejects everything
        // that does not resolve to on-platform https, and recording happens
        // there (ammonia's own scheme drop is silent). Safe because the filter
        // covers every URL-bearing attribute on every allowed tag.
        .url_schemes(HashSet::from([
            "https",
            "http",
            "data",
            "javascript",
            "mailto",
            "ftp",
            "blob",
            "file",
            "vbscript",
        ]))
        .url_relative(ammonia::UrlRelative::PassThrough)
        // User-authored anchors never influence ranking and are flagged as
        // user-generated content (decision: per-page link rel).
        .link_rel(Some("nofollow ugc"))
        .strip_comments(true)
        .attribute_filter(move |element, attribute, value| {
            match url_attribute(element, attribute) {
                // Not a URL-bearing attribute: keep the value untouched.
                None => Some(Cow::Borrowed(value)),
                Some(kind) => {
                    // Human-readable origin of the URL, e.g. "img src".
                    let location = format!("{element} {attribute}");
                    let result = match kind {
                        UrlAttr::Single => resolve_internal_url(value, &filter_policy, &location),
                        UrlAttr::SrcSet => resolve_srcset(value, &filter_policy, &location),
                    };
                    match result {
                        Ok(v) => Some(Cow::Owned(v)),
                        Err(rej) => {
                            // Record the rejection, then drop the attribute
                            // entirely by returning `None`. Panics only if the
                            // sink mutex is poisoned.
                            filter_sink.lock().expect("rejection sink poisoned").push(rej);
                            None
                        }
                    }
                }
            }
        });

    let cleaned = builder.clean(input).to_string();
    // Move the collected rejections out of the shared sink without cloning.
    let collected = std::mem::take(&mut *rejections.lock().expect("rejection sink poisoned"));
    (cleaned, collected)
}

// NOTE(review): the HTML inside the string literals of the tests below appears
// to have been stripped from this copy of the file -- restore them from
// version control before relying on these tests.
#[cfg(test)]
mod tests {
    use super::super::RejectionKind;
    use super::*;

    fn policy() -> UrlPolicy {
        UrlPolicy::new(
            "https://u.makenot.work/alice/proj",
            ["makenot.work".to_string(), "u.makenot.work".to_string(), "cdn.makenot.work".to_string()],
        )
        .unwrap()
    }

    fn clean(html: &str) -> String {
        sanitize_html(html, &policy()).0
    }

    #[test]
    fn keeps_allowed_structure() {
        let out = clean("

Hi

Hello world

"); assert!(out.contains("
")); assert!(out.contains("

")); assert!(out.contains("world")); } #[test] fn strips_script_and_its_content() { let out = clean("

ok

"); assert!(out.contains("ok")); assert!(!out.contains("alert")); assert!(!out.contains("body{display:none}

hi

"); assert!(!out.to_lowercase().contains("display")); assert!(out.contains("hi")); } #[test] fn strips_inline_style_attribute() { let out = clean("

x

"); assert!(!out.contains("style")); assert!(out.contains("

x

")); } #[test] fn strips_event_handlers() { let out = clean("
x
"); assert!(!out.to_lowercase().contains("onclick")); assert!(!out.contains("steal")); } #[test] fn strips_iframe_object_embed_form() { for tag in ["iframe", "object", "embed", "form"] { let out = clean(&format!("<{tag}>x

keep

")); assert!(!out.contains(&format!("<{tag}")), "{tag} must be stripped"); assert!(out.contains("keep")); } } #[test] fn rejects_external_image_src_and_records_it() { let (out, rej) = sanitize_html("\"a\"", &policy()); assert!(!out.contains("evil.com")); assert_eq!(rej.len(), 1); assert!(matches!(rej[0].kind, RejectionKind::ExternalUrl)); assert_eq!(rej[0].location, "img src"); } #[test] fn keeps_internal_and_relative_media() { let out = clean("\"a\"\"b\""); assert!(out.contains("/static/p.png")); assert!(out.contains("cdn.makenot.work/b")); } #[test] fn drops_javascript_href_and_records() { let (out, rej) = sanitize_html("x", &policy()); assert!(!out.to_lowercase().contains("javascript")); assert!(rej.iter().any(|r| matches!(r.kind, RejectionKind::DisallowedScheme))); } #[test] fn anchors_get_nofollow_ugc() { let out = clean("me"); assert!(out.contains("rel=\"nofollow ugc\"")); } #[test] fn drops_autoplay_and_loop_audio_attrs() { let out = clean(""); assert!(out.contains("controls")); assert!(!out.contains("autoplay")); assert!(!out.contains("loop")); assert!(!out.contains("muted")); } #[test] fn drops_video_autoplay_keeps_loop() { let out = clean(""); assert!(out.contains("loop")); assert!(!out.contains("autoplay")); } #[test] fn srcset_with_external_candidate_is_dropped() { let (out, rej) = sanitize_html( "\"a\"", &policy(), ); assert!(!out.contains("evil.com")); assert!(!out.contains("srcset")); assert!(!rej.is_empty()); } #[test] fn comments_stripped() { let out = clean("

a

"); assert!(!out.contains("secret")); } #[test] fn idempotent() { let input = "
l
"; let once = clean(input); let twice = clean(&once); assert_eq!(once, twice); } } #[cfg(test)] mod proptests { use super::*; use proptest::prelude::*; fn policy() -> UrlPolicy { UrlPolicy::new( "https://u.makenot.work/a/p", ["makenot.work".to_string(), "u.makenot.work".to_string(), "cdn.makenot.work".to_string()], ) .unwrap() } proptest! { // Arbitrary input must never panic, and no dangerous *element* may // survive. (We only assert on tags, not substrings: a real `"); let out = sanitize_html(&html, &policy()).0; prop_assert!(!out.contains(&domain)); } } }