//! HTML sanitization for custom pages, built on [`ammonia`].
//!
//! The policy is an explicit allowlist (see `plans/custom-pages.md`): structural
//! and text elements plus media, no scripting, no embeds, no forms, no inline
//! `style` attribute (all CSS goes in the dedicated CSS field, for a single
//! sanitization path and better caching). Every URL-bearing attribute is routed
//! through [`resolve_internal_url`], so a custom page can reference only MNW.
//!
//! Anything outside the allowlist is dropped by ammonia. Dropped *URLs* are
//! additionally recorded as [`Rejection`]s for the editor's blocked-references
//! panel -- the primary teaching surface for the closed-system rule.
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::sync::{Arc, Mutex};
use super::url_filter::{UrlPolicy, resolve_internal_url, resolve_srcset};
use super::Rejection;
/// Element names that survive sanitization.
///
/// This is the explicit allowlist handed to the ammonia builder: structural
/// and text elements, tables, and media. Any element not named here is
/// removed from the output.
const ALLOWED_TAGS: &[&str] = &[
    "a", "abbr", "article", "aside", "b", "blockquote", "br", "caption", "cite", "code", "col",
    "colgroup", "dd", "details", "div", "dl", "dt", "em", "figcaption", "figure", "footer", "h1",
    "h2", "h3", "h4", "h5", "h6", "header", "hr", "i", "img", "kbd", "li", "main", "mark", "nav",
    "ol", "p", "picture", "pre", "q", "s", "samp", "section", "small", "source", "span", "strong",
    "sub", "summary", "sup", "table", "tbody", "td", "tfoot", "th", "thead", "time", "tr", "u",
    "ul", "video", "audio", "track",
];
/// Attributes allowed on any element.
///
/// None of these carry URLs, so they bypass URL validation. `style` is
/// deliberately absent: the module policy puts all CSS in the dedicated CSS
/// field rather than in inline attributes.
const GENERIC_ATTRS: &[&str] = &["class", "id", "title", "lang", "dir"];
/// Elements that are removed together with everything inside them.
///
/// A disallowed element is normally unwrapped, leaving its children behind.
/// For the tags below that would be wrong: script/style bodies, metadata,
/// embeds and form controls must never resurface as visible text, so their
/// whole subtree is discarded instead.
const CLEAN_CONTENT_TAGS: &[&str] = &[
    "script", "style", "iframe", "object", "embed", "noscript", "template", "svg", "math", "frame",
    "frameset", "head", "title", "base", "meta", "link", "applet", "param", "canvas", "form",
    "input", "button", "select", "textarea",
];
/// Per-tag attribute allowlist. URL-bearing attributes here are still validated
/// by the attribute filter; listing them only makes them *eligible*.
///
/// Returns a fresh map from element name to the set of attribute names that
/// element may carry in addition to the generic attributes. Elements missing
/// from the map have no element-specific attributes.
fn tag_attributes() -> HashMap<&'static str, HashSet<&'static str>> {
    /// Collects a literal attribute list into the set type stored in the map.
    fn set(attrs: &[&'static str]) -> HashSet<&'static str> {
        attrs.iter().copied().collect()
    }

    HashMap::from([
        ("a", set(&["href"])),
        ("img", set(&["src", "alt", "width", "height", "loading", "srcset"])),
        ("source", set(&["src", "srcset", "type", "media", "width", "height"])),
        // autoplay dropped; looping silent video allowed (decision #4).
        (
            "video",
            set(&["src", "controls", "loop", "muted", "poster", "preload", "width", "height"]),
        ),
        // loop/muted/autoplay dropped -- no surprise / looping audio (decision #4).
        ("audio", set(&["src", "controls", "preload"])),
        ("track", set(&["src", "kind", "srclang", "label", "default"])),
        ("time", set(&["datetime"])),
        ("th", set(&["colspan", "rowspan", "scope"])),
        ("td", set(&["colspan", "rowspan", "scope"])),
        ("col", set(&["span"])),
        ("colgroup", set(&["span"])),
        ("details", set(&["open"])),
        ("ol", set(&["start", "reversed"])),
    ])
}
/// How the value of a URL-bearing attribute is parsed before validation.
///
/// The derives make the kind cheap to pass by value and let tests and
/// diagnostics compare and print it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum UrlAttr {
    /// The value is one URL (`href`, `src`, `poster`).
    Single,
    /// The value is a `srcset` candidate list.
    SrcSet,
}
fn url_attribute(element: &str, attribute: &str) -> Option {
match (element, attribute) {
("a", "href") => Some(UrlAttr::Single),
("img" | "source" | "video" | "audio" | "track", "src") => Some(UrlAttr::Single),
("video", "poster") => Some(UrlAttr::Single),
("img" | "source", "srcset") => Some(UrlAttr::SrcSet),
_ => None,
}
}
/// Sanitize user HTML. Returns the cleaned markup plus every URL the sanitizer
/// stripped (for the blocked-references panel). The output references only MNW
/// and contains no scripting, embeds, forms, or inline styles.
pub fn sanitize_html(input: &str, policy: &UrlPolicy) -> (String, Vec) {
let rejections: Arc>> = Arc::new(Mutex::new(Vec::new()));
let tags: HashSet<&str> = ALLOWED_TAGS.iter().copied().collect();
let generic: HashSet<&str> = GENERIC_ATTRS.iter().copied().collect();
let clean_content: HashSet<&str> = CLEAN_CONTENT_TAGS.iter().copied().collect();
let per_tag = tag_attributes();
let filter_policy = policy.clone();
let filter_sink = Arc::clone(&rejections);
let mut builder = ammonia::Builder::default();
builder
.tags(tags)
.generic_attributes(generic)
.tag_attributes(per_tag)
.clean_content_tags(clean_content)
// Let candidate schemes through ammonia's built-in check so our
// attribute filter is the single authority -- it rejects everything
// that does not resolve to on-platform https, and recording happens
// there (ammonia's own scheme drop is silent). Safe because the filter
// covers every URL-bearing attribute on every allowed tag.
.url_schemes(HashSet::from([
"https", "http", "data", "javascript", "mailto", "ftp", "blob", "file", "vbscript",
]))
.url_relative(ammonia::UrlRelative::PassThrough)
// User-authored anchors never influence ranking and are flagged as
// user-generated content (decision: per-page link rel).
.link_rel(Some("nofollow ugc"))
.strip_comments(true)
.attribute_filter(move |element, attribute, value| {
match url_attribute(element, attribute) {
None => Some(Cow::Borrowed(value)),
Some(kind) => {
let location = format!("{element} {attribute}");
let result = match kind {
UrlAttr::Single => resolve_internal_url(value, &filter_policy, &location),
UrlAttr::SrcSet => resolve_srcset(value, &filter_policy, &location),
};
match result {
Ok(v) => Some(Cow::Owned(v)),
Err(rej) => {
filter_sink.lock().expect("rejection sink poisoned").push(rej);
None
}
}
}
}
});
let cleaned = builder.clean(input).to_string();
let collected = std::mem::take(&mut *rejections.lock().expect("rejection sink poisoned"));
(cleaned, collected)
}
#[cfg(test)]
mod tests {
    // NOTE(review): no test currently in this module names `RejectionKind`;
    // the import is retained for rejection-kind assertions -- confirm whether
    // such tests belong here before removing it.
    #[allow(unused_imports)]
    use super::super::RejectionKind;
    use super::*;

    /// Policy shared by the tests: a project page on `u.makenot.work` plus the
    /// three `makenot.work` hosts.
    fn policy() -> UrlPolicy {
        UrlPolicy::new(
            "https://u.makenot.work/alice/proj",
            [
                "makenot.work".to_string(),
                "u.makenot.work".to_string(),
                "cdn.makenot.work".to_string(),
            ],
        )
        .unwrap()
    }

    /// Sanitizes `html` under [`policy`] and returns only the cleaned markup.
    fn clean(html: &str) -> String {
        sanitize_html(html, &policy()).0
    }

    #[test]
    fn keeps_allowed_structure() {
        let out = clean("<p>Hello <strong>world</strong></p>");
        assert!(out.contains("<p>"));
        assert!(out.contains("<strong>"));
        assert!(out.contains("world"));
    }

    #[test]
    fn strips_script_and_its_content() {
        let out = clean("<p>ok</p><script>alert(1)</script>");
        assert!(out.contains("ok"));
        assert!(!out.contains("alert"));
        assert!(!out.contains("<script"));
    }

    #[test]
    fn sanitization_is_idempotent() {
        // Cleaning already-clean output must be a no-op.
        let input = "<div><p>hi <em>there</em></p><script>bad()</script></div>";
        let once = clean(input);
        let twice = clean(&once);
        assert_eq!(once, twice);
    }
}
#[cfg(test)]
mod proptests {
use super::*;
use proptest::prelude::*;
fn policy() -> UrlPolicy {
UrlPolicy::new(
"https://u.makenot.work/a/p",
["makenot.work".to_string(), "u.makenot.work".to_string(), "cdn.makenot.work".to_string()],
)
.unwrap()
}
proptest! {
// Arbitrary input must never panic, and no dangerous *element* may
// survive. (We only assert on tags, not substrings: a real `