Skip to main content

max / makenotwork

16.0 KB · 539 lines History Blame Raw
1 use pulldown_cmark::{CowStr, Event, Options, Parser, Tag, TagEnd, html};
2
3 use crate::sanitize::SanitizePreset;
4
/// Returns true if the URL uses a scheme not in the safe allowlist.
///
/// Safe schemes are `http`, `https`, `mailto` and `ftp`, compared ASCII
/// case-insensitively after trimming surrounding whitespace. URLs without a
/// colon are relative and therefore safe.
///
/// The text before the first `:` is only treated as a scheme when it contains
/// none of `/`, `#` or `?`; otherwise the colon belongs to a path, fragment or
/// query of a relative URL. Because this is an allowlist, anything else in
/// scheme position (including an empty or malformed scheme) is reported as
/// dangerous.
fn has_dangerous_scheme(url: &str) -> bool {
    const SAFE_SCHEMES: [&str; 4] = ["http", "https", "mailto", "ftp"];

    match url.trim().split_once(':') {
        // No colon at all: nothing that could be a scheme.
        None => false,
        // The colon sits inside a path, fragment or query of a relative URL.
        Some((prefix, _)) if prefix.contains(|c: char| matches!(c, '/' | '#' | '?')) => false,
        // Compare in place; no lowercase copy of the scheme is allocated.
        Some((scheme, _)) => !SAFE_SCHEMES
            .iter()
            .any(|safe| scheme.eq_ignore_ascii_case(safe)),
    }
}
24
/// Rendered HTML bundled with simple length statistics, as produced by
/// `Renderer::render_with_meta`.
#[derive(Debug, Clone)]
pub struct RenderResult {
    /// Sanitized HTML output of the render.
    pub html: String,
    /// Word count computed from the markdown input, not from the rendered
    /// HTML.
    pub word_count: u32,
    /// Reading-time estimate in minutes, derived from `word_count`.
    pub reading_time_minutes: u32,
}
32
33 /// Configurable markdown renderer with builder pattern.
34 pub struct Renderer {
35 tables: bool,
36 strikethrough: bool,
37 footnotes: bool,
38 smart_punctuation: bool,
39 tasklists: bool,
40 strip_images: bool,
41 strip_raw_html: bool,
42 dangerous_scheme_filter: bool,
43 sanitize: SanitizePreset,
44 }
45
46 impl Renderer {
47 /// GFM features, default ammonia sanitization. Suitable for trusted content
48 /// like docs and blog posts.
49 pub fn permissive() -> Self {
50 Self {
51 tables: true,
52 strikethrough: true,
53 footnotes: true,
54 smart_punctuation: true,
55 tasklists: true,
56 strip_images: false,
57 strip_raw_html: false,
58 dangerous_scheme_filter: false,
59 sanitize: SanitizePreset::Permissive,
60 }
61 }
62
63 /// GFM features, no images. Suitable for app text fields (descriptions,
64 /// notes).
65 pub fn standard() -> Self {
66 Self {
67 tables: true,
68 strikethrough: true,
69 footnotes: false,
70 smart_punctuation: true,
71 tasklists: true,
72 strip_images: true,
73 strip_raw_html: false,
74 dangerous_scheme_filter: false,
75 sanitize: SanitizePreset::Standard,
76 }
77 }
78
79 /// No images, no raw HTML, dangerous scheme blocking, nofollow on links.
80 /// Suitable for user-generated content (forum posts).
81 pub fn strict() -> Self {
82 Self {
83 tables: false,
84 strikethrough: false,
85 footnotes: false,
86 smart_punctuation: false,
87 tasklists: false,
88 strip_images: true,
89 strip_raw_html: true,
90 dangerous_scheme_filter: true,
91 sanitize: SanitizePreset::Strict,
92 }
93 }
94
95 /// No markdown parsing, only ammonia sanitization. Suitable for HTML from
96 /// external sources (RSS feeds).
97 pub fn sanitize_only() -> Self {
98 Self {
99 tables: false,
100 strikethrough: false,
101 footnotes: false,
102 smart_punctuation: false,
103 tasklists: false,
104 strip_images: false,
105 strip_raw_html: false,
106 dangerous_scheme_filter: false,
107 sanitize: SanitizePreset::Permissive,
108 }
109 }
110
111 pub fn with_tables(mut self, enabled: bool) -> Self {
112 self.tables = enabled;
113 self
114 }
115
116 pub fn with_strikethrough(mut self, enabled: bool) -> Self {
117 self.strikethrough = enabled;
118 self
119 }
120
121 pub fn with_footnotes(mut self, enabled: bool) -> Self {
122 self.footnotes = enabled;
123 self
124 }
125
126 pub fn with_smart_punctuation(mut self, enabled: bool) -> Self {
127 self.smart_punctuation = enabled;
128 self
129 }
130
131 pub fn with_tasklists(mut self, enabled: bool) -> Self {
132 self.tasklists = enabled;
133 self
134 }
135
136 pub fn with_strip_images(mut self, enabled: bool) -> Self {
137 self.strip_images = enabled;
138 self
139 }
140
141 pub fn with_strip_raw_html(mut self, enabled: bool) -> Self {
142 self.strip_raw_html = enabled;
143 self
144 }
145
146 pub fn with_dangerous_scheme_filter(mut self, enabled: bool) -> Self {
147 self.dangerous_scheme_filter = enabled;
148 self
149 }
150
151 pub fn with_sanitize(mut self, preset: SanitizePreset) -> Self {
152 self.sanitize = preset;
153 self
154 }
155
156 fn build_options(&self) -> Options {
157 let mut opts = Options::empty();
158 if self.tables {
159 opts.insert(Options::ENABLE_TABLES);
160 }
161 if self.strikethrough {
162 opts.insert(Options::ENABLE_STRIKETHROUGH);
163 }
164 if self.footnotes {
165 opts.insert(Options::ENABLE_FOOTNOTES);
166 }
167 if self.smart_punctuation {
168 opts.insert(Options::ENABLE_SMART_PUNCTUATION);
169 }
170 if self.tasklists {
171 opts.insert(Options::ENABLE_TASKLISTS);
172 }
173 opts
174 }
175
176 /// Render markdown to sanitized HTML.
177 pub fn render(&self, input: &str) -> String {
178 if input.is_empty() {
179 return String::new();
180 }
181 let html_output = self.render_raw(input);
182 self.sanitize.clean(&html_output)
183 }
184
185 /// Render markdown to sanitized HTML with metadata.
186 pub fn render_with_meta(&self, input: &str) -> RenderResult {
187 let html = self.render(input);
188 let wc = crate::text::word_count(input);
189 RenderResult {
190 html,
191 word_count: wc,
192 reading_time_minutes: crate::text::reading_time_minutes(wc),
193 }
194 }
195
196 /// Sanitize pre-rendered HTML without markdown parsing.
197 pub fn sanitize_html(&self, html: &str) -> String {
198 self.sanitize.clean(html)
199 }
200
201 fn render_raw(&self, input: &str) -> String {
202 let options = self.build_options();
203 let parser = Parser::new_ext(input, options);
204
205 let strip_images = self.strip_images;
206 let strip_raw_html = self.strip_raw_html;
207 let scheme_filter = self.dangerous_scheme_filter;
208
209 let filtered = parser.filter_map(move |event| match event {
210 // Strip raw HTML events
211 Event::Html(_) | Event::InlineHtml(_) if strip_raw_html => None,
212 // Neutralize dangerous schemes on links
213 Event::Start(Tag::Link {
214 link_type,
215 dest_url,
216 title,
217 id,
218 }) if scheme_filter && has_dangerous_scheme(&dest_url) => {
219 Some(Event::Start(Tag::Link {
220 link_type,
221 dest_url: CowStr::Borrowed("#"),
222 title,
223 id,
224 }))
225 }
226 // Strip images entirely (alt text passes through as plain text)
227 Event::Start(Tag::Image { .. }) | Event::End(TagEnd::Image) if strip_images => None,
228 other => Some(other),
229 });
230
231 let mut output = String::new();
232 html::push_html(&mut output, filtered);
233 output
234 }
235 }
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Loose check that `rel_value` occurs somewhere in the rendered markup.
    fn result_has_rel(html: &str, rel_value: &str) -> bool {
        html.contains(rel_value)
    }

    // ===== has_dangerous_scheme =====

    #[test]
    fn safe_schemes() {
        for url in [
            "https://example.com",
            "http://example.com",
            "mailto:user@example.com",
            "ftp://files.example.com",
        ] {
            assert!(!has_dangerous_scheme(url));
        }
    }

    #[test]
    fn dangerous_schemes() {
        for url in [
            "javascript:alert(1)",
            "data:text/html,<script>",
            "vbscript:msgbox",
        ] {
            assert!(has_dangerous_scheme(url));
        }
    }

    #[test]
    fn case_insensitive_schemes() {
        for url in ["JaVaScRiPt:alert(1)", "DATA:text/html,x"] {
            assert!(has_dangerous_scheme(url));
        }
    }

    #[test]
    fn relative_urls_are_safe() {
        for url in ["/about", "#heading", "page.html", "path/to:file"] {
            assert!(!has_dangerous_scheme(url));
        }
    }

    #[test]
    fn query_string_before_colon_is_safe() {
        // A '?' ahead of the ':' means the prefix is not a scheme. Covers the
        // '?' alternative of the relative-URL check.
        assert!(!has_dangerous_scheme("page?q=foo:bar"));
    }

    #[test]
    fn fragment_before_colon_is_safe() {
        // Same for a '#' ahead of the ':'. Covers the '#' alternative.
        assert!(!has_dangerous_scheme("page#sec:1"));
    }

    // ===== Permissive preset =====

    #[test]
    fn permissive_basic_markdown() {
        let out = Renderer::permissive().render("# Hello\n\nThis is a **test**.");
        assert!(out.contains("<h1>Hello</h1>"));
        assert!(out.contains("<strong>test</strong>"));
    }

    #[test]
    fn permissive_tables() {
        let out = Renderer::permissive().render("| A | B |\n|---|---|\n| 1 | 2 |");
        assert!(out.contains("<table>"));
        assert!(out.contains("<td>"));
    }

    #[test]
    fn permissive_smart_punctuation() {
        let out = Renderer::permissive().render("It's a \"test\"");
        // NOTE(review): the straight-quote alternative may also match output
        // in which the quotes were not converted, so this assertion alone
        // probably does not pin smart punctuation -- confirm and tighten.
        let saw_quote = ['\u{201c}', '\u{201d}', '"']
            .iter()
            .any(|quote| out.contains(*quote));
        assert!(saw_quote);
    }

    #[test]
    fn permissive_strips_script() {
        let out = Renderer::permissive().render("Hello <script>alert('xss')</script> world");
        assert!(!out.contains("<script>"));
    }

    #[test]
    fn permissive_keeps_images() {
        let out = Renderer::permissive().render("![alt](https://example.com/img.png)");
        assert!(out.contains("<img"));
    }

    #[test]
    fn permissive_empty_input() {
        assert_eq!(Renderer::permissive().render(""), "");
    }

    // ===== Standard preset =====

    #[test]
    fn standard_strips_images() {
        let out = Renderer::standard().render("![alt text](https://example.com/img.png)");
        assert!(!out.contains("<img"));
        assert!(out.contains("alt text"));
    }

    #[test]
    fn standard_keeps_tables() {
        let out = Renderer::standard().render("| A |\n|---|\n| 1 |");
        assert!(out.contains("<table>"));
    }

    // ===== Strict preset =====

    #[test]
    fn with_strip_raw_html_toggle_is_observable() {
        // Same preset, flag flipped: the output must differ. This relies on
        // the permissive sanitizer letting `<u>` through, so that only the
        // raw-HTML event filter in render_raw can remove it.
        let input = "hello <u>raw</u> world";
        let kept = Renderer::permissive().render(input);
        let stripped = Renderer::permissive()
            .with_strip_raw_html(true)
            .render(input);
        assert!(kept.contains("<u>"), "<u> should survive permissive: {}", kept);
        assert!(!stripped.contains("<u>"), "<u> should be stripped: {}", stripped);
    }

    #[test]
    fn strict_strips_raw_html() {
        let out = Renderer::strict().render("<script>alert('xss')</script>");
        assert!(!out.contains("<script>"));
        assert!(!out.contains("</script>"));
    }

    #[test]
    fn strict_strips_inline_html() {
        let out = Renderer::strict().render("hello <b>bold</b> world");
        assert!(!out.contains("<b>"));
        assert!(out.contains("hello"));
        assert!(out.contains("world"));
    }

    #[test]
    fn strict_strips_images() {
        let out = Renderer::strict().render("![alt text](https://example.com/img.png)");
        assert!(!out.contains("<img"));
        assert!(out.contains("alt text"));
    }

    #[test]
    fn strict_neutralizes_javascript_urls() {
        let out = Renderer::strict().render("[click me](javascript:alert(1))");
        assert!(out.contains("click me"));
        assert!(!out.contains("javascript:"));
        assert!(out.contains(r##"href="#""##));
    }

    #[test]
    fn strict_neutralizes_case_insensitive() {
        let out = Renderer::strict().render("[xss](JaVaScRiPt:alert(1))");
        assert!(!out.contains("javascript:"));
        assert!(!out.contains("JaVaScRiPt:"));
    }

    #[test]
    fn strict_neutralizes_data_urls() {
        let out = Renderer::strict().render("[xss](data:text/html,<script>alert(1)</script>)");
        assert!(!out.contains("data:text"));
    }

    #[test]
    fn strict_neutralizes_vbscript() {
        let out = Renderer::strict().render("[xss](vbscript:msgbox)");
        assert!(!out.contains("vbscript:"));
    }

    #[test]
    fn strict_preserves_safe_urls() {
        let renderer = Renderer::strict();
        let cases = [
            (
                "[link](https://example.com)",
                r#"href="https://example.com""#,
            ),
            (
                "[mail](mailto:user@example.com)",
                r#"href="mailto:user@example.com""#,
            ),
        ];
        for (markdown, expected_attr) in cases {
            assert!(renderer.render(markdown).contains(expected_attr));
        }
    }

    #[test]
    fn strict_preserves_relative_urls() {
        let renderer = Renderer::strict();
        let cases = [
            ("[page](/about)", r#"href="/about""#),
            ("[section](#heading)", r##"href="#heading""##),
        ];
        for (markdown, expected_attr) in cases {
            assert!(renderer.render(markdown).contains(expected_attr));
        }
    }

    #[test]
    fn strict_links_have_nofollow() {
        let out = Renderer::strict().render("[example](https://example.com)");
        assert!(result_has_rel(&out, "nofollow"));
        assert!(result_has_rel(&out, "noopener"));
    }

    #[test]
    fn strict_bold_italic() {
        let out = Renderer::strict().render("**bold** and *italic*");
        assert!(out.contains("<strong>bold</strong>"));
        assert!(out.contains("<em>italic</em>"));
    }

    #[test]
    fn strict_inline_code() {
        let out = Renderer::strict().render("use `foo()` here");
        assert!(out.contains("<code>foo()</code>"));
    }

    #[test]
    fn strict_code_block() {
        let out = Renderer::strict().render("```\nlet x = 1;\n```");
        assert!(out.contains("<pre><code>"));
        assert!(out.contains("let x = 1;"));
    }

    #[test]
    fn strict_blockquote() {
        let out = Renderer::strict().render("> quoted text");
        assert!(out.contains("<blockquote>"));
        assert!(out.contains("quoted text"));
    }

    #[test]
    fn strict_unordered_list() {
        let out = Renderer::strict().render("- item one\n- item two");
        assert!(out.contains("<ul>"));
        assert!(out.contains("<li>item one</li>"));
    }

    #[test]
    fn strict_heading() {
        let out = Renderer::strict().render("## Section Title");
        assert!(out.contains("<h2>Section Title</h2>"));
    }

    #[test]
    fn strict_plain_text() {
        let out = Renderer::strict().render("hello world");
        assert_eq!(out, "<p>hello world</p>\n");
    }

    #[test]
    fn strict_empty_input() {
        assert_eq!(Renderer::strict().render(""), "");
    }

    // ===== Sanitize-only preset =====

    #[test]
    fn sanitize_only_cleans_html() {
        let out = Renderer::sanitize_only().sanitize_html("<p>Hello</p><script>bad</script>");
        assert!(out.contains("<p>Hello</p>"));
        assert!(!out.contains("<script>"));
    }

    // ===== Builder methods =====

    #[test]
    fn builder_override() {
        let out = Renderer::strict()
            .with_strip_images(false)
            .render("![alt](https://example.com/img.png)");
        assert!(out.contains("<img"));
    }

    // ===== render_with_meta =====

    #[test]
    fn render_with_meta_includes_counts() {
        let meta = Renderer::permissive().render_with_meta("Hello world. This is a test.");
        assert!(meta.html.contains("Hello world"));
        assert_eq!(meta.word_count, 6);
        assert_eq!(meta.reading_time_minutes, 1);
    }
}
539