// max / docengine — 14.8 KB · 512 lines (repository viewer chrome: "Skip to main content", "History", "Blame", "Raw")
1 use pulldown_cmark::{CowStr, Event, Options, Parser, Tag, TagEnd, html};
2
3 use crate::sanitize::SanitizePreset;
4
/// Reports whether `url` carries an explicit scheme outside the allowlist.
///
/// Allowed schemes are `http`, `https`, `mailto` and `ftp`, compared ASCII
/// case-insensitively after trimming surrounding whitespace. A URL with no
/// colon, or whose first colon comes after a `/`, `#` or `?`, is treated as
/// relative and therefore safe.
fn has_dangerous_scheme(url: &str) -> bool {
    const SAFE_SCHEMES: [&str; 4] = ["http", "https", "mailto", "ftp"];

    match url.trim().split_once(':') {
        // No colon at all: there is no scheme to object to.
        None => false,
        // The colon sits inside a path, fragment or query rather than after a scheme.
        Some((prefix, _)) if prefix.contains(|c: char| matches!(c, '/' | '#' | '?')) => false,
        // Anything else is a scheme candidate; only allowlisted ones are safe.
        Some((prefix, _)) => !SAFE_SCHEMES
            .iter()
            .any(|allowed| prefix.eq_ignore_ascii_case(allowed)),
    }
}
24
/// Result of rendering markdown with metadata.
///
/// Produced by `Renderer::render_with_meta`. Derives `PartialEq`/`Eq` so
/// results can be compared directly, e.g. with `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RenderResult {
    /// Rendered and sanitized HTML.
    pub html: String,
    /// Word count as reported by `crate::text::word_count` for the input.
    pub word_count: u32,
    /// Reading time as reported by `crate::text::reading_time_minutes` for
    /// `word_count`.
    pub reading_time_minutes: u32,
}
32
33 /// Configurable markdown renderer with builder pattern.
34 pub struct Renderer {
35 tables: bool,
36 strikethrough: bool,
37 footnotes: bool,
38 smart_punctuation: bool,
39 tasklists: bool,
40 strip_images: bool,
41 strip_raw_html: bool,
42 dangerous_scheme_filter: bool,
43 sanitize: SanitizePreset,
44 }
45
46 impl Renderer {
47 /// GFM features, default ammonia sanitization. Suitable for trusted content
48 /// like docs and blog posts.
49 pub fn permissive() -> Self {
50 Self {
51 tables: true,
52 strikethrough: true,
53 footnotes: true,
54 smart_punctuation: true,
55 tasklists: true,
56 strip_images: false,
57 strip_raw_html: false,
58 dangerous_scheme_filter: false,
59 sanitize: SanitizePreset::Permissive,
60 }
61 }
62
63 /// GFM features, no images. Suitable for app text fields (descriptions,
64 /// notes).
65 pub fn standard() -> Self {
66 Self {
67 tables: true,
68 strikethrough: true,
69 footnotes: false,
70 smart_punctuation: true,
71 tasklists: true,
72 strip_images: true,
73 strip_raw_html: false,
74 dangerous_scheme_filter: false,
75 sanitize: SanitizePreset::Standard,
76 }
77 }
78
79 /// No images, no raw HTML, dangerous scheme blocking, nofollow on links.
80 /// Suitable for user-generated content (forum posts).
81 pub fn strict() -> Self {
82 Self {
83 tables: false,
84 strikethrough: false,
85 footnotes: false,
86 smart_punctuation: false,
87 tasklists: false,
88 strip_images: true,
89 strip_raw_html: true,
90 dangerous_scheme_filter: true,
91 sanitize: SanitizePreset::Strict,
92 }
93 }
94
95 /// No markdown parsing, just ammonia sanitization. Suitable for HTML from
96 /// external sources (RSS feeds).
97 pub fn sanitize_only() -> Self {
98 Self {
99 tables: false,
100 strikethrough: false,
101 footnotes: false,
102 smart_punctuation: false,
103 tasklists: false,
104 strip_images: false,
105 strip_raw_html: false,
106 dangerous_scheme_filter: false,
107 sanitize: SanitizePreset::Permissive,
108 }
109 }
110
111 pub fn with_tables(mut self, enabled: bool) -> Self {
112 self.tables = enabled;
113 self
114 }
115
116 pub fn with_strikethrough(mut self, enabled: bool) -> Self {
117 self.strikethrough = enabled;
118 self
119 }
120
121 pub fn with_footnotes(mut self, enabled: bool) -> Self {
122 self.footnotes = enabled;
123 self
124 }
125
126 pub fn with_smart_punctuation(mut self, enabled: bool) -> Self {
127 self.smart_punctuation = enabled;
128 self
129 }
130
131 pub fn with_tasklists(mut self, enabled: bool) -> Self {
132 self.tasklists = enabled;
133 self
134 }
135
136 pub fn with_strip_images(mut self, enabled: bool) -> Self {
137 self.strip_images = enabled;
138 self
139 }
140
141 pub fn with_strip_raw_html(mut self, enabled: bool) -> Self {
142 self.strip_raw_html = enabled;
143 self
144 }
145
146 pub fn with_dangerous_scheme_filter(mut self, enabled: bool) -> Self {
147 self.dangerous_scheme_filter = enabled;
148 self
149 }
150
151 pub fn with_sanitize(mut self, preset: SanitizePreset) -> Self {
152 self.sanitize = preset;
153 self
154 }
155
156 fn build_options(&self) -> Options {
157 let mut opts = Options::empty();
158 if self.tables {
159 opts.insert(Options::ENABLE_TABLES);
160 }
161 if self.strikethrough {
162 opts.insert(Options::ENABLE_STRIKETHROUGH);
163 }
164 if self.footnotes {
165 opts.insert(Options::ENABLE_FOOTNOTES);
166 }
167 if self.smart_punctuation {
168 opts.insert(Options::ENABLE_SMART_PUNCTUATION);
169 }
170 if self.tasklists {
171 opts.insert(Options::ENABLE_TASKLISTS);
172 }
173 opts
174 }
175
176 /// Render markdown to sanitized HTML.
177 pub fn render(&self, input: &str) -> String {
178 if input.is_empty() {
179 return String::new();
180 }
181 let html_output = self.render_raw(input);
182 self.sanitize.clean(&html_output)
183 }
184
185 /// Render markdown to sanitized HTML with metadata.
186 pub fn render_with_meta(&self, input: &str) -> RenderResult {
187 let html = self.render(input);
188 let wc = crate::text::word_count(input);
189 RenderResult {
190 html,
191 word_count: wc,
192 reading_time_minutes: crate::text::reading_time_minutes(wc),
193 }
194 }
195
196 /// Sanitize pre-rendered HTML without markdown parsing.
197 pub fn sanitize_html(&self, html: &str) -> String {
198 self.sanitize.clean(html)
199 }
200
201 fn render_raw(&self, input: &str) -> String {
202 let options = self.build_options();
203 let parser = Parser::new_ext(input, options);
204
205 let strip_images = self.strip_images;
206 let strip_raw_html = self.strip_raw_html;
207 let scheme_filter = self.dangerous_scheme_filter;
208
209 let filtered = parser.filter_map(move |event| match event {
210 // Strip raw HTML events
211 Event::Html(_) | Event::InlineHtml(_) if strip_raw_html => None,
212 // Neutralize dangerous schemes on links
213 Event::Start(Tag::Link {
214 link_type,
215 dest_url,
216 title,
217 id,
218 }) if scheme_filter && has_dangerous_scheme(&dest_url) => {
219 Some(Event::Start(Tag::Link {
220 link_type,
221 dest_url: CowStr::Borrowed("#"),
222 title,
223 id,
224 }))
225 }
226 // Strip images entirely (alt text passes through as plain text)
227 Event::Start(Tag::Image { .. }) | Event::End(TagEnd::Image) if strip_images => None,
228 other => Some(other),
229 });
230
231 let mut output = String::new();
232 html::push_html(&mut output, filtered);
233 output
234 }
235 }
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns true when some `rel="..."` attribute in `html` lists
    /// `rel_value` as one of its whitespace-separated tokens.
    ///
    /// Looking inside the attribute (rather than anywhere in the document)
    /// keeps the check from being satisfied by link text or URLs that merely
    /// contain the word.
    fn result_has_rel(html: &str, rel_value: &str) -> bool {
        html.split("rel=\"")
            .skip(1)
            .filter_map(|after_open| after_open.split('"').next())
            .any(|attr| attr.split_ascii_whitespace().any(|token| token == rel_value))
    }

    // ===== has_dangerous_scheme =====

    #[test]
    fn safe_schemes() {
        assert!(!has_dangerous_scheme("https://example.com"));
        assert!(!has_dangerous_scheme("http://example.com"));
        assert!(!has_dangerous_scheme("mailto:user@example.com"));
        assert!(!has_dangerous_scheme("ftp://files.example.com"));
    }

    #[test]
    fn dangerous_schemes() {
        assert!(has_dangerous_scheme("javascript:alert(1)"));
        assert!(has_dangerous_scheme("data:text/html,<script>"));
        assert!(has_dangerous_scheme("vbscript:msgbox"));
    }

    #[test]
    fn case_insensitive_schemes() {
        assert!(has_dangerous_scheme("JaVaScRiPt:alert(1)"));
        assert!(has_dangerous_scheme("DATA:text/html,x"));
    }

    #[test]
    fn surrounding_whitespace_is_ignored() {
        assert!(has_dangerous_scheme("  javascript:alert(1)  "));
        assert!(!has_dangerous_scheme("  https://example.com  "));
    }

    #[test]
    fn relative_urls_are_safe() {
        assert!(!has_dangerous_scheme("/about"));
        assert!(!has_dangerous_scheme("#heading"));
        assert!(!has_dangerous_scheme("page.html"));
        assert!(!has_dangerous_scheme("path/to:file"));
    }

    // ===== Permissive preset =====

    #[test]
    fn permissive_basic_markdown() {
        let r = Renderer::permissive();
        let html = r.render("# Hello\n\nThis is a **test**.");
        assert!(html.contains("<h1>Hello</h1>"));
        assert!(html.contains("<strong>test</strong>"));
    }

    #[test]
    fn permissive_tables() {
        let r = Renderer::permissive();
        let html = r.render("| A | B |\n|---|---|\n| 1 | 2 |");
        assert!(html.contains("<table>"));
        assert!(html.contains("<td>"));
    }

    #[test]
    fn permissive_smart_punctuation() {
        let r = Renderer::permissive();
        let html = r.render("It's a \"test\"");
        // Both straight double quotes must have been replaced by curly ones;
        // accepting a leftover straight quote would let the test pass even
        // with smart punctuation disabled.
        assert!(html.contains('\u{201c}'), "missing opening curly quote: {html}");
        assert!(html.contains('\u{201d}'), "missing closing curly quote: {html}");
    }

    #[test]
    fn permissive_strips_script() {
        let r = Renderer::permissive();
        let html = r.render("Hello <script>alert('xss')</script> world");
        assert!(!html.contains("<script>"));
    }

    #[test]
    fn permissive_keeps_images() {
        let r = Renderer::permissive();
        let html = r.render("![alt](https://example.com/img.png)");
        assert!(html.contains("<img"));
    }

    #[test]
    fn permissive_empty_input() {
        assert_eq!(Renderer::permissive().render(""), "");
    }

    // ===== Standard preset =====

    #[test]
    fn standard_strips_images() {
        let r = Renderer::standard();
        let html = r.render("![alt text](https://example.com/img.png)");
        assert!(!html.contains("<img"));
        assert!(html.contains("alt text"));
    }

    #[test]
    fn standard_keeps_tables() {
        let r = Renderer::standard();
        let html = r.render("| A |\n|---|\n| 1 |");
        assert!(html.contains("<table>"));
    }

    // ===== Strict preset =====

    #[test]
    fn strict_strips_raw_html() {
        let r = Renderer::strict();
        let html = r.render("<script>alert('xss')</script>");
        assert!(!html.contains("<script>"));
        assert!(!html.contains("</script>"));
    }

    #[test]
    fn strict_strips_inline_html() {
        let r = Renderer::strict();
        let html = r.render("hello <b>bold</b> world");
        assert!(!html.contains("<b>"));
        assert!(html.contains("hello"));
        assert!(html.contains("world"));
    }

    #[test]
    fn strict_strips_images() {
        let r = Renderer::strict();
        let html = r.render("![alt text](https://example.com/img.png)");
        assert!(!html.contains("<img"));
        assert!(html.contains("alt text"));
    }

    #[test]
    fn strict_neutralizes_javascript_urls() {
        let r = Renderer::strict();
        let html = r.render("[click me](javascript:alert(1))");
        assert!(html.contains("click me"));
        assert!(!html.contains("javascript:"));
        assert!(html.contains(r##"href="#""##));
    }

    #[test]
    fn strict_neutralizes_case_insensitive() {
        let r = Renderer::strict();
        let html = r.render("[xss](JaVaScRiPt:alert(1))");
        assert!(!html.contains("javascript:"));
        assert!(!html.contains("JaVaScRiPt:"));
    }

    #[test]
    fn strict_neutralizes_data_urls() {
        let r = Renderer::strict();
        let html = r.render("[xss](data:text/html,<script>alert(1)</script>)");
        assert!(!html.contains("data:text"));
    }

    #[test]
    fn strict_neutralizes_vbscript() {
        let r = Renderer::strict();
        let html = r.render("[xss](vbscript:msgbox)");
        assert!(!html.contains("vbscript:"));
    }

    #[test]
    fn strict_preserves_safe_urls() {
        let r = Renderer::strict();
        let html = r.render("[link](https://example.com)");
        assert!(html.contains(r#"href="https://example.com""#));

        let html = r.render("[mail](mailto:user@example.com)");
        assert!(html.contains(r#"href="mailto:user@example.com""#));
    }

    #[test]
    fn strict_preserves_relative_urls() {
        let r = Renderer::strict();
        let html = r.render("[page](/about)");
        assert!(html.contains(r#"href="/about""#));

        let html = r.render("[section](#heading)");
        assert!(html.contains(r##"href="#heading""##));
    }

    #[test]
    fn strict_links_have_nofollow() {
        let r = Renderer::strict();
        let html = r.render("[example](https://example.com)");
        assert!(result_has_rel(&html, "nofollow"));
        assert!(result_has_rel(&html, "noopener"));
    }

    #[test]
    fn strict_bold_italic() {
        let r = Renderer::strict();
        let html = r.render("**bold** and *italic*");
        assert!(html.contains("<strong>bold</strong>"));
        assert!(html.contains("<em>italic</em>"));
    }

    #[test]
    fn strict_inline_code() {
        let r = Renderer::strict();
        let html = r.render("use `foo()` here");
        assert!(html.contains("<code>foo()</code>"));
    }

    #[test]
    fn strict_code_block() {
        let r = Renderer::strict();
        let html = r.render("```\nlet x = 1;\n```");
        assert!(html.contains("<pre><code>"));
        assert!(html.contains("let x = 1;"));
    }

    #[test]
    fn strict_blockquote() {
        let r = Renderer::strict();
        let html = r.render("> quoted text");
        assert!(html.contains("<blockquote>"));
        assert!(html.contains("quoted text"));
    }

    #[test]
    fn strict_unordered_list() {
        let r = Renderer::strict();
        let html = r.render("- item one\n- item two");
        assert!(html.contains("<ul>"));
        assert!(html.contains("<li>item one</li>"));
    }

    #[test]
    fn strict_heading() {
        let r = Renderer::strict();
        let html = r.render("## Section Title");
        assert!(html.contains("<h2>Section Title</h2>"));
    }

    #[test]
    fn strict_plain_text() {
        let r = Renderer::strict();
        assert_eq!(r.render("hello world"), "<p>hello world</p>\n");
    }

    #[test]
    fn strict_empty_input() {
        assert_eq!(Renderer::strict().render(""), "");
    }

    // ===== Sanitize-only preset =====

    #[test]
    fn sanitize_only_cleans_html() {
        let r = Renderer::sanitize_only();
        let html = r.sanitize_html("<p>Hello</p><script>bad</script>");
        assert!(html.contains("<p>Hello</p>"));
        assert!(!html.contains("<script>"));
    }

    // ===== Builder methods =====

    #[test]
    fn builder_override() {
        let r = Renderer::strict().with_strip_images(false);
        let html = r.render("![alt](https://example.com/img.png)");
        assert!(html.contains("<img"));
    }

    // ===== render_with_meta =====

    #[test]
    fn render_with_meta_includes_counts() {
        let r = Renderer::permissive();
        let result = r.render_with_meta("Hello world. This is a test.");
        assert!(result.html.contains("Hello world"));
        assert_eq!(result.word_count, 6);
        assert_eq!(result.reading_time_minutes, 1);
    }
}
512