Skip to main content

max / pter

21.4 KB · 746 lines History Blame Raw
1 use scraper::node::Node;
2 use scraper::{ElementRef, Html};
3
4 use crate::elements::{self, BlockKind, ElementAction, InlineKind};
5 use crate::replies;
6 use crate::tables;
7 use crate::whitespace;
8
9 /// Convert an HTML email body into readable markdown.
10 ///
11 /// This is the main entry point for pter. Pass in an HTML string
12 /// (just the body, not MIME structure) and get back clean markdown.
13 ///
14 /// ```
15 /// let md = pter::convert("<p>Hello <strong>world</strong></p>");
16 /// assert_eq!(md, "Hello **world**");
17 /// ```
18 pub fn convert(html: &str) -> String {
19 if html.is_empty() {
20 return String::new();
21 }
22
23 let document = Html::parse_document(html);
24 let mut ctx = Context::new();
25 walk_children(document.root_element(), &mut ctx);
26 whitespace::normalize(&ctx.output)
27 }
28
/// Conversion state threaded through the tree walk.
struct Context {
    /// Markdown accumulated so far.
    output: String,
    /// Current list nesting depth (for indentation).
    list_depth: u32,
    /// Whether we're inside a <pre> block (preserve whitespace).
    in_pre: bool,
    /// Whether we're inside an <a> tag (don't nest links).
    in_link: bool,
    /// Stack of list types for proper ordered/unordered rendering.
    list_stack: Vec<ListType>,
}

/// Kind of list an item belongs to; decides the marker that is emitted.
#[derive(Clone, Copy)]
enum ListType {
    /// Bulleted list, items are prefixed with `- `.
    Unordered,
    /// Numbered list; the payload is the number of the most recent item.
    Ordered(u32),
}

impl Context {
    /// Indentation added for each list nesting level beyond the first.
    ///
    /// Two spaces is the minimum that makes a nested item a child of a
    /// `- ` bullet in CommonMark; a single space would render it as a sibling.
    const LIST_INDENT_UNIT: &'static str = "  ";

    /// Create an empty context with a pre-sized output buffer.
    fn new() -> Self {
        Self {
            output: String::with_capacity(4096),
            list_depth: 0,
            in_pre: false,
            in_link: false,
            list_stack: Vec::new(),
        }
    }

    /// Append a string slice to the output verbatim.
    fn push(&mut self, s: &str) {
        self.output.push_str(s);
    }

    /// Append a single character to the output verbatim.
    fn push_char(&mut self, c: char) {
        self.output.push(c);
    }

    /// Make sure the output ends with a blank line (`"\n\n"`).
    ///
    /// Trailing spaces are ignored when inspecting the tail. Nothing is
    /// written when the output is empty (or only spaces) or already ends
    /// with a blank line; otherwise trailing spaces are dropped and
    /// `"\n\n"` is appended.
    fn ensure_blank_line(&mut self) {
        let trimmed = self.output.trim_end_matches(' ');
        if trimmed.is_empty() || trimmed.ends_with("\n\n") {
            return;
        }
        let keep = trimmed.len();
        self.output.truncate(keep);
        self.output.push_str("\n\n");
    }

    /// Make sure non-empty output ends with a newline.
    fn ensure_newline(&mut self) {
        if !self.output.is_empty() && !self.output.ends_with('\n') {
            self.output.push('\n');
        }
    }

    /// Leading whitespace for a list item at the current nesting depth.
    ///
    /// Depths 0 and 1 are not indented; each deeper level adds one
    /// `LIST_INDENT_UNIT` (two spaces).
    fn list_indent(&self) -> String {
        let levels = self.list_depth.saturating_sub(1) as usize;
        Self::LIST_INDENT_UNIT.repeat(levels)
    }
}
92
93 /// Walk all children of a node, converting each to markdown.
94 fn walk_children(parent: ElementRef, ctx: &mut Context) {
95 for child in parent.children() {
96 match child.value() {
97 Node::Text(text) => {
98 handle_text(&text.text, ctx);
99 }
100 Node::Element(_) => {
101 if let Some(el_ref) = ElementRef::wrap(child) {
102 handle_element(el_ref, ctx);
103 }
104 }
105 _ => {}
106 }
107 }
108 }
109
110 /// Handle a text node.
111 fn handle_text(text: &str, ctx: &mut Context) {
112 if ctx.in_pre {
113 ctx.push(text);
114 return;
115 }
116
117 // Collapse whitespace in normal flow
118 let mut last_was_space = ctx.output.ends_with(' ') || ctx.output.ends_with('\n');
119 for ch in text.chars() {
120 if ch.is_ascii_whitespace() {
121 if !last_was_space {
122 ctx.push_char(' ');
123 last_was_space = true;
124 }
125 } else {
126 ctx.push_char(ch);
127 last_was_space = false;
128 }
129 }
130 }
131
132 /// Handle an element node — classify it and render accordingly.
133 fn handle_element(el: ElementRef, ctx: &mut Context) {
134 let element = el.value();
135
136 // Check hidden elements
137 if elements::is_hidden(element) {
138 return;
139 }
140
141 // Check for reply boundaries before normal classification.
142 // Reply boundaries (gmail_quote, type=cite, etc.) get rendered
143 // as blockquotes regardless of their actual element type.
144 if replies::is_reply_boundary(el) {
145 render_reply_block(el, ctx);
146 return;
147 }
148
149 // Check for Outlook-style "From: ... Sent: ..." separator blocks.
150 // These introduce quoted content that follows them.
151 if replies::is_outlook_separator(el) {
152 ctx.ensure_blank_line();
153 // Render the separator header as attribution
154 let text: String = el.text().collect();
155 let trimmed = text.split_whitespace().collect::<Vec<_>>().join(" ");
156 ctx.push(&trimmed);
157 ctx.ensure_blank_line();
158 return;
159 }
160
161 match elements::classify(element) {
162 ElementAction::Skip => {}
163 ElementAction::Transparent => walk_children(el, ctx),
164 ElementAction::Block(kind) => handle_block(el, ctx, kind),
165 ElementAction::Inline(kind) => handle_inline(el, ctx, kind),
166 }
167 }
168
169 fn handle_block(el: ElementRef, ctx: &mut Context, kind: BlockKind) {
170 match kind {
171 BlockKind::Paragraph => {
172 ctx.ensure_blank_line();
173 walk_children(el, ctx);
174 ctx.ensure_blank_line();
175 }
176
177 BlockKind::Heading(level) => {
178 ctx.ensure_blank_line();
179 let prefix = "#".repeat(level as usize);
180 ctx.push(&prefix);
181 ctx.push_char(' ');
182 walk_children(el, ctx);
183 ctx.ensure_blank_line();
184 }
185
186 BlockKind::Blockquote => {
187 ctx.ensure_blank_line();
188 // Render children into a temporary buffer, then prefix each line with >
189 let mut inner_ctx = Context::new();
190 inner_ctx.in_pre = ctx.in_pre;
191 inner_ctx.in_link = ctx.in_link;
192 walk_children(el, &mut inner_ctx);
193 let inner = whitespace::normalize(&inner_ctx.output);
194 for line in inner.lines() {
195 ctx.push("> ");
196 ctx.push(line);
197 ctx.push_char('\n');
198 }
199 ctx.push_char('\n');
200 }
201
202 BlockKind::UnorderedList => {
203 ctx.ensure_blank_line();
204 ctx.list_depth += 1;
205 ctx.list_stack.push(ListType::Unordered);
206 walk_children(el, ctx);
207 ctx.list_stack.pop();
208 ctx.list_depth -= 1;
209 ctx.ensure_blank_line();
210 }
211
212 BlockKind::OrderedList => {
213 ctx.ensure_blank_line();
214 ctx.list_depth += 1;
215 ctx.list_stack.push(ListType::Ordered(0));
216 walk_children(el, ctx);
217 ctx.list_stack.pop();
218 ctx.list_depth -= 1;
219 ctx.ensure_blank_line();
220 }
221
222 BlockKind::ListItem => {
223 ctx.ensure_newline();
224 let indent = ctx.list_indent();
225 ctx.push(&indent);
226
227 // Determine bullet or number
228 let marker = match ctx.list_stack.last_mut() {
229 Some(ListType::Unordered) => "- ".to_string(),
230 Some(ListType::Ordered(n)) => {
231 *n += 1;
232 format!("{}. ", *n)
233 }
234 None => "- ".to_string(),
235 };
236 ctx.push(&marker);
237 walk_children(el, ctx);
238 ctx.ensure_newline();
239 }
240
241 BlockKind::PreFormatted => {
242 ctx.ensure_blank_line();
243 ctx.push("```\n");
244 ctx.in_pre = true;
245 walk_children(el, ctx);
246 ctx.in_pre = false;
247 ctx.ensure_newline();
248 ctx.push("```");
249 ctx.ensure_blank_line();
250 }
251
252 BlockKind::HorizontalRule => {
253 ctx.ensure_blank_line();
254 ctx.push("---");
255 ctx.ensure_blank_line();
256 }
257
258 BlockKind::Table => {
259 ctx.ensure_blank_line();
260 if tables::is_data_table(el) {
261 let (headers, rows) = tables::extract_table_data(el);
262 let md = tables::render_markdown_table(&headers, &rows);
263 if !md.is_empty() {
264 ctx.push(&md);
265 }
266 } else {
267 // Layout table — unwrap and render cell contents directly
268 render_layout_table(el, ctx);
269 }
270 ctx.ensure_blank_line();
271 }
272
273 BlockKind::Div => {
274 // Divs act as block separators but don't add their own markup
275 ctx.ensure_blank_line();
276 walk_children(el, ctx);
277 ctx.ensure_blank_line();
278 }
279 }
280 }
281
282 fn handle_inline(el: ElementRef, ctx: &mut Context, kind: InlineKind) {
283 match kind {
284 InlineKind::Bold => {
285 ctx.push("**");
286 walk_children(el, ctx);
287 ctx.push("**");
288 }
289
290 InlineKind::Italic => {
291 ctx.push("*");
292 walk_children(el, ctx);
293 ctx.push("*");
294 }
295
296 InlineKind::Strikethrough => {
297 ctx.push("~~");
298 walk_children(el, ctx);
299 ctx.push("~~");
300 }
301
302 InlineKind::Code => {
303 if ctx.in_pre {
304 // Inside a <pre>, don't double-wrap
305 walk_children(el, ctx);
306 } else {
307 ctx.push("`");
308 walk_children(el, ctx);
309 ctx.push("`");
310 }
311 }
312
313 InlineKind::Link => {
314 if ctx.in_link {
315 // Don't nest links
316 walk_children(el, ctx);
317 return;
318 }
319
320 let href = el.value().attr("href").unwrap_or("");
321
322 if href.is_empty() || href == "#" {
323 walk_children(el, ctx);
324 return;
325 }
326
327 // Collect the link text
328 let mut text_ctx = Context::new();
329 text_ctx.in_link = true;
330 walk_children(el, &mut text_ctx);
331 let text = text_ctx.output.trim().to_string();
332
333 if text.is_empty() {
334 // Link with no text — just show the URL
335 ctx.push(href);
336 } else if text == href {
337 // Link text matches URL — no need for markdown link syntax
338 ctx.push(href);
339 } else {
340 ctx.push("[");
341 ctx.push(&text);
342 ctx.push("](");
343 ctx.push(href);
344 ctx.push(")");
345 }
346 }
347
348 InlineKind::Image => {
349 let element = el.value();
350 if elements::is_tracking_pixel(element) {
351 return;
352 }
353
354 let alt = element.attr("alt").unwrap_or("");
355 let src = element.attr("src").unwrap_or("");
356
357 if src.is_empty() {
358 return;
359 }
360
361 ctx.push("![");
362 ctx.push(alt);
363 ctx.push("](");
364 ctx.push(src);
365 ctx.push(")");
366 }
367
368 InlineKind::LineBreak => {
369 ctx.push_char('\n');
370 }
371
372 InlineKind::Superscript => {
373 ctx.push("^");
374 walk_children(el, ctx);
375 }
376
377 InlineKind::Subscript => {
378 ctx.push("~");
379 walk_children(el, ctx);
380 }
381 }
382 }
383
384 /// Render a reply boundary as a quoted block.
385 ///
386 /// This is the same rendering logic as `<blockquote>` — children are
387 /// rendered into a temporary buffer and each line gets `> ` prefixed.
388 /// Attribution lines (e.g. "On ... wrote:") are rendered above the quote.
389 fn render_reply_block(el: ElementRef, ctx: &mut Context) {
390 ctx.ensure_blank_line();
391
392 // Look for attribution text
393 if let Some(attribution) = replies::find_attribution(el) {
394 ctx.push(&attribution);
395 ctx.push_char('\n');
396 }
397
398 // Render children into temp buffer, then prefix with >
399 let mut inner_ctx = Context::new();
400 inner_ctx.in_pre = ctx.in_pre;
401 inner_ctx.in_link = ctx.in_link;
402 walk_children(el, &mut inner_ctx);
403 let inner = whitespace::normalize(&inner_ctx.output);
404
405 if !inner.is_empty() {
406 for line in inner.lines() {
407 ctx.push("> ");
408 ctx.push(line);
409 ctx.push_char('\n');
410 }
411 ctx.push_char('\n');
412 }
413 }
414
415 /// Unwrap a layout table by rendering cell contents sequentially.
416 ///
417 /// Walks through rows and cells, rendering each cell's content as if
418 /// the table wrapper didn't exist. This handles the common email pattern
419 /// of wrapping everything in `<table><tr><td>...</td></tr></table>`.
420 fn render_layout_table(table: ElementRef, ctx: &mut Context) {
421 for descendant in table.descendants() {
422 if let Some(el_ref) = ElementRef::wrap(descendant) {
423 let name = el_ref.value().name();
424 if name == "td" || name == "th" {
425 // Check if the cell itself is hidden
426 if !elements::is_hidden(el_ref.value()) {
427 walk_children(el_ref, ctx);
428 ctx.ensure_blank_line();
429 }
430 }
431 }
432 }
433 }
434
#[cfg(test)]
mod tests {
    use super::*;

    // -- Basic elements --

    #[test]
    fn empty_input() {
        assert_eq!(convert(""), "");
    }

    #[test]
    fn plain_text() {
        assert_eq!(convert("hello world"), "hello world");
    }

    #[test]
    fn paragraph() {
        assert_eq!(convert("<p>one</p><p>two</p>"), "one\n\ntwo");
    }

    #[test]
    fn headings() {
        assert_eq!(convert("<h1>Title</h1>"), "# Title");
        assert_eq!(convert("<h3>Sub</h3>"), "### Sub");
    }

    #[test]
    fn bold_and_italic() {
        assert_eq!(
            convert("<p><strong>bold</strong> and <em>italic</em></p>"),
            "**bold** and *italic*"
        );
    }

    #[test]
    fn link() {
        assert_eq!(
            convert(r#"<a href="https://example.com">click</a>"#),
            "[click](https://example.com)"
        );
    }

    #[test]
    fn link_text_matches_url() {
        assert_eq!(
            convert(r#"<a href="https://example.com">https://example.com</a>"#),
            "https://example.com"
        );
    }

    #[test]
    fn link_empty_href() {
        assert_eq!(convert(r#"<a href="">click</a>"#), "click");
    }

    #[test]
    fn image() {
        assert_eq!(
            convert(r#"<img src="photo.jpg" alt="A photo">"#),
            "![A photo](photo.jpg)"
        );
    }

    #[test]
    fn tracking_pixel_skipped() {
        assert_eq!(convert(r#"<img src="track.gif" width="1" height="1">"#), "");
    }

    #[test]
    fn unordered_list() {
        assert_eq!(
            convert("<ul><li>one</li><li>two</li></ul>"),
            "- one\n- two"
        );
    }

    #[test]
    fn ordered_list() {
        assert_eq!(
            convert("<ol><li>first</li><li>second</li></ol>"),
            "1. first\n2. second"
        );
    }

    #[test]
    fn nested_list() {
        let html = "<ul><li>outer<ul><li>inner</li></ul></li></ul>";
        let md = convert(html);
        assert!(md.contains("- outer"));
        // Nested items are indented by two spaces per level.
        assert!(md.contains("  - inner"));
    }

    #[test]
    fn nested_list_exact_indent_depth_2() {
        // At depth 2, `list_indent` returns `"  "` (exactly two spaces, one
        // indent level). The exact match catches indent-arithmetic mistakes:
        //   - `(depth - 1)` → `(depth + 1)` would produce 3 indent levels (6 spaces).
        //   - `(depth - 1)` → `(depth / 1)` would produce 2 indent levels (4 spaces).
        // (The converter emits a blank line before each nested list — that's a
        // separate stylistic question; the *indent* is what we're pinning down here.)
        assert_eq!(
            convert("<ul><li>A<ul><li>B</li></ul></li></ul>"),
            "- A\n\n  - B"
        );
    }

    #[test]
    fn triple_nested_list_exact_indent_depth_3() {
        // At depth 3, indent is exactly `"    "` (four spaces).
        assert_eq!(
            convert("<ul><li>A<ul><li>B<ul><li>C</li></ul></li></ul></li></ul>"),
            "- A\n\n  - B\n\n    - C"
        );
    }

    #[test]
    fn sibling_top_level_lists_have_no_indent_after_nesting() {
        // After a nested <ul> closes, the unordered-list branch must decrement
        // `list_depth` to return to the outer scope. If it did not, the depth
        // would stay elevated and the SECOND top-level list would end up
        // incorrectly indented.
        let md = convert(
            "<ul><li>A<ul><li>B</li></ul></li></ul><ul><li>C</li></ul>",
        );
        // The second list's "C" item must appear at column 0, not indented.
        // We check the exact substring "\n- C" (newline then no leading whitespace).
        assert!(
            md.contains("\n- C"),
            "second top-level list must not be indented after a nested list closes; got: {md:?}"
        );
        // And explicitly: it must NOT appear with a level of indentation.
        assert!(
            !md.contains("\n  - C"),
            "second list incorrectly indented; got: {md:?}"
        );
    }

    #[test]
    fn ordered_list_decrements_depth_after_nesting() {
        // Same shape but with <ol> — exercises the depth decrement in the
        // ordered-list branch, which is separate from the unordered one.
        let md = convert(
            "<ol><li>A<ol><li>B</li></ol></li></ol><ol><li>C</li></ol>",
        );
        assert!(md.contains("\n1. C"), "second ol must restart at depth 1: {md:?}");
        assert!(!md.contains("\n  1. C"), "second ol indented incorrectly: {md:?}");
    }

    #[test]
    fn blockquote() {
        assert_eq!(convert("<blockquote>quoted text</blockquote>"), "> quoted text");
    }

    #[test]
    fn nested_blockquote() {
        let html = "<blockquote>outer<blockquote>inner</blockquote></blockquote>";
        let md = convert(html);
        assert!(md.contains("> outer"));
        assert!(md.contains("> > inner"));
    }

    #[test]
    fn preformatted() {
        let html = "<pre><code>fn main() {\n    println!(\"hi\");\n}</code></pre>";
        let md = convert(html);
        assert!(md.starts_with("```\n"));
        assert!(md.contains("fn main()"));
        assert!(md.ends_with("\n```"));
    }

    #[test]
    fn horizontal_rule() {
        assert_eq!(convert("<p>above</p><hr><p>below</p>"), "above\n\n---\n\nbelow");
    }

    #[test]
    fn br_tag() {
        assert_eq!(convert("line one<br>line two"), "line one\nline two");
    }

    #[test]
    fn strikethrough() {
        assert_eq!(convert("<del>removed</del>"), "~~removed~~");
    }

    #[test]
    fn inline_code() {
        assert_eq!(convert("use <code>pter</code> here"), "use `pter` here");
    }

    #[test]
    fn script_and_style_stripped() {
        assert_eq!(
            convert("<p>text</p><script>alert('x')</script><style>.x{}</style>"),
            "text"
        );
    }

    #[test]
    fn unknown_elements_transparent() {
        assert_eq!(convert("<span>hello</span>"), "hello");
    }

    #[test]
    fn hidden_element_skipped() {
        assert_eq!(
            convert(r#"<p>visible</p><div style="display:none">hidden</div>"#),
            "visible"
        );
    }

    #[test]
    fn whitespace_collapsed() {
        // Leading, trailing and repeated inner whitespace must all collapse.
        assert_eq!(convert("  lots   of   space  "), "lots of space");
    }

    #[test]
    fn entities_decoded() {
        // html5ever decodes entities during parsing
        assert_eq!(convert("<p>&amp; &lt; &gt; &quot;</p>"), "& < > \"");
    }

    #[test]
    fn sup_and_sub() {
        assert_eq!(convert("x<sup>2</sup>"), "x^2");
        assert_eq!(convert("H<sub>2</sub>O"), "H~2O");
    }

    // -- Div / section as block separator --

    #[test]
    fn div_separates_blocks() {
        assert_eq!(convert("<div>one</div><div>two</div>"), "one\n\ntwo");
    }

    // -- Tables --

    #[test]
    fn layout_table_single_cell_unwrapped() {
        let html = "<table><tr><td><p>Hello world</p></td></tr></table>";
        assert_eq!(convert(html), "Hello world");
    }

    #[test]
    fn layout_table_multi_column_linearized() {
        let html = "<table><tr><td>Left</td><td>Right</td></tr></table>";
        let md = convert(html);
        assert!(md.contains("Left"));
        assert!(md.contains("Right"));
    }

    #[test]
    fn data_table_rendered_as_markdown() {
        let html = "<table><tr><th>Name</th><th>Age</th></tr>\
                    <tr><td>Alice</td><td>30</td></tr>\
                    <tr><td>Bob</td><td>25</td></tr></table>";
        let md = convert(html);
        assert!(md.contains("| Name | Age |"));
        assert!(md.contains("| --- | --- |"));
        assert!(md.contains("| Alice | 30 |"));
        assert!(md.contains("| Bob | 25 |"));
    }

    #[test]
    fn nested_layout_tables_unwrapped() {
        let html = "<table><tr><td>\
                    <table><tr><td>Inner content</td></tr></table>\
                    </td></tr></table>";
        let md = convert(html);
        assert!(md.contains("Inner content"));
        assert!(!md.contains("|"));
    }

    #[test]
    fn presentation_role_is_layout() {
        let html = r#"<table role="presentation"><tr><td>Content</td><td>&nbsp;</td></tr></table>"#;
        let md = convert(html);
        assert!(md.contains("Content"));
        assert!(!md.contains("|"));
    }

    #[test]
    fn spacer_element_hidden() {
        let html = r#"<p>real</p><div style="font-size:0">spacer</div><p>also real</p>"#;
        let md = convert(html);
        assert!(md.contains("real"));
        assert!(!md.contains("spacer"));
        assert!(md.contains("also real"));
    }

    // -- Combined --

    #[test]
    fn mixed_content() {
        let html = r#"
            <h1>Subject</h1>
            <p>Hello <strong>Max</strong>,</p>
            <p>Check out <a href="https://example.com">this link</a>.</p>
            <ul>
                <li>Item one</li>
                <li>Item two</li>
            </ul>
        "#;
        let md = convert(html);
        assert!(md.starts_with("# Subject"));
        assert!(md.contains("Hello **Max**,"));
        assert!(md.contains("[this link](https://example.com)"));
        assert!(md.contains("- Item one\n- Item two"));
    }
}
746