use scraper::node::Node; use scraper::{ElementRef, Html}; use crate::elements::{self, BlockKind, ElementAction, InlineKind}; use crate::replies; use crate::tables; use crate::whitespace; /// Convert an HTML email body into readable markdown. /// /// This is the main entry point for pter. Pass in an HTML string /// (just the body, not MIME structure) and get back clean markdown. /// /// ``` /// let md = pter::convert("
Hello world
"); /// assert_eq!(md, "Hello **world**"); /// ``` pub fn convert(html: &str) -> String { if html.is_empty() { return String::new(); } let document = Html::parse_document(html); let mut ctx = Context::new(); walk_children(document.root_element(), &mut ctx); whitespace::normalize(&ctx.output) } /// Conversion state threaded through the tree walk. struct Context { output: String, /// Current list nesting depth (for indentation). list_depth: u32, /// Whether we're inside a block (preserve whitespace).
in_pre: bool,
/// Whether we're inside an tag (don't nest links).
in_link: bool,
/// Stack of list types for proper ordered/unordered rendering.
list_stack: Vec,
}
/// Kind of list an item belongs to; selects the marker written before it.
#[derive(Copy, Clone)]
enum ListType {
    /// Bulleted list.
    Unordered,
    /// Numbered list, carrying the current item number.
    Ordered(u32),
}
impl Context {
    /// Number of spaces of indentation per list nesting level beyond the first.
    ///
    /// Kept as a number (rather than a whitespace string literal) so the
    /// width is explicit and cannot be altered by whitespace reformatting.
    const INDENT_WIDTH: usize = 2;

    /// Create an empty context: no output, not inside any list, `<pre>` or link.
    fn new() -> Self {
        Self {
            output: String::with_capacity(4096),
            list_depth: 0,
            in_pre: false,
            in_link: false,
            list_stack: Vec::new(),
        }
    }

    /// Append a string slice to the output.
    fn push(&mut self, s: &str) {
        self.output.push_str(s);
    }

    /// Append a single character to the output.
    fn push_char(&mut self, c: char) {
        self.output.push(c);
    }

    /// Make sure the output ends with a blank line (`"\n\n"`).
    ///
    /// Trailing spaces are ignored when inspecting the tail. Nothing is
    /// written while the output is empty (or only spaces), so a document
    /// never starts with blank lines, and nothing is written if a blank line
    /// is already present.
    fn ensure_blank_line(&mut self) {
        let trimmed = self.output.trim_end_matches(' ');
        if trimmed.is_empty() {
            return;
        }
        if trimmed.ends_with("\n\n") {
            return;
        }
        // Drop the trailing spaces before terminating the block.
        self.output.truncate(trimmed.len());
        self.output.push_str("\n\n");
    }

    /// Make sure non-empty output ends with a newline.
    fn ensure_newline(&mut self) {
        if !self.output.is_empty() && !self.output.ends_with('\n') {
            self.output.push('\n');
        }
    }

    /// Indentation prefix for a list item at the current nesting depth.
    ///
    /// Depths 0 and 1 get no indentation; every further level adds
    /// `INDENT_WIDTH` (two) spaces.
    fn list_indent(&self) -> String {
        let extra_levels = self.list_depth.saturating_sub(1) as usize;
        " ".repeat(Self::INDENT_WIDTH * extra_levels)
    }
}
/// Convert every direct child of `parent` to markdown, in document order.
///
/// Element children are dispatched to `handle_element`, text children to
/// `handle_text`; any other node kind is ignored.
fn walk_children(parent: ElementRef, ctx: &mut Context) {
    for node in parent.children() {
        if let Some(child_el) = ElementRef::wrap(node) {
            handle_element(child_el, ctx);
        } else if let Node::Text(chunk) = node.value() {
            handle_text(&chunk.text, ctx);
        }
    }
}
/// Append the contents of a text node to the output.
///
/// Inside preformatted content the text is copied verbatim. Otherwise every
/// run of ASCII whitespace collapses to a single space, and a run is dropped
/// entirely when the output already ends in a space or newline.
fn handle_text(text: &str, ctx: &mut Context) {
    if ctx.in_pre {
        ctx.push(text);
        return;
    }

    // Seed the collapse state from whatever was emitted last.
    let mut prev_blank = matches!(ctx.output.chars().last(), Some(' ' | '\n'));
    for ch in text.chars() {
        let is_blank = ch.is_ascii_whitespace();
        if is_blank && prev_blank {
            continue;
        }
        ctx.push_char(if is_blank { ' ' } else { ch });
        prev_blank = is_blank;
    }
}
/// Handle an element node — classify it and render accordingly.
///
/// Checks run in this order, and the first match wins:
/// 1. hidden elements are dropped together with their subtree;
/// 2. reply boundaries are rendered as quoted blocks;
/// 3. Outlook-style separator blocks are flattened to one attribution line;
/// 4. everything else is dispatched on `elements::classify`.
fn handle_element(el: ElementRef, ctx: &mut Context) {
    let element = el.value();

    // Check hidden elements
    if elements::is_hidden(element) {
        return;
    }

    // Check for reply boundaries before normal classification.
    // Reply boundaries (gmail_quote, type=cite, etc.) get rendered
    // as blockquotes regardless of their actual element type.
    if replies::is_reply_boundary(el) {
        render_reply_block(el, ctx);
        return;
    }

    // Check for Outlook-style "From: ... Sent: ..." separator blocks.
    // These introduce quoted content that follows them.
    if replies::is_outlook_separator(el) {
        ctx.ensure_blank_line();
        // Render the separator header as attribution: gather all descendant
        // text first, then squeeze every whitespace run to a single space.
        let text: String = el.text().collect();
        let trimmed = text.split_whitespace().collect::<Vec<_>>().join(" ");
        ctx.push(&trimmed);
        ctx.ensure_blank_line();
        return;
    }

    match elements::classify(element) {
        ElementAction::Skip => {}
        ElementAction::Transparent => walk_children(el, ctx),
        ElementAction::Block(kind) => handle_block(el, ctx, kind),
        ElementAction::Inline(kind) => handle_inline(el, ctx, kind),
    }
}
/// Render a block-level element of the given `kind` into `ctx`.
///
/// Every block separates itself from surrounding content with blank lines
/// (list items use single newlines instead). Children are rendered
/// recursively via `walk_children`, except for tables, which are handed to
/// the `tables` module or unwrapped as layout tables.
fn handle_block(el: ElementRef, ctx: &mut Context, kind: BlockKind) {
    match kind {
        BlockKind::Paragraph => {
            ctx.ensure_blank_line();
            walk_children(el, ctx);
            ctx.ensure_blank_line();
        }
        BlockKind::Heading(level) => {
            ctx.ensure_blank_line();
            // One '#' per heading level, then a space before the title text.
            let prefix = "#".repeat(level as usize);
            ctx.push(&prefix);
            ctx.push_char(' ');
            walk_children(el, ctx);
            ctx.ensure_blank_line();
        }
        BlockKind::Blockquote => {
            ctx.ensure_blank_line();
            // Render children into a temporary buffer, then prefix each line with >
            let mut inner_ctx = Context::new();
            inner_ctx.in_pre = ctx.in_pre;
            inner_ctx.in_link = ctx.in_link;
            walk_children(el, &mut inner_ctx);
            let inner = whitespace::normalize(&inner_ctx.output);
            for line in inner.lines() {
                ctx.push("> ");
                ctx.push(line);
                ctx.push_char('\n');
            }
            ctx.push_char('\n');
        }
        BlockKind::UnorderedList => {
            ctx.ensure_blank_line();
            ctx.list_depth += 1;
            ctx.list_stack.push(ListType::Unordered);
            walk_children(el, ctx);
            ctx.list_stack.pop();
            ctx.list_depth -= 1;
            ctx.ensure_blank_line();
        }
        BlockKind::OrderedList => {
            ctx.ensure_blank_line();
            ctx.list_depth += 1;
            // Counter starts at 0; each item increments it before printing.
            ctx.list_stack.push(ListType::Ordered(0));
            walk_children(el, ctx);
            ctx.list_stack.pop();
            ctx.list_depth -= 1;
            ctx.ensure_blank_line();
        }
        BlockKind::ListItem => {
            ctx.ensure_newline();
            let indent = ctx.list_indent();
            ctx.push(&indent);
            // Determine bullet or number
            let marker = match ctx.list_stack.last_mut() {
                Some(ListType::Unordered) => "- ".to_string(),
                Some(ListType::Ordered(n)) => {
                    *n += 1;
                    format!("{}. ", *n)
                }
                // Item outside of any list: fall back to a bullet.
                None => "- ".to_string(),
            };
            ctx.push(&marker);
            walk_children(el, ctx);
            ctx.ensure_newline();
        }
        BlockKind::PreFormatted => {
            if ctx.in_pre {
                // Already inside a fence: a nested <pre> must neither open a
                // second fence (which would terminate the outer one) nor
                // clear the flag for the remainder of the outer block.
                walk_children(el, ctx);
            } else {
                ctx.ensure_blank_line();
                ctx.push("```\n");
                ctx.in_pre = true;
                walk_children(el, ctx);
                ctx.in_pre = false;
                ctx.ensure_newline();
                ctx.push("```");
                ctx.ensure_blank_line();
            }
        }
        BlockKind::HorizontalRule => {
            ctx.ensure_blank_line();
            ctx.push("---");
            ctx.ensure_blank_line();
        }
        BlockKind::Table => {
            ctx.ensure_blank_line();
            if tables::is_data_table(el) {
                let (headers, rows) = tables::extract_table_data(el);
                let md = tables::render_markdown_table(&headers, &rows);
                if !md.is_empty() {
                    ctx.push(&md);
                }
            } else {
                // Layout table — unwrap and render cell contents directly
                render_layout_table(el, ctx);
            }
            ctx.ensure_blank_line();
        }
        BlockKind::Div => {
            // Divs act as block separators but don't add their own markup
            ctx.ensure_blank_line();
            walk_children(el, ctx);
            ctx.ensure_blank_line();
        }
    }
}
fn handle_inline(el: ElementRef, ctx: &mut Context, kind: InlineKind) {
match kind {
InlineKind::Bold => {
ctx.push("**");
walk_children(el, ctx);
ctx.push("**");
}
InlineKind::Italic => {
ctx.push("*");
walk_children(el, ctx);
ctx.push("*");
}
InlineKind::Strikethrough => {
ctx.push("~~");
walk_children(el, ctx);
ctx.push("~~");
}
InlineKind::Code => {
if ctx.in_pre {
// Inside a , don't double-wrap
walk_children(el, ctx);
} else {
ctx.push("`");
walk_children(el, ctx);
ctx.push("`");
}
}
InlineKind::Link => {
if ctx.in_link {
// Don't nest links
walk_children(el, ctx);
return;
}
let href = el.value().attr("href").unwrap_or("");
if href.is_empty() || href == "#" {
walk_children(el, ctx);
return;
}
// Collect the link text
let mut text_ctx = Context::new();
text_ctx.in_link = true;
walk_children(el, &mut text_ctx);
let text = text_ctx.output.trim().to_string();
if text.is_empty() {
// Link with no text — just show the URL
ctx.push(href);
} else if text == href {
// Link text matches URL — no need for markdown link syntax
ctx.push(href);
} else {
ctx.push("[");
ctx.push(&text);
ctx.push("](");
ctx.push(href);
ctx.push(")");
}
}
InlineKind::Image => {
let element = el.value();
if elements::is_tracking_pixel(element) {
return;
}
let alt = element.attr("alt").unwrap_or("");
let src = element.attr("src").unwrap_or("");
if src.is_empty() {
return;
}
ctx.push(";
ctx.push(src);
ctx.push(")");
}
InlineKind::LineBreak => {
ctx.push_char('\n');
}
InlineKind::Superscript => {
ctx.push("^");
walk_children(el, ctx);
}
InlineKind::Subscript => {
ctx.push("~");
walk_children(el, ctx);
}
}
}
/// Render a reply boundary as a quoted block.
///
/// This is the same rendering logic as `` — children are
/// rendered into a temporary buffer and each line gets `> ` prefixed.
/// Attribution lines (e.g. "On ... wrote:") are rendered above the quote.
fn render_reply_block(el: ElementRef, ctx: &mut Context) {
ctx.ensure_blank_line();
// Look for attribution text
if let Some(attribution) = replies::find_attribution(el) {
ctx.push(&attribution);
ctx.push_char('\n');
}
// Render children into temp buffer, then prefix with >
let mut inner_ctx = Context::new();
inner_ctx.in_pre = ctx.in_pre;
inner_ctx.in_link = ctx.in_link;
walk_children(el, &mut inner_ctx);
let inner = whitespace::normalize(&inner_ctx.output);
if !inner.is_empty() {
for line in inner.lines() {
ctx.push("> ");
ctx.push(line);
ctx.push_char('\n');
}
ctx.push_char('\n');
}
}
/// Unwrap a layout table by rendering cell contents sequentially.
///
/// Walks through rows and cells, rendering each cell's content as if
/// the table wrapper didn't exist. This handles the common email pattern
/// of wrapping everything in `...
`.
fn render_layout_table(table: ElementRef, ctx: &mut Context) {
for descendant in table.descendants() {
if let Some(el_ref) = ElementRef::wrap(descendant) {
let name = el_ref.value().name();
if name == "td" || name == "th" {
// Check if the cell itself is hidden
if !elements::is_hidden(el_ref.value()) {
walk_children(el_ref, ctx);
ctx.ensure_blank_line();
}
}
}
}
}
#[cfg(test)]
mod tests {
    //! End-to-end tests for `convert`: each test feeds a small HTML snippet
    //! through the converter and checks the resulting markdown.

    use super::*;

    // -- Basic elements --

    #[test]
    fn empty_input() {
        assert_eq!(convert(""), "");
    }

    #[test]
    fn plain_text() {
        assert_eq!(convert("hello world"), "hello world");
    }

    #[test]
    fn paragraph() {
        assert_eq!(convert("<p>one</p><p>two</p>"), "one\n\ntwo");
    }

    #[test]
    fn headings() {
        assert_eq!(convert("<h1>Title</h1>"), "# Title");
        assert_eq!(convert("<h3>Sub</h3>"), "### Sub");
    }

    #[test]
    fn bold_and_italic() {
        assert_eq!(
            convert("<p><strong>bold</strong> and <em>italic</em></p>"),
            "**bold** and *italic*"
        );
    }

    #[test]
    fn link() {
        assert_eq!(
            convert(r#"<a href="https://example.com">click</a>"#),
            "[click](https://example.com)"
        );
    }

    #[test]
    fn link_text_matches_url() {
        assert_eq!(
            convert(r#"<a href="https://example.com">https://example.com</a>"#),
            "https://example.com"
        );
    }

    #[test]
    fn link_empty_href() {
        assert_eq!(convert(r#"<a href="">click</a>"#), "click");
    }

    #[test]
    fn image() {
        assert_eq!(
            convert(r#"<img src="https://example.com/img.png" alt="Photo">"#),
            "![Photo](https://example.com/img.png)"
        );
    }

    #[test]
    fn tracking_pixel_skipped() {
        // NOTE(review): assumes `elements::is_tracking_pixel` recognises 1x1
        // images — confirm against that module.
        assert_eq!(
            convert(r#"<img src="https://t.example.com/pixel.gif" width="1" height="1">"#),
            ""
        );
    }

    #[test]
    fn unordered_list() {
        assert_eq!(
            convert("<ul><li>one</li><li>two</li></ul>"),
            "- one\n- two"
        );
    }

    #[test]
    fn ordered_list() {
        assert_eq!(
            convert("<ol><li>first</li><li>second</li></ol>"),
            "1. first\n2. second"
        );
    }

    #[test]
    fn nested_list() {
        let html = "<ul><li>outer<ul><li>inner</li></ul></li></ul>";
        let md = convert(html);
        assert!(md.contains("- outer"));
        assert!(md.contains("  - inner"));
    }

    #[test]
    fn nested_list_exact_indent_depth_2() {
        // At depth 2, `list_indent` returns exactly two spaces (one indent level).
        // Catches `list_indent` mutations:
        // - `(depth - 1)` → `(depth + 1)`: would produce 3 indent levels (6 spaces).
        // - `(depth - 1)` → `(depth / 1)`: would produce 2 indent levels (4 spaces).
        // Either makes this exact-match assertion fail.
        // (The converter emits a blank line before each nested list — that's a
        // separate stylistic question; the *indent* is what we're pinning down here.)
        assert_eq!(
            convert("<ul><li>A<ul><li>B</li></ul></li></ul>"),
            "- A\n\n  - B"
        );
    }

    #[test]
    fn triple_nested_list_exact_indent_depth_3() {
        // At depth 3, indent is exactly four spaces.
        assert_eq!(
            convert("<ul><li>A<ul><li>B<ul><li>C</li></ul></li></ul></li></ul>"),
            "- A\n\n  - B\n\n    - C"
        );
    }

    #[test]
    fn sibling_top_level_lists_have_no_indent_after_nesting() {
        // After a nested <ul> closes, `list_depth -= 1` must execute to return
        // to outer scope. If mutated to `+= 1` or `/= 1`, list_depth stays
        // elevated and the SECOND top-level list ends up incorrectly indented.
        let md = convert(
            "<ul><li>A<ul><li>B</li></ul></li></ul><ul><li>C</li></ul>",
        );
        // The second list's "C" item must appear at column 0, not indented.
        // We check the exact substring "\n- C" (newline then no leading whitespace).
        assert!(
            md.contains("\n- C"),
            "second top-level list must not be indented after a nested list closes; got: {md:?}"
        );
        // And explicitly: it must NOT appear with leading spaces.
        assert!(
            !md.contains("\n  - C"),
            "second list incorrectly indented; got: {md:?}"
        );
    }

    #[test]
    fn ordered_list_decrements_depth_after_nesting() {
        // Same shape but with <ol> — exercises the `list_depth -= 1` in the
        // OrderedList arm, which is distinct from the one in UnorderedList.
        let md = convert(
            "<ol><li>A<ol><li>B</li></ol></li></ol><ol><li>C</li></ol>",
        );
        assert!(md.contains("\n1. C"), "second ol must restart at depth 1: {md:?}");
        assert!(!md.contains("\n  1. C"), "second ol indented incorrectly: {md:?}");
    }

    #[test]
    fn blockquote() {
        assert_eq!(convert("<blockquote>quoted text</blockquote>"), "> quoted text");
    }

    #[test]
    fn nested_blockquote() {
        let html = "<blockquote>outer<blockquote>inner</blockquote></blockquote>";
        let md = convert(html);
        assert!(md.contains("> outer"));
        assert!(md.contains("> > inner"));
    }

    #[test]
    fn preformatted() {
        let html = "<pre>fn main() {\n    println!(\"hi\");\n}</pre>";
        let md = convert(html);
        assert!(md.starts_with("```\n"));
        assert!(md.contains("fn main()"));
        assert!(md.ends_with("\n```"));
    }

    #[test]
    fn horizontal_rule() {
        assert_eq!(
            convert("<p>above</p><hr><p>below</p>"),
            "above\n\n---\n\nbelow"
        );
    }

    #[test]
    fn br_tag() {
        assert_eq!(convert("line one<br>line two"), "line one\nline two");
    }

    #[test]
    fn strikethrough() {
        assert_eq!(convert("<del>removed</del>"), "~~removed~~");
    }

    #[test]
    fn inline_code() {
        assert_eq!(convert("use <code>pter</code> here"), "use `pter` here");
    }

    #[test]
    fn script_and_style_stripped() {
        assert_eq!(
            convert("<script>alert('x')</script><style>.a { color: red; }</style><p>text</p>"),
            "text"
        );
    }

    #[test]
    fn unknown_elements_transparent() {
        assert_eq!(convert("<custom-tag>hello</custom-tag>"), "hello");
    }

    #[test]
    fn hidden_element_skipped() {
        assert_eq!(
            convert(r#"<p>visible</p><div style="display:none">hidden</div>"#),
            "visible"
        );
    }

    #[test]
    fn whitespace_collapsed() {
        assert_eq!(convert("  lots   of   space  "), "lots of space");
    }

    #[test]
    fn entities_decoded() {
        // html5ever decodes entities during parsing
        assert_eq!(convert("<p>&amp; &lt; &gt; &quot;</p>"), "& < > \"");
    }

    #[test]
    fn sup_and_sub() {
        assert_eq!(convert("x<sup>2</sup>"), "x^2");
        assert_eq!(convert("H<sub>2</sub>O"), "H~2O");
    }

    // -- Div / section as block separator --

    #[test]
    fn div_separates_blocks() {
        assert_eq!(convert("<div>one</div><div>two</div>"), "one\n\ntwo");
    }

    // -- Tables --

    #[test]
    fn layout_table_single_cell_unwrapped() {
        let html = "<table><tr><td>Hello world</td></tr></table>";
        assert_eq!(convert(html), "Hello world");
    }

    #[test]
    fn layout_table_multi_column_linearized() {
        let html = "<table><tr><td>Left</td><td>Right</td></tr></table>";
        let md = convert(html);
        assert!(md.contains("Left"));
        assert!(md.contains("Right"));
    }

    #[test]
    fn data_table_rendered_as_markdown() {
        let html = "<table><tr><th>Name</th><th>Age</th></tr>\
                    <tr><td>Alice</td><td>30</td></tr>\
                    <tr><td>Bob</td><td>25</td></tr></table>";
        let md = convert(html);
        assert!(md.contains("| Name | Age |"));
        assert!(md.contains("| --- | --- |"));
        assert!(md.contains("| Alice | 30 |"));
        assert!(md.contains("| Bob | 25 |"));
    }

    #[test]
    fn nested_layout_tables_unwrapped() {
        let html = "<table><tr><td>\
                    <table><tr><td>Inner content</td></tr></table>\
                    </td></tr></table>";
        let md = convert(html);
        assert!(md.contains("Inner content"));
        assert!(!md.contains("|"));
    }

    #[test]
    fn presentation_role_is_layout() {
        let html = r#"<table role="presentation"><tr><td>Content</td></tr></table>"#;
        let md = convert(html);
        assert!(md.contains("Content"));
        assert!(!md.contains("|"));
    }

    #[test]
    fn spacer_element_hidden() {
        // NOTE(review): the exact hiding style is an assumption — confirm it
        // matches one of the patterns `elements::is_hidden` recognises.
        let html = r#"<p>real</p><span style="font-size:0;max-height:0;overflow:hidden;mso-hide:all">spacer</span><p>also real</p>"#;
        let md = convert(html);
        assert!(md.contains("real"));
        assert!(!md.contains("spacer"));
        assert!(md.contains("also real"));
    }

    // -- Combined --

    #[test]
    fn mixed_content() {
        let html = r#"
<h1>Subject</h1>
<p>Hello <strong>Max</strong>,</p>
<p>Check out <a href="https://example.com">this link</a>.</p>
<ul>
<li>Item one</li>
<li>Item two</li>
</ul>
"#;
        let md = convert(html);
        assert!(md.starts_with("# Subject"));
        assert!(md.contains("Hello **Max**,"));
        assert!(md.contains("[this link](https://example.com)"));
        assert!(md.contains("- Item one\n- Item two"));
    }
}