use scraper::node::Node; use scraper::{ElementRef, Html}; use crate::elements::{self, BlockKind, ElementAction, InlineKind}; use crate::replies; use crate::tables; use crate::whitespace; /// Convert an HTML email body into readable markdown. /// /// This is the main entry point for pter. Pass in an HTML string /// (just the body, not MIME structure) and get back clean markdown. /// /// ``` /// let md = pter::convert("

Hello world

"); /// assert_eq!(md, "Hello **world**"); /// ``` pub fn convert(html: &str) -> String { if html.is_empty() { return String::new(); } let document = Html::parse_document(html); let mut ctx = Context::new(); walk_children(document.root_element(), &mut ctx); whitespace::normalize(&ctx.output) } /// Conversion state threaded through the tree walk. struct Context { output: String, /// Current list nesting depth (for indentation). list_depth: u32, /// Whether we're inside a
 block (preserve whitespace).
    in_pre: bool,
    /// Whether we're inside an  tag (don't nest links).
    in_link: bool,
    /// Stack of list types for proper ordered/unordered rendering.
    list_stack: Vec,
}

/// Kind of list an item belongs to; decides which marker the item gets.
#[derive(Clone, Copy)]
enum ListType {
    /// Bulleted list.
    Unordered,
    /// Numbered list. The payload is the current item number.
    Ordered(u32),
}

impl Context {
    fn new() -> Self {
        Self {
            output: String::with_capacity(4096),
            list_depth: 0,
            in_pre: false,
            in_link: false,
            list_stack: Vec::new(),
        }
    }

    fn push(&mut self, s: &str) {
        self.output.push_str(s);
    }

    fn push_char(&mut self, c: char) {
        self.output.push(c);
    }

    fn ensure_blank_line(&mut self) {
        let trimmed = self.output.trim_end_matches(' ');
        if trimmed.is_empty() {
            return;
        }
        if trimmed.ends_with("\n\n") {
            return;
        }
        self.output.truncate(trimmed.len());
        self.output.push_str("\n\n");
    }

    fn ensure_newline(&mut self) {
        if !self.output.is_empty() && !self.output.ends_with('\n') {
            self.output.push('\n');
        }
    }

    fn list_indent(&self) -> String {
        if self.list_depth <= 1 {
            return String::new();
        }
        "  ".repeat((self.list_depth - 1) as usize)
    }
}

/// Convert every direct child of `parent` to markdown, in document order.
///
/// Text nodes go to `handle_text`, element nodes to `handle_element`;
/// every other node kind is ignored.
fn walk_children(parent: ElementRef, ctx: &mut Context) {
    for child in parent.children() {
        if let Node::Text(text) = child.value() {
            handle_text(&text.text, ctx);
        } else if let Some(el_ref) = ElementRef::wrap(child) {
            // `wrap` only succeeds for element nodes, so anything else
            // falls through untouched.
            handle_element(el_ref, ctx);
        }
    }
}

/// Append a text node to the output.
///
/// While `ctx.in_pre` is set the text is copied verbatim. Otherwise each
/// run of ASCII whitespace becomes a single space, and a run is dropped
/// entirely when the output already ends in a space or newline.
fn handle_text(text: &str, ctx: &mut Context) {
    if ctx.in_pre {
        ctx.push(text);
        return;
    }

    // True while the most recently emitted character is whitespace, i.e.
    // another separator would be redundant.
    let mut at_gap = matches!(ctx.output.chars().next_back(), Some(' ' | '\n'));
    for ch in text.chars() {
        let is_ws = ch.is_ascii_whitespace();
        if !is_ws {
            ctx.push_char(ch);
        } else if !at_gap {
            ctx.push_char(' ');
        }
        at_gap = is_ws;
    }
}

/// Handle an element node — classify it and render accordingly.
///
/// Checks run in this order, and the first match wins:
/// 1. hidden elements are dropped together with their subtree;
/// 2. reply boundaries are rendered as quoted blocks;
/// 3. Outlook-style separator blocks are flattened to one line of text;
/// 4. everything else is dispatched on `elements::classify`.
fn handle_element(el: ElementRef, ctx: &mut Context) {
    let element = el.value();

    // Check hidden elements
    if elements::is_hidden(element) {
        return;
    }

    // Check for reply boundaries before normal classification.
    // Reply boundaries (gmail_quote, type=cite, etc.) get rendered
    // as blockquotes regardless of their actual element type.
    if replies::is_reply_boundary(el) {
        render_reply_block(el, ctx);
        return;
    }

    // Check for Outlook-style "From: ... Sent: ..." separator blocks.
    // These introduce quoted content that follows them.
    if replies::is_outlook_separator(el) {
        ctx.ensure_blank_line();
        // Render the separator header as attribution: all descendant text,
        // with every whitespace run squeezed to a single space.
        let text: String = el.text().collect();
        let trimmed = text.split_whitespace().collect::<Vec<_>>().join(" ");
        ctx.push(&trimmed);
        ctx.ensure_blank_line();
        return;
    }

    match elements::classify(element) {
        ElementAction::Skip => {}
        ElementAction::Transparent => walk_children(el, ctx),
        ElementAction::Block(kind) => handle_block(el, ctx, kind),
        ElementAction::Inline(kind) => handle_inline(el, ctx, kind),
    }
}

fn handle_block(el: ElementRef, ctx: &mut Context, kind: BlockKind) {
    match kind {
        BlockKind::Paragraph => {
            ctx.ensure_blank_line();
            walk_children(el, ctx);
            ctx.ensure_blank_line();
        }

        BlockKind::Heading(level) => {
            ctx.ensure_blank_line();
            let prefix = "#".repeat(level as usize);
            ctx.push(&prefix);
            ctx.push_char(' ');
            walk_children(el, ctx);
            ctx.ensure_blank_line();
        }

        BlockKind::Blockquote => {
            ctx.ensure_blank_line();
            // Render children into a temporary buffer, then prefix each line with >
            let mut inner_ctx = Context::new();
            inner_ctx.in_pre = ctx.in_pre;
            inner_ctx.in_link = ctx.in_link;
            walk_children(el, &mut inner_ctx);
            let inner = whitespace::normalize(&inner_ctx.output);
            for line in inner.lines() {
                ctx.push("> ");
                ctx.push(line);
                ctx.push_char('\n');
            }
            ctx.push_char('\n');
        }

        BlockKind::UnorderedList => {
            ctx.ensure_blank_line();
            ctx.list_depth += 1;
            ctx.list_stack.push(ListType::Unordered);
            walk_children(el, ctx);
            ctx.list_stack.pop();
            ctx.list_depth -= 1;
            ctx.ensure_blank_line();
        }

        BlockKind::OrderedList => {
            ctx.ensure_blank_line();
            ctx.list_depth += 1;
            ctx.list_stack.push(ListType::Ordered(0));
            walk_children(el, ctx);
            ctx.list_stack.pop();
            ctx.list_depth -= 1;
            ctx.ensure_blank_line();
        }

        BlockKind::ListItem => {
            ctx.ensure_newline();
            let indent = ctx.list_indent();
            ctx.push(&indent);

            // Determine bullet or number
            let marker = match ctx.list_stack.last_mut() {
                Some(ListType::Unordered) => "- ".to_string(),
                Some(ListType::Ordered(n)) => {
                    *n += 1;
                    format!("{}. ", *n)
                }
                None => "- ".to_string(),
            };
            ctx.push(&marker);
            walk_children(el, ctx);
            ctx.ensure_newline();
        }

        BlockKind::PreFormatted => {
            ctx.ensure_blank_line();
            ctx.push("```\n");
            ctx.in_pre = true;
            walk_children(el, ctx);
            ctx.in_pre = false;
            ctx.ensure_newline();
            ctx.push("```");
            ctx.ensure_blank_line();
        }

        BlockKind::HorizontalRule => {
            ctx.ensure_blank_line();
            ctx.push("---");
            ctx.ensure_blank_line();
        }

        BlockKind::Table => {
            ctx.ensure_blank_line();
            if tables::is_data_table(el) {
                let (headers, rows) = tables::extract_table_data(el);
                let md = tables::render_markdown_table(&headers, &rows);
                if !md.is_empty() {
                    ctx.push(&md);
                }
            } else {
                // Layout table — unwrap and render cell contents directly
                render_layout_table(el, ctx);
            }
            ctx.ensure_blank_line();
        }

        BlockKind::Div => {
            // Divs act as block separators but don't add their own markup
            ctx.ensure_blank_line();
            walk_children(el, ctx);
            ctx.ensure_blank_line();
        }
    }
}

/// Render an inline element of the given `kind` into `ctx`.
///
/// Emphasis-like kinds wrap their children in the matching markdown
/// delimiters; links and images are rendered from their attributes;
/// superscript and subscript only get a leading `^` / `~` marker.
fn handle_inline(el: ElementRef, ctx: &mut Context, kind: InlineKind) {
    match kind {
        InlineKind::Bold => {
            ctx.push("**");
            walk_children(el, ctx);
            ctx.push("**");
        }

        InlineKind::Italic => {
            ctx.push("*");
            walk_children(el, ctx);
            ctx.push("*");
        }

        InlineKind::Strikethrough => {
            ctx.push("~~");
            walk_children(el, ctx);
            ctx.push("~~");
        }

        InlineKind::Code => {
            if ctx.in_pre {
                // Inside a `<pre>`, don't double-wrap
                walk_children(el, ctx);
            } else {
                ctx.push("`");
                walk_children(el, ctx);
                ctx.push("`");
            }
        }

        InlineKind::Link => {
            if ctx.in_link {
                // Don't nest links
                walk_children(el, ctx);
                return;
            }

            let href = el.value().attr("href").unwrap_or("");

            // Without a usable target there is nothing to link to; keep
            // just the text.
            if href.is_empty() || href == "#" {
                walk_children(el, ctx);
                return;
            }

            // Collect the link text in a scratch context. `in_pre` is
            // carried over so that code and whitespace inside a link
            // inside `<pre>` are treated like the rest of the block.
            let mut text_ctx = Context::new();
            text_ctx.in_link = true;
            text_ctx.in_pre = ctx.in_pre;
            walk_children(el, &mut text_ctx);
            let text = text_ctx.output.trim();

            if text.is_empty() || text == href {
                // No text, or text identical to the URL — the bare URL
                // says everything.
                ctx.push(href);
            } else {
                ctx.push("[");
                ctx.push(text);
                ctx.push("](");
                ctx.push(href);
                ctx.push(")");
            }
        }

        InlineKind::Image => {
            let element = el.value();
            if elements::is_tracking_pixel(element) {
                return;
            }

            let alt = element.attr("alt").unwrap_or("");
            let src = element.attr("src").unwrap_or("");

            // An image without a source cannot be referenced.
            if src.is_empty() {
                return;
            }

            ctx.push("![");
            ctx.push(alt);
            ctx.push("](");
            ctx.push(src);
            ctx.push(")");
        }

        InlineKind::LineBreak => {
            ctx.push_char('\n');
        }

        // Only a leading marker is emitted ("x^2", "H~2O"); there is no
        // closing delimiter.
        InlineKind::Superscript => {
            ctx.push("^");
            walk_children(el, ctx);
        }

        InlineKind::Subscript => {
            ctx.push("~");
            walk_children(el, ctx);
        }
    }
}

/// Render a reply boundary as a quoted block.
///
/// This is the same rendering logic as `
` — children are /// rendered into a temporary buffer and each line gets `> ` prefixed. /// Attribution lines (e.g. "On ... wrote:") are rendered above the quote. fn render_reply_block(el: ElementRef, ctx: &mut Context) { ctx.ensure_blank_line(); // Look for attribution text if let Some(attribution) = replies::find_attribution(el) { ctx.push(&attribution); ctx.push_char('\n'); } // Render children into temp buffer, then prefix with > let mut inner_ctx = Context::new(); inner_ctx.in_pre = ctx.in_pre; inner_ctx.in_link = ctx.in_link; walk_children(el, &mut inner_ctx); let inner = whitespace::normalize(&inner_ctx.output); if !inner.is_empty() { for line in inner.lines() { ctx.push("> "); ctx.push(line); ctx.push_char('\n'); } ctx.push_char('\n'); } } /// Unwrap a layout table by rendering cell contents sequentially. /// /// Walks through rows and cells, rendering each cell's content as if /// the table wrapper didn't exist. This handles the common email pattern /// of wrapping everything in `
...
`. fn render_layout_table(table: ElementRef, ctx: &mut Context) { for descendant in table.descendants() { if let Some(el_ref) = ElementRef::wrap(descendant) { let name = el_ref.value().name(); if name == "td" || name == "th" { // Check if the cell itself is hidden if !elements::is_hidden(el_ref.value()) { walk_children(el_ref, ctx); ctx.ensure_blank_line(); } } } } } #[cfg(test)] mod tests { use super::*; // -- Basic elements -- #[test] fn empty_input() { assert_eq!(convert(""), ""); } #[test] fn plain_text() { assert_eq!(convert("hello world"), "hello world"); } #[test] fn paragraph() { assert_eq!(convert("

one

two

"), "one\n\ntwo"); } #[test] fn headings() { assert_eq!(convert("

Title

"), "# Title"); assert_eq!(convert("

Sub

"), "### Sub"); } #[test] fn bold_and_italic() { assert_eq!( convert("

bold and italic

"), "**bold** and *italic*" ); } #[test] fn link() { assert_eq!( convert(r#"
click"#), "[click](https://example.com)" ); } #[test] fn link_text_matches_url() { assert_eq!( convert(r#"https://example.com"#), "https://example.com" ); } #[test] fn link_empty_href() { assert_eq!(convert(r#"click"#), "click"); } #[test] fn image() { assert_eq!( convert(r#"A photo"#), "![A photo](photo.jpg)" ); } #[test] fn tracking_pixel_skipped() { assert_eq!(convert(r#""#), ""); } #[test] fn unordered_list() { assert_eq!( convert("
  • one
  • two
"), "- one\n- two" ); } #[test] fn ordered_list() { assert_eq!( convert("
  1. first
  2. second
"), "1. first\n2. second" ); } #[test] fn nested_list() { let html = "
  • outer
    • inner
"; let md = convert(html); assert!(md.contains("- outer")); assert!(md.contains(" - inner")); } #[test] fn nested_list_exact_indent_depth_2() { // At depth 2, `list_indent` returns `" "` (exactly two spaces, one indent level). // Catches `list_indent` mutations: // - `(depth - 1)` → `(depth + 1)`: would produce 3 indent levels (6 spaces). // - `(depth - 1)` → `(depth / 1)`: would produce 2 indent levels (4 spaces). // Either makes this exact-match assertion fail. // (The converter emits a blank line before each nested list — that's a // separate stylistic question; the *indent* is what we're pinning down here.) assert_eq!( convert("
  • A
    • B
"), "- A\n\n - B" ); } #[test] fn triple_nested_list_exact_indent_depth_3() { // At depth 3, indent is exactly `" "` (four spaces). assert_eq!( convert("
  • A
    • B
      • C
"), "- A\n\n - B\n\n - C" ); } #[test] fn sibling_top_level_lists_have_no_indent_after_nesting() { // After a nested
    closes, `list_depth -= 1` must execute to return // to outer scope. If mutated to `+= 1` or `/= 1`, list_depth stays // elevated and the SECOND top-level list ends up incorrectly indented. let md = convert( "
    • A
      • B
    • C
    ", ); // The second list's "C" item must appear at column 0, not indented. // We check the exact substring "\n- C" (newline then no leading whitespace). assert!( md.contains("\n- C"), "second top-level list must not be indented after a nested list closes; got: {md:?}" ); // And explicitly: it must NOT appear with leading spaces. assert!( !md.contains("\n - C"), "second list incorrectly indented; got: {md:?}" ); } #[test] fn ordered_list_decrements_depth_after_nesting() { // Same shape but with
      — exercises the L218 `-= 1` mutation in the // OrderedList block, distinct from UnorderedList's L208. let md = convert( "
      1. A
        1. B
      1. C
      ", ); assert!(md.contains("\n1. C"), "second ol must restart at depth 1: {md:?}"); assert!(!md.contains("\n 1. C"), "second ol indented incorrectly: {md:?}"); } #[test] fn blockquote() { assert_eq!(convert("
      quoted text
      "), "> quoted text"); } #[test] fn nested_blockquote() { let html = "
      outer
      inner
      "; let md = convert(html); assert!(md.contains("> outer")); assert!(md.contains("> > inner")); } #[test] fn preformatted() { let html = "
      fn main() {\n    println!(\"hi\");\n}
      "; let md = convert(html); assert!(md.starts_with("```\n")); assert!(md.contains("fn main()")); assert!(md.ends_with("\n```")); } #[test] fn horizontal_rule() { assert_eq!(convert("

      above


      below

      "), "above\n\n---\n\nbelow"); } #[test] fn br_tag() { assert_eq!(convert("line one
      line two"), "line one\nline two"); } #[test] fn strikethrough() { assert_eq!(convert("removed"), "~~removed~~"); } #[test] fn inline_code() { assert_eq!(convert("use pter here"), "use `pter` here"); } #[test] fn script_and_style_stripped() { assert_eq!( convert("

      text

      "), "text" ); } #[test] fn unknown_elements_transparent() { assert_eq!(convert("hello"), "hello"); } #[test] fn hidden_element_skipped() { assert_eq!( convert(r#"

      visible

      hidden
      "#), "visible" ); } #[test] fn whitespace_collapsed() { assert_eq!(convert(" lots of space "), "lots of space"); } #[test] fn entities_decoded() { // html5ever decodes entities during parsing assert_eq!(convert("

      & < > "

      "), "& < > \""); } #[test] fn sup_and_sub() { assert_eq!(convert("x2"), "x^2"); assert_eq!(convert("H2O"), "H~2O"); } // -- Div / section as block separator -- #[test] fn div_separates_blocks() { assert_eq!(convert("
      one
      two
      "), "one\n\ntwo"); } // -- Tables -- #[test] fn layout_table_single_cell_unwrapped() { let html = "

      Hello world

      "; assert_eq!(convert(html), "Hello world"); } #[test] fn layout_table_multi_column_linearized() { let html = "
      LeftRight
      "; let md = convert(html); assert!(md.contains("Left")); assert!(md.contains("Right")); } #[test] fn data_table_rendered_as_markdown() { let html = "\ \
      NameAge
      Alice30
      Bob25
      "; let md = convert(html); assert!(md.contains("| Name | Age |")); assert!(md.contains("| --- | --- |")); assert!(md.contains("| Alice | 30 |")); assert!(md.contains("| Bob | 25 |")); } #[test] fn nested_layout_tables_unwrapped() { let html = "
      \
      Inner content
      \
      "; let md = convert(html); assert!(md.contains("Inner content")); assert!(!md.contains("|")); } #[test] fn presentation_role_is_layout() { let html = r#"
      Content 
      "#; let md = convert(html); assert!(md.contains("Content")); assert!(!md.contains("|")); } #[test] fn spacer_element_hidden() { let html = r#"

      real

      spacer

      also real

      "#; let md = convert(html); assert!(md.contains("real")); assert!(!md.contains("spacer")); assert!(md.contains("also real")); } // -- Combined -- #[test] fn mixed_content() { let html = r#"

      Subject

      Hello Max,

      Check out this link.

      • Item one
      • Item two
      "#; let md = convert(html); assert!(md.starts_with("# Subject")); assert!(md.contains("Hello **Max**,")); assert!(md.contains("[this link](https://example.com)")); assert!(md.contains("- Item one\n- Item two")); } }