use scraper::node::Node; use scraper::ElementRef; /// Check if an element marks the beginning of a quoted reply. /// /// This is the central abstraction for reply detection. Rather than /// building per-client logic throughout the converter, all client-specific /// knowledge lives here behind a single predicate. /// /// An element is a reply boundary if it's a container that wraps quoted /// content from a previous message in the thread. The converter treats /// these identically to `
` — children get `>` prefixed. pub fn is_reply_boundary(el: ElementRef) -> bool { let element = el.value(); let name = element.name(); //
is already handled by the element classifier. // This function catches non-blockquote reply wrappers. // Structural: elements with type="cite" (Apple Mail, some webmail) if element.attr("type") == Some("cite") { return true; } // Class/ID-based detection — thin per-client checks if element.attr("class").is_some_and(is_reply_class) { return true; } if element.attr("id").is_some_and(is_reply_id) { return true; } // Heuristic: a
whose first meaningful text child matches // an attribution pattern ("On ... wrote:") followed by a blockquote if name == "div" && has_attribution_then_quote(el) { return true; } false } /// Extract an attribution line from just before or at the start of a reply boundary. /// /// Returns the attribution text (e.g. "On Mon, Jan 5, Alice wrote:") if found, /// so the converter can render it above the quoted block. pub fn find_attribution(el: ElementRef) -> Option { // Check the element's own leading text for attribution patterns for child in el.children() { match child.value() { Node::Text(text) => { let trimmed = text.text.trim(); if is_attribution_text(trimmed) { return Some(trimmed.to_string()); } } Node::Element(_) => { // Stop at the first child element — attribution is leading text break; } _ => {} } } // Check for a preceding sibling text node or element with attribution if let Some(prev) = previous_sibling_text(el) { let trimmed = prev.trim().to_string(); if is_attribution_text(&trimmed) { return Some(trimmed); } } None } /// Check if text matches common email attribution patterns. /// /// These patterns are cross-client — every email client generates some /// variant of "On [date], [person] wrote:" or "--- Forwarded message ---". fn is_attribution_text(text: &str) -> bool { let t = text.trim(); // "On ... wrote:" (Gmail, Apple Mail, Thunderbird, most clients) if t.starts_with("On ") && t.ends_with("wrote:") { return true; } // Localized variants: "Le ... a écrit :" (French), "Am ... schrieb" (German) if (t.starts_with("Le ") || t.starts_with("El ")) && (t.ends_with("crit :") || t.ends_with("crit:")) { return true; } if t.starts_with("Am ") && (t.ends_with("schrieb:") || t.ends_with("schrieb :")) { return true; } // Forwarded message separators if t.contains("Forwarded message") || t.contains("Begin forwarded message") || t.contains("Original Message") { return true; } false } /// Thin per-client class checks. Each is one line — easy to add new clients. fn is_reply_class(class: &str) -> bool { // Split on whitespace to check individual class names class.split_whitespace().any(|c| { matches!( c, "gmail_quote" | "gmail_extra" | "yahoo_quoted" | "protonmail_quote" | "tutanota_quote" | "moz-cite-prefix" // Thunderbird | "zmail_extra" // Zoho | "WordSection1" // Outlook (sometimes wraps replies) ) }) } /// Thin per-client ID checks. fn is_reply_id(id: &str) -> bool { matches!( id, "divRplyFwdMsg" // Outlook | "reply-message" // Generic | "OLK_SRC_BODY_SECTION" // Outlook Mac ) } /// Check if a div contains attribution text followed by a blockquote. /// /// This catches the common pattern where no class/id is present but /// the structure is: `
On ... wrote:
...
` fn has_attribution_then_quote(el: ElementRef) -> bool { let mut found_attribution = false; for child in el.children() { match child.value() { Node::Text(text) => { if is_attribution_text(text.text.trim()) { found_attribution = true; } } Node::Element(e) => { if found_attribution && e.name() == "blockquote" { return true; } // Skip
tags between attribution and blockquote if e.name() != "br" { // If we hit a non-br element before finding attribution, stop if !found_attribution { return false; } } } _ => {} } } false } /// Get text from the previous sibling, if it exists and is a text or inline element. fn previous_sibling_text(el: ElementRef) -> Option { let prev = el.prev_sibling()?; match prev.value() { Node::Text(text) => Some(text.text.to_string()), Node::Element(e) => { // Check inline elements like , that might wrap attribution if matches!(e.name(), "span" | "font" | "b" | "i" | "div" | "p") { let el_ref = ElementRef::wrap(prev)?; let text: String = el_ref.text().collect(); if !text.trim().is_empty() { return Some(text); } } None } _ => None, } } /// Check if a separator element marks the boundary between original /// content and a forwarded/replied message. /// /// This catches `
` or styled divs that act as visual separators /// before reply content (common in Outlook "From: ... Sent: ..." blocks). pub fn is_outlook_separator(el: ElementRef) -> bool { let element = el.value(); // Outlook uses a specific pattern: a div containing // "From: ... Sent: ... To: ... Subject: ..." as a reply header if element.name() == "div" || element.name() == "p" { let text: String = el.text().collect(); let t = text.trim(); // Must have at least From + Sent/Date or Subject let has_from = t.contains("From:"); let has_sent = t.contains("Sent:") || t.contains("Date:"); let has_subject = t.contains("Subject:"); if has_from && (has_sent || has_subject) { return true; } } false } #[cfg(test)] mod tests { use super::*; use scraper::{Html, Selector}; fn parse_and_select(html: &str, selector: &str) -> (Html, Selector) { let doc = Html::parse_document(html); let sel = Selector::parse(selector).unwrap(); (doc, sel) } // -- Attribution detection -- #[test] fn attribution_on_wrote() { assert!(is_attribution_text("On Mon, Jan 5, 2026 at 3:00 PM Alice wrote:")); } #[test] fn attribution_forwarded() { assert!(is_attribution_text("---------- Forwarded message ----------")); } #[test] fn attribution_original_message() { assert!(is_attribution_text("-----Original Message-----")); } #[test] fn attribution_begin_forwarded() { assert!(is_attribution_text("Begin forwarded message:")); } #[test] fn not_attribution() { assert!(!is_attribution_text("Hello, how are you?")); assert!(!is_attribution_text("On the other hand, this is fine.")); } // -- Reply class detection -- #[test] fn gmail_quote_class() { assert!(is_reply_class("gmail_quote")); } #[test] fn multiple_classes_with_reply() { assert!(is_reply_class("some-class gmail_quote another")); } #[test] fn non_reply_class() { assert!(!is_reply_class("regular-div content-wrapper")); } // -- Reply boundary detection -- #[test] fn type_cite_is_boundary() { let html = r#"

quoted

"#; let (doc, sel) = parse_and_select(html, r#"div[type="cite"]"#); let el = doc.select(&sel).next().unwrap(); assert!(is_reply_boundary(el)); } #[test] fn gmail_quote_is_boundary() { let html = r#"

quoted

"#; let (doc, sel) = parse_and_select(html, "div.gmail_quote"); let el = doc.select(&sel).next().unwrap(); assert!(is_reply_boundary(el)); } #[test] fn outlook_id_is_boundary() { let html = r#"

quoted

"#; let (doc, sel) = parse_and_select(html, "#divRplyFwdMsg"); let el = doc.select(&sel).next().unwrap(); assert!(is_reply_boundary(el)); } #[test] fn plain_div_not_boundary() { let html = r#"

not quoted

"#; let (doc, sel) = parse_and_select(html, "div.content"); let el = doc.select(&sel).next().unwrap(); assert!(!is_reply_boundary(el)); } // -- Outlook separator -- #[test] fn outlook_from_sent_subject() { let html = "
From: Alice\nSent: Monday\nTo: Bob\nSubject: Hello
"; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(is_outlook_separator(el)); } #[test] fn regular_div_not_separator() { let html = "
Just a normal paragraph.
"; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(!is_outlook_separator(el)); } // -- Boundary tests for `is_attribution_text`: each arm needs both sides -- #[test] fn attribution_on_without_wrote_is_false() { // "On ..." without "wrote:" — catches mutating && to || assert!(!is_attribution_text("On the bright side, this is fine.")); } #[test] fn attribution_wrote_without_on_is_false() { // "... wrote:" without leading "On " — catches mutating && to || assert!(!is_attribution_text("Alice wrote:")); } #[test] fn attribution_french_le_with_colon_space() { assert!(is_attribution_text("Le lundi 5 janvier 2026, Alice a écrit :")); } #[test] fn attribution_french_le_no_space_before_colon() { // "écrit:" without space — covers L89 || mutation between the two ending forms assert!(is_attribution_text("Le lundi, Alice a écrit:")); } #[test] fn attribution_spanish_el_with_colon_space() { assert!(is_attribution_text("El lunes 5 de enero, Alice a escrit :")); } #[test] fn attribution_spanish_el_no_space_before_colon() { assert!(is_attribution_text("El lunes, Alice a escrit:")); } #[test] fn attribution_french_le_without_wrote_ending_is_false() { // "Le X" without "écrit" — catches L89 mutating || to && assert!(!is_attribution_text("Le lundi, Alice est ici.")); } #[test] fn attribution_starts_with_le_but_not_french_pattern() { // Word starts with "Le" but isn't the French attribution form. assert!(!is_attribution_text("Le sigh.")); } #[test] fn attribution_german_am_with_colon() { assert!(is_attribution_text("Am Montag, 5. Januar 2026, schrieb:")); } #[test] fn attribution_german_am_with_space_colon() { assert!(is_attribution_text("Am Montag schrieb :")); } #[test] fn attribution_german_am_without_schrieb_is_false() { // "Am X" without "schrieb" — catches L93 && mutation assert!(!is_attribution_text("Am very fine, thanks.")); } #[test] fn attribution_german_schrieb_without_am_is_false() { // "schrieb:" without leading "Am " — catches L93 && mutation assert!(!is_attribution_text("Bob schrieb:")); } #[test] fn attribution_begin_forwarded_only() { // Only "Begin forwarded message" present — catches the || chain mutating to && assert!(is_attribution_text("Begin forwarded message")); } #[test] fn attribution_original_message_only() { // Only "Original Message" present — catches the || chain mutating to && assert!(is_attribution_text("-----Original Message-----")); } // -- Boundary tests for `is_reply_id` -- #[test] fn reply_id_reply_message() { assert!(is_reply_id("reply-message")); } #[test] fn reply_id_olk_src_body_section() { assert!(is_reply_id("OLK_SRC_BODY_SECTION")); } #[test] fn reply_id_unknown_is_false() { // Catches `replace is_reply_id -> bool with true` mutant assert!(!is_reply_id("main-content")); assert!(!is_reply_id("")); assert!(!is_reply_id("reply")); } // -- Boundary tests for `find_attribution` -- #[test] fn find_attribution_in_leading_text() { let html = r#"
On Mon, Alice wrote:
quoted
"#; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); let attr = find_attribution(el); assert!(attr.is_some()); assert!(attr.unwrap().contains("wrote:")); } #[test] fn find_attribution_none_when_no_match() { let html = r#"
Just regular text here, nothing fancy.
"#; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(find_attribution(el).is_none()); } #[test] fn find_attribution_stops_at_first_element_child() { // Element-then-text: the Text(_) arm should still match leading text BEFORE // hitting any element. With a leading element, the loop should `break` // out without inspecting later text. Catches "delete match arm Node::Element(_)". let html = r#"
hiOn Mon, Alice wrote:
"#; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); // Leading content is an element, not text — and the later text falls outside // the leading-text scan. So no attribution should be found from leading text. // Also, no preceding sibling. → None. assert!(find_attribution(el).is_none()); } #[test] fn find_attribution_in_preceding_sibling() { let html = r#"

On Mon, Alice wrote:

body
"#; let (doc, sel) = parse_and_select(html, "div.quote"); let el = doc.select(&sel).next().unwrap(); let attr = find_attribution(el); assert!(attr.is_some(), "expected attribution from preceding

"); } // -- Boundary tests for `has_attribution_then_quote` -- // These exercise the function via `is_reply_boundary` since it's private. #[test] fn boundary_div_with_attribution_then_blockquote() { let html = r#"

On Mon, Alice wrote:
quoted
"#; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(is_reply_boundary(el)); } #[test] fn boundary_div_blockquote_without_attribution_is_false() { // A bare blockquote-wrapping div without attribution text is not a boundary. // Catches "replace has_attribution_then_quote -> bool with false" (would // make this still pass, but the positive case above would fail). let html = r#"
quoted
"#; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(!is_reply_boundary(el)); } #[test] fn boundary_div_attribution_no_blockquote_is_false() { // Attribution text but no blockquote → not a boundary. // Catches the L151 == mutation (would treat any element as blockquote). let html = r#"
On Mon, Alice wrote:

not a quote

"#; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(!is_reply_boundary(el)); } #[test] fn boundary_div_attribution_br_blockquote() { // Attribution →
→ blockquote. The
must be skipped. // Catches the L155 != mutation in br-handling. let html = r#"
On Mon, Alice wrote:
quoted
"#; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(is_reply_boundary(el)); } #[test] fn boundary_div_non_br_element_before_attribution_is_false() { // Non-br element BEFORE finding attribution → early return false. // Catches the L157 `!` deletion. let html = r#"

preface

On Mon, Alice wrote:
q
"#; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(!is_reply_boundary(el)); } // -- Boundary tests for `previous_sibling_text` -- // Exercised via find_attribution since the function is private. #[test] fn prev_sibling_text_node() { // Raw Text node as preceding sibling. Inside a parent
, a leading // text run followed by a child
means the inner div's // `prev_sibling()` is a `Node::Text`. Catches `delete match arm Node::Text(text)`. let html = r#"
On Mon, Alice wrote:
body
"#; let (doc, sel) = parse_and_select(html, "div.q"); let el = doc.select(&sel).next().unwrap(); assert!(find_attribution(el).is_some()); } #[test] fn prev_sibling_inline_span_with_attribution() { let html = r#"
On Mon, Alice wrote:
body
"#; let (doc, sel) = parse_and_select(html, "div.q"); let el = doc.select(&sel).next().unwrap(); assert!(find_attribution(el).is_some()); } #[test] fn prev_sibling_inline_font_with_attribution() { // is also inline-treated; covers a different arm in the matches!. let html = r#"
On Mon, Alice wrote:
body
"#; let (doc, sel) = parse_and_select(html, "div.q"); let el = doc.select(&sel).next().unwrap(); assert!(find_attribution(el).is_some()); } #[test] fn prev_sibling_non_inline_element_returns_none() { // is not in the inline whitelist → preceding-sibling lookup fails. let html = r#"
On Mon, Alice wrote:
body
"#; let (doc, sel) = parse_and_select(html, "div.q"); let el = doc.select(&sel).next().unwrap(); assert!(find_attribution(el).is_none()); } #[test] fn prev_sibling_empty_inline_returns_none() { let html = r#"
body
"#; let (doc, sel) = parse_and_select(html, "div.q"); let el = doc.select(&sel).next().unwrap(); // Whitespace-only preceding span → no attribution match. assert!(find_attribution(el).is_none()); } // -- Boundary tests for `is_outlook_separator` -- #[test] fn outlook_from_date_subject_is_separator() { // Date instead of Sent → covers L206 || (Sent || Date) mutation let html = "
From: Alice\nDate: Monday\nSubject: Hello
"; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(is_outlook_separator(el)); } #[test] fn outlook_from_sent_no_subject_is_separator() { // From + Sent, no Subject → catches L209 mutating || to && let html = "
From: Alice\nSent: Monday
"; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(is_outlook_separator(el)); } #[test] fn outlook_from_subject_no_sent_is_separator() { // From + Subject, no Sent/Date → catches L209 mutating || to && let html = "
From: Alice\nSubject: Hello
"; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(is_outlook_separator(el)); } #[test] fn outlook_from_only_is_not_separator() { // From alone (no Sent/Date/Subject) → must be false. // Catches L209 && mutation to ||. let html = "
From: Alice
"; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(!is_outlook_separator(el)); } #[test] fn outlook_sent_subject_no_from_is_not_separator() { // No From → must be false regardless of Sent/Subject presence. let html = "
Sent: Monday\nSubject: Hello
"; let (doc, sel) = parse_and_select(html, "div"); let el = doc.select(&sel).next().unwrap(); assert!(!is_outlook_separator(el)); } }