use scraper::node::Element; /// What kind of markdown wrapper an element produces. pub enum ElementAction { /// Skip this element and all its children entirely. Skip, /// Render children only, no wrapper (transparent element). Transparent, /// Block element with specific rendering. Block(BlockKind), /// Inline element with specific rendering. Inline(InlineKind), } pub enum BlockKind { Paragraph, Heading(u8), Blockquote, UnorderedList, OrderedList, ListItem, PreFormatted, HorizontalRule, Table, Div, } pub enum InlineKind { Bold, Italic, Strikethrough, Code, Link, Image, LineBreak, Superscript, Subscript, } /// Classify an HTML element into the action pter should take. pub fn classify(el: &Element) -> ElementAction { match el.name() { // Skip entirely "script" | "style" | "head" | "meta" | "link" | "title" | "noscript" => { ElementAction::Skip } // Block elements "p" => ElementAction::Block(BlockKind::Paragraph), "h1" => ElementAction::Block(BlockKind::Heading(1)), "h2" => ElementAction::Block(BlockKind::Heading(2)), "h3" => ElementAction::Block(BlockKind::Heading(3)), "h4" => ElementAction::Block(BlockKind::Heading(4)), "h5" => ElementAction::Block(BlockKind::Heading(5)), "h6" => ElementAction::Block(BlockKind::Heading(6)), "blockquote" => ElementAction::Block(BlockKind::Blockquote), "ul" | "menu" => ElementAction::Block(BlockKind::UnorderedList), "ol" => ElementAction::Block(BlockKind::OrderedList), "li" => ElementAction::Block(BlockKind::ListItem), "pre" => ElementAction::Block(BlockKind::PreFormatted), "hr" => ElementAction::Block(BlockKind::HorizontalRule), "table" => ElementAction::Block(BlockKind::Table), // Table sub-elements are handled by the Table block handler, not individually "thead" | "tbody" | "tfoot" | "tr" | "td" | "th" | "caption" | "colgroup" | "col" => { ElementAction::Transparent } "div" | "section" | "article" | "main" | "header" | "footer" | "nav" | "aside" | "figure" | "figcaption" | "details" | "summary" => { ElementAction::Block(BlockKind::Div) } // Inline elements "strong" | "b" => ElementAction::Inline(InlineKind::Bold), "em" | "i" => ElementAction::Inline(InlineKind::Italic), "del" | "s" | "strike" => ElementAction::Inline(InlineKind::Strikethrough), "code" | "tt" => ElementAction::Inline(InlineKind::Code), "a" => ElementAction::Inline(InlineKind::Link), "img" => ElementAction::Inline(InlineKind::Image), "br" => ElementAction::Inline(InlineKind::LineBreak), "sup" => ElementAction::Inline(InlineKind::Superscript), "sub" => ElementAction::Inline(InlineKind::Subscript), // Everything else: transparent (render children) _ => ElementAction::Transparent, } } /// Check if an element is a tracking pixel. /// Returns true if it should be skipped. pub fn is_tracking_pixel(el: &Element) -> bool { let width = el.attr("width"); let height = el.attr("height"); // 1x1 or 0x0 images if matches!(width, Some("1" | "0")) || matches!(height, Some("1" | "0")) { return true; } // No src attribute let Some(src) = el.attr("src") else { return true; }; // Empty or data:image/gif (common transparent pixel) if src.is_empty() { return true; } if src.starts_with("data:image/gif;base64,R0lGOD") { return true; } // Check inline style for tiny dimensions if let Some(style) = el.attr("style") { let style_lower = style.to_lowercase(); if style_lower.contains("width:1px") || style_lower.contains("width: 1px") || style_lower.contains("width:0") || style_lower.contains("height:1px") || style_lower.contains("height: 1px") || style_lower.contains("height:0") || style_lower.contains("display:none") || style_lower.contains("display: none") { return true; } } false } #[cfg(test)] mod tests { use super::*; use scraper::{Html, Selector}; fn classify_tag(tag: &str) -> ElementAction { let html = format!("<{tag}>"); let doc = Html::parse_fragment(&html); let sel = Selector::parse(tag).unwrap(); let el = doc.select(&sel).next().unwrap(); classify(el.value()) } fn img_is_pixel(attrs: &str) -> bool { let html = format!("
"); let doc = Html::parse_fragment(&html); let sel = Selector::parse("img").unwrap(); let el = doc.select(&sel).next().unwrap(); is_tracking_pixel(el.value()) } fn div_is_hidden(attrs: &str) -> bool { let html = format!("
"); let doc = Html::parse_fragment(&html); let sel = Selector::parse("div").unwrap(); let el = doc.select(&sel).next().unwrap(); is_hidden(el.value()) } // -- classify: heading levels (h4/h5/h6 arms) -- // Without these arms, the elements fall through to `_ => Transparent`, // which differs from `Block(Heading(n))`. Tests catch the deletion. #[test] fn classify_h1_is_heading_1() { assert!(matches!(classify_tag("h1"), ElementAction::Block(BlockKind::Heading(1)))); } #[test] fn classify_h4_is_heading_4() { assert!(matches!(classify_tag("h4"), ElementAction::Block(BlockKind::Heading(4)))); } #[test] fn classify_h5_is_heading_5() { assert!(matches!(classify_tag("h5"), ElementAction::Block(BlockKind::Heading(5)))); } #[test] fn classify_h6_is_heading_6() { assert!(matches!(classify_tag("h6"), ElementAction::Block(BlockKind::Heading(6)))); } #[test] fn classify_script_is_skip() { assert!(matches!(classify_tag("script"), ElementAction::Skip)); } #[test] fn classify_table_is_block_table() { assert!(matches!(classify_tag("table"), ElementAction::Block(BlockKind::Table))); } #[test] fn classify_strong_is_inline_bold() { assert!(matches!(classify_tag("strong"), ElementAction::Inline(InlineKind::Bold))); } // -- is_tracking_pixel: each || arm needs its own positive test -- #[test] fn pixel_width_1_only() { assert!(img_is_pixel(r#"src="x" width="1" height="100""#)); } #[test] fn pixel_height_1_only() { // Catches L95 mutating || to && (width OR height; not AND) assert!(img_is_pixel(r#"src="x" width="100" height="1""#)); } #[test] fn pixel_width_0_only() { assert!(img_is_pixel(r#"src="x" width="0" height="100""#)); } #[test] fn pixel_no_src_is_pixel() { assert!(img_is_pixel(r#"width="100" height="100""#)); } #[test] fn pixel_empty_src_is_pixel() { assert!(img_is_pixel(r#"src="" width="100" height="100""#)); } #[test] fn pixel_transparent_gif_data_uri_is_pixel() { assert!(img_is_pixel( r#"src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" width="100" height="100""# )); } // Each `||` arm in the style chain (L115–122) — each needs its own input // that triggers ONLY that arm. Catches `replace || with &&` mutants. #[test] fn pixel_style_width_1px() { assert!(img_is_pixel(r#"src="x" style="width:1px""#)); } #[test] fn pixel_style_width_space_1px() { assert!(img_is_pixel(r#"src="x" style="width: 1px""#)); } #[test] fn pixel_style_width_0() { assert!(img_is_pixel(r#"src="x" style="width:0""#)); } #[test] fn pixel_style_height_1px() { assert!(img_is_pixel(r#"src="x" style="height:1px""#)); } #[test] fn pixel_style_height_space_1px() { assert!(img_is_pixel(r#"src="x" style="height: 1px""#)); } #[test] fn pixel_style_height_0() { assert!(img_is_pixel(r#"src="x" style="height:0""#)); } #[test] fn pixel_style_display_none() { assert!(img_is_pixel(r#"src="x" style="display:none""#)); } #[test] fn pixel_style_display_space_none() { assert!(img_is_pixel(r#"src="x" style="display: none""#)); } #[test] fn pixel_normal_image_is_not_pixel() { assert!(!img_is_pixel( r#"src="https://example.com/cat.jpg" width="500" height="300""# )); } // -- is_hidden: each || arm with its own targeted test -- #[test] fn hidden_display_none() { assert!(div_is_hidden(r#"style="display:none""#)); } #[test] fn hidden_display_space_none() { assert!(div_is_hidden(r#"style="display: none""#)); } #[test] fn hidden_visibility_hidden() { assert!(div_is_hidden(r#"style="visibility:hidden""#)); } #[test] fn hidden_visibility_space_hidden() { assert!(div_is_hidden(r#"style="visibility: hidden""#)); } #[test] fn hidden_font_size_0() { assert!(div_is_hidden(r#"style="font-size:0""#)); } #[test] fn hidden_font_size_space_0() { assert!(div_is_hidden(r#"style="font-size: 0""#)); } #[test] fn hidden_line_height_0() { assert!(div_is_hidden(r#"style="line-height:0""#)); } #[test] fn hidden_line_height_space_0() { assert!(div_is_hidden(r#"style="line-height: 0""#)); } // The (height:0 && overflow:hidden) and (height: 0 && overflow: hidden) arms // need both halves present to fire. Tests cover each form, plus the negative // case where height:0 alone is NOT hidden (catches && → || mutation on L146/147). #[test] fn hidden_height_0_with_overflow_no_spaces() { assert!(div_is_hidden(r#"style="height:0;overflow:hidden""#)); } #[test] fn hidden_height_0_with_overflow_with_spaces() { assert!(div_is_hidden(r#"style="height: 0;overflow: hidden""#)); } #[test] fn hidden_height_0_alone_is_not_hidden() { // Catches the L146 && → || mutation: with ||, this would erroneously be hidden. assert!(!div_is_hidden(r#"style="height:0""#)); } #[test] fn hidden_height_space_0_alone_is_not_hidden() { // Same boundary check for the space variant — catches the && → || mutation // on the `(height: 0 && overflow: hidden)` arm specifically. assert!(!div_is_hidden(r#"style="height: 0""#)); } #[test] fn hidden_max_height_0() { assert!(div_is_hidden(r#"style="max-height:0""#)); } #[test] fn hidden_max_height_space_0() { assert!(div_is_hidden(r#"style="max-height: 0""#)); } #[test] fn hidden_no_signal_in_style() { assert!(!div_is_hidden(r#"style="color:red;font-weight:bold""#)); } #[test] fn hidden_no_style_attr_is_not_hidden() { assert!(!div_is_hidden("")); } } /// Check if an element is hidden via inline style. /// /// Catches display:none, visibility:hidden, and spacer tricks /// like font-size:0 or line-height:0 (commonly used in email templates). pub fn is_hidden(el: &Element) -> bool { if let Some(style) = el.attr("style") { let s = style.to_lowercase(); if s.contains("display:none") || s.contains("display: none") || s.contains("visibility:hidden") || s.contains("visibility: hidden") || s.contains("font-size:0") || s.contains("font-size: 0") || s.contains("line-height:0") || s.contains("line-height: 0") || (s.contains("height:0") && s.contains("overflow:hidden")) || (s.contains("height: 0") && s.contains("overflow: hidden")) || s.contains("max-height:0") || s.contains("max-height: 0") { return true; } } false }