use scraper::node::Element;
/// What kind of markdown wrapper an element produces.
///
/// This is the result of `classify`; the payload enums say which specific
/// block or inline construct was recognised.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ElementAction {
    /// Skip this element and all its children entirely.
    Skip,
    /// Render children only, no wrapper (transparent element).
    Transparent,
    /// Block element with specific rendering.
    Block(BlockKind),
    /// Inline element with specific rendering.
    Inline(InlineKind),
}

/// Block-level constructs an element can be classified as.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    /// `<p>`.
    Paragraph,
    /// `<h1>`–`<h6>`; the payload is the heading level (1 through 6).
    Heading(u8),
    /// `<blockquote>`.
    Blockquote,
    /// `<ul>` or `<menu>`.
    UnorderedList,
    /// `<ol>`.
    OrderedList,
    /// `<li>`.
    ListItem,
    /// `<pre>`.
    PreFormatted,
    /// `<hr>`.
    HorizontalRule,
    /// `<table>`.
    Table,
    /// Generic container: `<div>` and the sectioning/grouping tags that
    /// `classify` treats the same way (`<section>`, `<article>`, `<figure>`, ...).
    Div,
}

/// Inline constructs an element can be classified as.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InlineKind {
    /// `<strong>` or `<b>`.
    Bold,
    /// `<em>` or `<i>`.
    Italic,
    /// `<del>`, `<s>` or `<strike>`.
    Strikethrough,
    /// `<code>` or `<tt>`.
    Code,
    /// `<a>`.
    Link,
    /// `<img>`.
    Image,
    /// `<br>`.
    LineBreak,
    /// `<sup>`.
    Superscript,
    /// `<sub>`.
    Subscript,
}
/// Classify an HTML element into the action pter should take.
pub fn classify(el: &Element) -> ElementAction {
match el.name() {
// Skip entirely
"script" | "style" | "head" | "meta" | "link" | "title" | "noscript" => {
ElementAction::Skip
}
// Block elements
"p" => ElementAction::Block(BlockKind::Paragraph),
"h1" => ElementAction::Block(BlockKind::Heading(1)),
"h2" => ElementAction::Block(BlockKind::Heading(2)),
"h3" => ElementAction::Block(BlockKind::Heading(3)),
"h4" => ElementAction::Block(BlockKind::Heading(4)),
"h5" => ElementAction::Block(BlockKind::Heading(5)),
"h6" => ElementAction::Block(BlockKind::Heading(6)),
"blockquote" => ElementAction::Block(BlockKind::Blockquote),
"ul" | "menu" => ElementAction::Block(BlockKind::UnorderedList),
"ol" => ElementAction::Block(BlockKind::OrderedList),
"li" => ElementAction::Block(BlockKind::ListItem),
"pre" => ElementAction::Block(BlockKind::PreFormatted),
"hr" => ElementAction::Block(BlockKind::HorizontalRule),
"table" => ElementAction::Block(BlockKind::Table),
// Table sub-elements are handled by the Table block handler, not individually
"thead" | "tbody" | "tfoot" | "tr" | "td" | "th" | "caption" | "colgroup" | "col" => {
ElementAction::Transparent
}
"div" | "section" | "article" | "main" | "header" | "footer" | "nav" | "aside"
| "figure" | "figcaption" | "details" | "summary" => {
ElementAction::Block(BlockKind::Div)
}
// Inline elements
"strong" | "b" => ElementAction::Inline(InlineKind::Bold),
"em" | "i" => ElementAction::Inline(InlineKind::Italic),
"del" | "s" | "strike" => ElementAction::Inline(InlineKind::Strikethrough),
"code" | "tt" => ElementAction::Inline(InlineKind::Code),
"a" => ElementAction::Inline(InlineKind::Link),
"img" => ElementAction::Inline(InlineKind::Image),
"br" => ElementAction::Inline(InlineKind::LineBreak),
"sup" => ElementAction::Inline(InlineKind::Superscript),
"sub" => ElementAction::Inline(InlineKind::Subscript),
// Everything else: transparent (render children)
_ => ElementAction::Transparent,
}
}
/// Check if an
element is a tracking pixel.
/// Returns true if it should be skipped.
pub fn is_tracking_pixel(el: &Element) -> bool {
let width = el.attr("width");
let height = el.attr("height");
// 1x1 or 0x0 images
if matches!(width, Some("1" | "0")) || matches!(height, Some("1" | "0")) {
return true;
}
// No src attribute
let Some(src) = el.attr("src") else {
return true;
};
// Empty or data:image/gif (common transparent pixel)
if src.is_empty() {
return true;
}
if src.starts_with("data:image/gif;base64,R0lGOD") {
return true;
}
// Check inline style for tiny dimensions
if let Some(style) = el.attr("style") {
let style_lower = style.to_lowercase();
if style_lower.contains("width:1px")
|| style_lower.contains("width: 1px")
|| style_lower.contains("width:0")
|| style_lower.contains("height:1px")
|| style_lower.contains("height: 1px")
|| style_lower.contains("height:0")
|| style_lower.contains("display:none")
|| style_lower.contains("display: none")
{
return true;
}
}
false
}
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parse `<tag></tag>` as a fragment and classify the resulting element.
    fn classify_tag(tag: &str) -> ElementAction {
        let html = format!("<{tag}></{tag}>");
        let doc = Html::parse_fragment(&html);
        let sel = Selector::parse(tag).unwrap();
        let el = doc.select(&sel).next().unwrap();
        classify(el.value())
    }

    /// Build an `<img>` carrying the given raw attribute text and run
    /// `is_tracking_pixel` on it.
    fn img_is_pixel(attrs: &str) -> bool {
        let html = format!("<img {attrs}>");
        let doc = Html::parse_fragment(&html);
        let sel = Selector::parse("img").unwrap();
        let el = doc.select(&sel).next().unwrap();
        is_tracking_pixel(el.value())
    }

    /// Build a `<div>` carrying the given raw attribute text and run
    /// `is_hidden` on it.
    fn div_is_hidden(attrs: &str) -> bool {
        let html = format!("<div {attrs}></div>");
        let doc = Html::parse_fragment(&html);
        let sel = Selector::parse("div").unwrap();
        let el = doc.select(&sel).next().unwrap();
        is_hidden(el.value())
    }

    // -- classify: heading levels (h4/h5/h6 arms) --
    // Without these arms, the elements fall through to `_ => Transparent`,
    // which differs from `Block(Heading(n))`. Tests catch the deletion.

    #[test]
    fn classify_h1_is_heading_1() {
        assert!(matches!(classify_tag("h1"), ElementAction::Block(BlockKind::Heading(1))));
    }

    #[test]
    fn classify_h4_is_heading_4() {
        assert!(matches!(classify_tag("h4"), ElementAction::Block(BlockKind::Heading(4))));
    }

    #[test]
    fn classify_h5_is_heading_5() {
        assert!(matches!(classify_tag("h5"), ElementAction::Block(BlockKind::Heading(5))));
    }

    #[test]
    fn classify_h6_is_heading_6() {
        assert!(matches!(classify_tag("h6"), ElementAction::Block(BlockKind::Heading(6))));
    }

    #[test]
    fn classify_script_is_skip() {
        assert!(matches!(classify_tag("script"), ElementAction::Skip));
    }

    #[test]
    fn classify_table_is_block_table() {
        assert!(matches!(classify_tag("table"), ElementAction::Block(BlockKind::Table)));
    }

    #[test]
    fn classify_strong_is_inline_bold() {
        assert!(matches!(classify_tag("strong"), ElementAction::Inline(InlineKind::Bold)));
    }

    // -- is_tracking_pixel: each || arm needs its own positive test --

    #[test]
    fn pixel_width_1_only() {
        assert!(img_is_pixel(r#"src="x" width="1" height="100""#));
    }

    #[test]
    fn pixel_height_1_only() {
        // Catches the width/height attribute check mutating || to &&
        // (width OR height; not AND).
        assert!(img_is_pixel(r#"src="x" width="100" height="1""#));
    }

    #[test]
    fn pixel_width_0_only() {
        assert!(img_is_pixel(r#"src="x" width="0" height="100""#));
    }

    #[test]
    fn pixel_no_src_is_pixel() {
        assert!(img_is_pixel(r#"width="100" height="100""#));
    }

    #[test]
    fn pixel_empty_src_is_pixel() {
        assert!(img_is_pixel(r#"src="" width="100" height="100""#));
    }

    #[test]
    fn pixel_transparent_gif_data_uri_is_pixel() {
        assert!(img_is_pixel(
            r#"src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" width="100" height="100""#
        ));
    }

    // Each alternative in the inline-style check of `is_tracking_pixel` needs
    // its own input that triggers ONLY that alternative. Catches
    // `replace || with &&` mutants.

    #[test]
    fn pixel_style_width_1px() {
        assert!(img_is_pixel(r#"src="x" style="width:1px""#));
    }

    #[test]
    fn pixel_style_width_space_1px() {
        assert!(img_is_pixel(r#"src="x" style="width: 1px""#));
    }

    #[test]
    fn pixel_style_width_0() {
        assert!(img_is_pixel(r#"src="x" style="width:0""#));
    }

    #[test]
    fn pixel_style_height_1px() {
        assert!(img_is_pixel(r#"src="x" style="height:1px""#));
    }

    #[test]
    fn pixel_style_height_space_1px() {
        assert!(img_is_pixel(r#"src="x" style="height: 1px""#));
    }

    #[test]
    fn pixel_style_height_0() {
        assert!(img_is_pixel(r#"src="x" style="height:0""#));
    }

    #[test]
    fn pixel_style_display_none() {
        assert!(img_is_pixel(r#"src="x" style="display:none""#));
    }

    #[test]
    fn pixel_style_display_space_none() {
        assert!(img_is_pixel(r#"src="x" style="display: none""#));
    }

    #[test]
    fn pixel_normal_image_is_not_pixel() {
        assert!(!img_is_pixel(
            r#"src="https://example.com/cat.jpg" width="500" height="300""#
        ));
    }

    // -- is_hidden: each || arm with its own targeted test --

    #[test]
    fn hidden_display_none() {
        assert!(div_is_hidden(r#"style="display:none""#));
    }

    #[test]
    fn hidden_display_space_none() {
        assert!(div_is_hidden(r#"style="display: none""#));
    }

    #[test]
    fn hidden_visibility_hidden() {
        assert!(div_is_hidden(r#"style="visibility:hidden""#));
    }

    #[test]
    fn hidden_visibility_space_hidden() {
        assert!(div_is_hidden(r#"style="visibility: hidden""#));
    }

    #[test]
    fn hidden_font_size_0() {
        assert!(div_is_hidden(r#"style="font-size:0""#));
    }

    #[test]
    fn hidden_font_size_space_0() {
        assert!(div_is_hidden(r#"style="font-size: 0""#));
    }

    #[test]
    fn hidden_line_height_0() {
        assert!(div_is_hidden(r#"style="line-height:0""#));
    }

    #[test]
    fn hidden_line_height_space_0() {
        assert!(div_is_hidden(r#"style="line-height: 0""#));
    }

    // The zero-height rule in `is_hidden` needs both halves present to fire:
    // a zero `height` AND `overflow: hidden`. Tests cover each spacing form,
    // plus the negative case where height:0 alone is NOT hidden (catches an
    // && → || mutation on that rule).

    #[test]
    fn hidden_height_0_with_overflow_no_spaces() {
        assert!(div_is_hidden(r#"style="height:0;overflow:hidden""#));
    }

    #[test]
    fn hidden_height_0_with_overflow_with_spaces() {
        assert!(div_is_hidden(r#"style="height: 0;overflow: hidden""#));
    }

    #[test]
    fn hidden_height_0_alone_is_not_hidden() {
        // Catches the && → || mutation on the zero-height rule: with ||, this
        // would erroneously be hidden.
        assert!(!div_is_hidden(r#"style="height:0""#));
    }

    #[test]
    fn hidden_height_space_0_alone_is_not_hidden() {
        // Same boundary check for the space variant — catches the && → ||
        // mutation for the `height: 0` / `overflow: hidden` spelling.
        assert!(!div_is_hidden(r#"style="height: 0""#));
    }

    #[test]
    fn hidden_max_height_0() {
        assert!(div_is_hidden(r#"style="max-height:0""#));
    }

    #[test]
    fn hidden_max_height_space_0() {
        assert!(div_is_hidden(r#"style="max-height: 0""#));
    }

    #[test]
    fn hidden_no_signal_in_style() {
        assert!(!div_is_hidden(r#"style="color:red;font-weight:bold""#));
    }

    #[test]
    fn hidden_no_style_attr_is_not_hidden() {
        assert!(!div_is_hidden(""));
    }
}
/// Check if an element is hidden via inline style.
///
/// Catches `display:none`, `visibility:hidden`, and spacer tricks like
/// `font-size:0` or `line-height:0` (commonly used in email templates).
/// An element is reported hidden when its `style` attribute contains any of:
///
/// - `display: none` or `visibility: hidden`;
/// - a zero `font-size`, `line-height` or `max-height`;
/// - a zero `height` together with `overflow: hidden` (a zero `height` on its
///   own is not enough).
///
/// The style is split into `property: value` declarations and matched on the
/// exact property name and a parsed zero value. This keeps visible content
/// such as `font-size:0.9em` or `min-height:0` from being dropped, and makes
/// the result independent of whitespace around `:`.
///
/// Returns `false` when the element has no `style` attribute.
pub fn is_hidden(el: &Element) -> bool {
    // True when `value` (already trimmed and lowercased) is a zero length:
    // a number equal to zero, optionally followed by a unit or `%`.
    fn is_zero_length(value: &str) -> bool {
        let split = value
            .find(|c: char| !(c.is_ascii_digit() || c == '.'))
            .unwrap_or(value.len());
        let (number, unit) = value.split_at(split);
        matches!(number.parse::<f32>(), Ok(n) if n == 0.0)
            && (unit == "%" || unit.chars().all(|c| c.is_ascii_alphabetic()))
    }

    let Some(style) = el.attr("style") else {
        return false;
    };

    // The height/overflow rule needs both declarations, which may appear in
    // any order, so remember each half while scanning.
    let mut zero_height = false;
    let mut overflow_hidden = false;

    for declaration in style.split(';') {
        let Some((property, value)) = declaration.split_once(':') else {
            continue;
        };
        let property = property.trim().to_ascii_lowercase();
        let lowered = value.trim().to_ascii_lowercase();
        // `!important` does not change what the declaration means here.
        let value = lowered
            .strip_suffix("!important")
            .map_or(lowered.as_str(), str::trim_end);

        match property.as_str() {
            "display" if value == "none" => return true,
            "visibility" if value == "hidden" => return true,
            "font-size" | "line-height" | "max-height" if is_zero_length(value) => return true,
            "height" => zero_height |= is_zero_length(value),
            "overflow" => overflow_hidden |= value == "hidden",
            _ => {}
        }
    }

    zero_height && overflow_hidden
}