Skip to main content

max / pter

11.9 KB · 400 lines History Blame Raw
1 use scraper::node::Element;
2
/// What kind of markdown wrapper an element produces.
///
/// This is the result of [`classify`]; it tells the renderer whether to drop
/// an element, pass straight through to its children, or wrap it as a block
/// or inline construct.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ElementAction {
    /// Skip this element and all its children entirely.
    Skip,
    /// Render children only, no wrapper (transparent element).
    Transparent,
    /// Block element with specific rendering.
    Block(BlockKind),
    /// Inline element with specific rendering.
    Inline(InlineKind),
}

/// Block-level constructs an element can map to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    /// `<p>`.
    Paragraph,
    /// `<h1>`..`<h6>`; the payload is the heading level (`classify` yields 1 through 6).
    Heading(u8),
    /// `<blockquote>`.
    Blockquote,
    /// `<ul>` and `<menu>`.
    UnorderedList,
    /// `<ol>`.
    OrderedList,
    /// `<li>`.
    ListItem,
    /// `<pre>`.
    PreFormatted,
    /// `<hr>`.
    HorizontalRule,
    /// `<table>`.
    Table,
    /// Generic containers such as `<div>`, `<section>`, `<article>` or `<figure>`.
    Div,
}

/// Inline constructs an element can map to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InlineKind {
    /// `<strong>` and `<b>`.
    Bold,
    /// `<em>` and `<i>`.
    Italic,
    /// `<del>`, `<s>` and `<strike>`.
    Strikethrough,
    /// `<code>` and `<tt>`.
    Code,
    /// `<a>`.
    Link,
    /// `<img>`.
    Image,
    /// `<br>`.
    LineBreak,
    /// `<sup>`.
    Superscript,
    /// `<sub>`.
    Subscript,
}
39
40 /// Classify an HTML element into the action pter should take.
41 pub fn classify(el: &Element) -> ElementAction {
42 match el.name() {
43 // Skip entirely
44 "script" | "style" | "head" | "meta" | "link" | "title" | "noscript" => {
45 ElementAction::Skip
46 }
47
48 // Block elements
49 "p" => ElementAction::Block(BlockKind::Paragraph),
50 "h1" => ElementAction::Block(BlockKind::Heading(1)),
51 "h2" => ElementAction::Block(BlockKind::Heading(2)),
52 "h3" => ElementAction::Block(BlockKind::Heading(3)),
53 "h4" => ElementAction::Block(BlockKind::Heading(4)),
54 "h5" => ElementAction::Block(BlockKind::Heading(5)),
55 "h6" => ElementAction::Block(BlockKind::Heading(6)),
56 "blockquote" => ElementAction::Block(BlockKind::Blockquote),
57 "ul" | "menu" => ElementAction::Block(BlockKind::UnorderedList),
58 "ol" => ElementAction::Block(BlockKind::OrderedList),
59 "li" => ElementAction::Block(BlockKind::ListItem),
60 "pre" => ElementAction::Block(BlockKind::PreFormatted),
61 "hr" => ElementAction::Block(BlockKind::HorizontalRule),
62 "table" => ElementAction::Block(BlockKind::Table),
63 // Table sub-elements are handled by the Table block handler, not individually
64 "thead" | "tbody" | "tfoot" | "tr" | "td" | "th" | "caption" | "colgroup" | "col" => {
65 ElementAction::Transparent
66 }
67 "div" | "section" | "article" | "main" | "header" | "footer" | "nav" | "aside"
68 | "figure" | "figcaption" | "details" | "summary" => {
69 ElementAction::Block(BlockKind::Div)
70 }
71
72 // Inline elements
73 "strong" | "b" => ElementAction::Inline(InlineKind::Bold),
74 "em" | "i" => ElementAction::Inline(InlineKind::Italic),
75 "del" | "s" | "strike" => ElementAction::Inline(InlineKind::Strikethrough),
76 "code" | "tt" => ElementAction::Inline(InlineKind::Code),
77 "a" => ElementAction::Inline(InlineKind::Link),
78 "img" => ElementAction::Inline(InlineKind::Image),
79 "br" => ElementAction::Inline(InlineKind::LineBreak),
80 "sup" => ElementAction::Inline(InlineKind::Superscript),
81 "sub" => ElementAction::Inline(InlineKind::Subscript),
82
83 // Everything else: transparent (render children)
84 _ => ElementAction::Transparent,
85 }
86 }
87
88 /// Check if an <img> element is a tracking pixel.
89 /// Returns true if it should be skipped.
90 pub fn is_tracking_pixel(el: &Element) -> bool {
91 let width = el.attr("width");
92 let height = el.attr("height");
93
94 // 1x1 or 0x0 images
95 if matches!(width, Some("1" | "0")) || matches!(height, Some("1" | "0")) {
96 return true;
97 }
98
99 // No src attribute
100 let Some(src) = el.attr("src") else {
101 return true;
102 };
103
104 // Empty or data:image/gif (common transparent pixel)
105 if src.is_empty() {
106 return true;
107 }
108 if src.starts_with("data:image/gif;base64,R0lGOD") {
109 return true;
110 }
111
112 // Check inline style for tiny dimensions
113 if let Some(style) = el.attr("style") {
114 let style_lower = style.to_lowercase();
115 if style_lower.contains("width:1px")
116 || style_lower.contains("width: 1px")
117 || style_lower.contains("width:0")
118 || style_lower.contains("height:1px")
119 || style_lower.contains("height: 1px")
120 || style_lower.contains("height:0")
121 || style_lower.contains("display:none")
122 || style_lower.contains("display: none")
123 {
124 return true;
125 }
126 }
127
128 false
129 }
130
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parse `html` as a fragment and run `check` on the first element
    /// matching `selector`.
    fn with_first_match<T>(html: &str, selector: &str, check: impl FnOnce(&Element) -> T) -> T {
        let doc = Html::parse_fragment(html);
        let sel = Selector::parse(selector).expect("test selector should parse");
        let el = doc
            .select(&sel)
            .next()
            .expect("fragment should contain the selected element");
        check(el.value())
    }

    /// Classify an empty `<tag></tag>` element.
    fn classify_tag(tag: &str) -> ElementAction {
        let html = format!("<{tag}></{tag}>");
        with_first_match(&html, tag, classify)
    }

    /// Run `is_tracking_pixel` on an `<img>` carrying the given raw attributes.
    fn img_is_pixel(attrs: &str) -> bool {
        let html = format!("<div><img {attrs} ></div>");
        with_first_match(&html, "img", is_tracking_pixel)
    }

    /// Run `is_hidden` on a `<div>` carrying the given raw attributes.
    fn div_is_hidden(attrs: &str) -> bool {
        let html = format!("<div {attrs}></div>");
        with_first_match(&html, "div", is_hidden)
    }

    // -- classify: heading levels --
    // If a heading level were not recognised, the element would fall through
    // to `Transparent`, which differs from `Block(Heading(n))`. These tests
    // catch the deletion of any covered level.

    #[test]
    fn classify_h1_is_heading_1() {
        assert!(matches!(classify_tag("h1"), ElementAction::Block(BlockKind::Heading(1))));
    }

    #[test]
    fn classify_h4_is_heading_4() {
        assert!(matches!(classify_tag("h4"), ElementAction::Block(BlockKind::Heading(4))));
    }

    #[test]
    fn classify_h5_is_heading_5() {
        assert!(matches!(classify_tag("h5"), ElementAction::Block(BlockKind::Heading(5))));
    }

    #[test]
    fn classify_h6_is_heading_6() {
        assert!(matches!(classify_tag("h6"), ElementAction::Block(BlockKind::Heading(6))));
    }

    #[test]
    fn classify_script_is_skip() {
        assert!(matches!(classify_tag("script"), ElementAction::Skip));
    }

    #[test]
    fn classify_table_is_block_table() {
        assert!(matches!(classify_tag("table"), ElementAction::Block(BlockKind::Table)));
    }

    #[test]
    fn classify_strong_is_inline_bold() {
        assert!(matches!(classify_tag("strong"), ElementAction::Inline(InlineKind::Bold)));
    }

    // -- is_tracking_pixel: every signal needs its own positive test --

    #[test]
    fn pixel_width_1_only() {
        assert!(img_is_pixel(r#"src="x" width="1" height="100""#));
    }

    #[test]
    fn pixel_height_1_only() {
        // Catches the width/height attribute check being mutated from "either"
        // to "both": one degenerate dimension alone must be enough.
        assert!(img_is_pixel(r#"src="x" width="100" height="1""#));
    }

    #[test]
    fn pixel_width_0_only() {
        assert!(img_is_pixel(r#"src="x" width="0" height="100""#));
    }

    #[test]
    fn pixel_no_src_is_pixel() {
        assert!(img_is_pixel(r#"width="100" height="100""#));
    }

    #[test]
    fn pixel_empty_src_is_pixel() {
        assert!(img_is_pixel(r#"src="" width="100" height="100""#));
    }

    #[test]
    fn pixel_transparent_gif_data_uri_is_pixel() {
        assert!(img_is_pixel(
            r#"src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" width="100" height="100""#
        ));
    }

    // Each inline-style signal in `is_tracking_pixel` gets an input that
    // triggers ONLY that signal, so turning any "or" into an "and" is caught.

    #[test]
    fn pixel_style_width_1px() {
        assert!(img_is_pixel(r#"src="x" style="width:1px""#));
    }

    #[test]
    fn pixel_style_width_space_1px() {
        assert!(img_is_pixel(r#"src="x" style="width: 1px""#));
    }

    #[test]
    fn pixel_style_width_0() {
        assert!(img_is_pixel(r#"src="x" style="width:0""#));
    }

    #[test]
    fn pixel_style_height_1px() {
        assert!(img_is_pixel(r#"src="x" style="height:1px""#));
    }

    #[test]
    fn pixel_style_height_space_1px() {
        assert!(img_is_pixel(r#"src="x" style="height: 1px""#));
    }

    #[test]
    fn pixel_style_height_0() {
        assert!(img_is_pixel(r#"src="x" style="height:0""#));
    }

    #[test]
    fn pixel_style_display_none() {
        assert!(img_is_pixel(r#"src="x" style="display:none""#));
    }

    #[test]
    fn pixel_style_display_space_none() {
        assert!(img_is_pixel(r#"src="x" style="display: none""#));
    }

    #[test]
    fn pixel_normal_image_is_not_pixel() {
        assert!(!img_is_pixel(
            r#"src="https://example.com/cat.jpg" width="500" height="300""#
        ));
    }

    // -- is_hidden: every signal with its own targeted test --

    #[test]
    fn hidden_display_none() {
        assert!(div_is_hidden(r#"style="display:none""#));
    }

    #[test]
    fn hidden_display_space_none() {
        assert!(div_is_hidden(r#"style="display: none""#));
    }

    #[test]
    fn hidden_visibility_hidden() {
        assert!(div_is_hidden(r#"style="visibility:hidden""#));
    }

    #[test]
    fn hidden_visibility_space_hidden() {
        assert!(div_is_hidden(r#"style="visibility: hidden""#));
    }

    #[test]
    fn hidden_font_size_0() {
        assert!(div_is_hidden(r#"style="font-size:0""#));
    }

    #[test]
    fn hidden_font_size_space_0() {
        assert!(div_is_hidden(r#"style="font-size: 0""#));
    }

    #[test]
    fn hidden_line_height_0() {
        assert!(div_is_hidden(r#"style="line-height:0""#));
    }

    #[test]
    fn hidden_line_height_space_0() {
        assert!(div_is_hidden(r#"style="line-height: 0""#));
    }

    // A zero `height` only hides content together with `overflow: hidden`.
    // The tests cover both spellings of the pair, plus the negative case where
    // a zero height alone is NOT hidden (catches the conjunction in
    // `is_hidden` being weakened to a disjunction).

    #[test]
    fn hidden_height_0_with_overflow_no_spaces() {
        assert!(div_is_hidden(r#"style="height:0;overflow:hidden""#));
    }

    #[test]
    fn hidden_height_0_with_overflow_with_spaces() {
        assert!(div_is_hidden(r#"style="height: 0;overflow: hidden""#));
    }

    #[test]
    fn hidden_height_0_alone_is_not_hidden() {
        // If the height/overflow pair were an "or", this would erroneously be hidden.
        assert!(!div_is_hidden(r#"style="height:0""#));
    }

    #[test]
    fn hidden_height_space_0_alone_is_not_hidden() {
        // Same boundary check for the spelling with a space after the colon.
        assert!(!div_is_hidden(r#"style="height: 0""#));
    }

    #[test]
    fn hidden_max_height_0() {
        assert!(div_is_hidden(r#"style="max-height:0""#));
    }

    #[test]
    fn hidden_max_height_space_0() {
        assert!(div_is_hidden(r#"style="max-height: 0""#));
    }

    #[test]
    fn hidden_no_signal_in_style() {
        assert!(!div_is_hidden(r#"style="color:red;font-weight:bold""#));
    }

    #[test]
    fn hidden_no_style_attr_is_not_hidden() {
        assert!(!div_is_hidden(""));
    }
}
373
374 /// Check if an element is hidden via inline style.
375 ///
376 /// Catches display:none, visibility:hidden, and spacer tricks
377 /// like font-size:0 or line-height:0 (commonly used in email templates).
378 pub fn is_hidden(el: &Element) -> bool {
379 if let Some(style) = el.attr("style") {
380 let s = style.to_lowercase();
381 if s.contains("display:none")
382 || s.contains("display: none")
383 || s.contains("visibility:hidden")
384 || s.contains("visibility: hidden")
385 || s.contains("font-size:0")
386 || s.contains("font-size: 0")
387 || s.contains("line-height:0")
388 || s.contains("line-height: 0")
389 || (s.contains("height:0") && s.contains("overflow:hidden"))
390 || (s.contains("height: 0") && s.contains("overflow: hidden"))
391 || s.contains("max-height:0")
392 || s.contains("max-height: 0")
393 {
394 return true;
395 }
396 }
397 false
398 }
399
400