Skip to main content

max / pter

Add tests to kill surviving mutants in convert, elements, replies, tables

Targets specific mutation operators that previously survived: list-depth arithmetic in convert, classify/is_tracking_pixel/is_hidden_element in elements, locale-specific attribution-line matchers and outlook separator boundaries in replies, role-attribute and header-detection arms in tables.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-14 19:21 UTC
Commit: f595794411a25ae04f2c8555519fd4ae64bcefae
Parent: 3ed6a17
4 files changed, +731 insertions, -0 deletions
@@ -526,6 +526,62 @@ mod tests {
526 526 }
527 527
528 528 #[test]
529 + fn nested_list_exact_indent_depth_2() {
530 + // At depth 2, `list_indent` returns `"  "` (exactly two spaces, one indent level).
531 + // Catches `list_indent` mutations:
532 + // - `(depth - 1)` → `(depth + 1)`: would produce 3 indent levels (6 spaces).
533 + // - `(depth - 1)` → `(depth / 1)`: would produce 2 indent levels (4 spaces).
534 + // Either makes this exact-match assertion fail.
535 + // (The converter emits a blank line before each nested list — that's a
536 + // separate stylistic question; the *indent* is what we're pinning down here.)
537 + assert_eq!(
538 + convert("<ul><li>A<ul><li>B</li></ul></li></ul>"),
539 + "- A\n\n  - B"
540 + );
541 + }
542 +
543 + #[test]
544 + fn triple_nested_list_exact_indent_depth_3() {
545 + // At depth 3, indent is exactly `"    "` (four spaces).
546 + assert_eq!(
547 + convert("<ul><li>A<ul><li>B<ul><li>C</li></ul></li></ul></li></ul>"),
548 + "- A\n\n  - B\n\n    - C"
549 + );
550 + }
551 +
552 + #[test]
553 + fn sibling_top_level_lists_have_no_indent_after_nesting() {
554 + // After a nested <ul> closes, `list_depth -= 1` must execute to return
555 + // to outer scope. If mutated to `+= 1` or `/= 1`, list_depth stays
556 + // elevated and the SECOND top-level list ends up incorrectly indented.
557 + let md = convert(
558 + "<ul><li>A<ul><li>B</li></ul></li></ul><ul><li>C</li></ul>",
559 + );
560 + // The second list's "C" item must appear at column 0, not indented.
561 + // We check the exact substring "\n- C" (newline then no leading whitespace).
562 + assert!(
563 + md.contains("\n- C"),
564 + "second top-level list must not be indented after a nested list closes; got: {md:?}"
565 + );
566 + // And explicitly: it must NOT appear with leading spaces.
567 + assert!(
568 + !md.contains("\n  - C"),
569 + "second list incorrectly indented; got: {md:?}"
570 + );
571 + }
572 +
573 + #[test]
574 + fn ordered_list_decrements_depth_after_nesting() {
575 + // Same shape but with <ol> — exercises the L218 `-= 1` mutation in the
576 + // OrderedList block, distinct from UnorderedList's L208.
577 + let md = convert(
578 + "<ol><li>A<ol><li>B</li></ol></li></ol><ol><li>C</li></ol>",
579 + );
580 + assert!(md.contains("\n1. C"), "second ol must restart at depth 1: {md:?}");
581 + assert!(!md.contains("\n  1. C"), "second ol indented incorrectly: {md:?}");
582 + }
583 +
584 + #[test]
529 585 fn blockquote() {
530 586 assert_eq!(convert("<blockquote>quoted text</blockquote>"), "> quoted text");
531 587 }
@@ -128,6 +128,249 @@ pub fn is_tracking_pixel(el: &Element) -> bool {
128 128 false
129 129 }
130 130
131 + #[cfg(test)]
132 + mod tests {
133 + use super::*;
134 + use scraper::{Html, Selector};
135 +
136 + fn classify_tag(tag: &str) -> ElementAction {
137 + let html = format!("<{tag}></{tag}>");
138 + let doc = Html::parse_fragment(&html);
139 + let sel = Selector::parse(tag).unwrap();
140 + let el = doc.select(&sel).next().unwrap();
141 + classify(el.value())
142 + }
143 +
144 + fn img_is_pixel(attrs: &str) -> bool {
145 + let html = format!("<div><img {attrs} ></div>");
146 + let doc = Html::parse_fragment(&html);
147 + let sel = Selector::parse("img").unwrap();
148 + let el = doc.select(&sel).next().unwrap();
149 + is_tracking_pixel(el.value())
150 + }
151 +
152 + fn div_is_hidden(attrs: &str) -> bool {
153 + let html = format!("<div {attrs}></div>");
154 + let doc = Html::parse_fragment(&html);
155 + let sel = Selector::parse("div").unwrap();
156 + let el = doc.select(&sel).next().unwrap();
157 + is_hidden(el.value())
158 + }
159 +
160 + // -- classify: heading levels (h4/h5/h6 arms) --
161 + // Without these arms, the elements fall through to `_ => Transparent`,
162 + // which differs from `Block(Heading(n))`. Tests catch the deletion.
163 +
164 + #[test]
165 + fn classify_h1_is_heading_1() {
166 + assert!(matches!(classify_tag("h1"), ElementAction::Block(BlockKind::Heading(1))));
167 + }
168 +
169 + #[test]
170 + fn classify_h4_is_heading_4() {
171 + assert!(matches!(classify_tag("h4"), ElementAction::Block(BlockKind::Heading(4))));
172 + }
173 +
174 + #[test]
175 + fn classify_h5_is_heading_5() {
176 + assert!(matches!(classify_tag("h5"), ElementAction::Block(BlockKind::Heading(5))));
177 + }
178 +
179 + #[test]
180 + fn classify_h6_is_heading_6() {
181 + assert!(matches!(classify_tag("h6"), ElementAction::Block(BlockKind::Heading(6))));
182 + }
183 +
184 + #[test]
185 + fn classify_script_is_skip() {
186 + assert!(matches!(classify_tag("script"), ElementAction::Skip));
187 + }
188 +
189 + #[test]
190 + fn classify_table_is_block_table() {
191 + assert!(matches!(classify_tag("table"), ElementAction::Block(BlockKind::Table)));
192 + }
193 +
194 + #[test]
195 + fn classify_strong_is_inline_bold() {
196 + assert!(matches!(classify_tag("strong"), ElementAction::Inline(InlineKind::Bold)));
197 + }
198 +
199 + // -- is_tracking_pixel: each || arm needs its own positive test --
200 +
201 + #[test]
202 + fn pixel_width_1_only() {
203 + assert!(img_is_pixel(r#"src="x" width="1" height="100""#));
204 + }
205 +
206 + #[test]
207 + fn pixel_height_1_only() {
208 + // Catches L95 mutating || to && (width OR height; not AND)
209 + assert!(img_is_pixel(r#"src="x" width="100" height="1""#));
210 + }
211 +
212 + #[test]
213 + fn pixel_width_0_only() {
214 + assert!(img_is_pixel(r#"src="x" width="0" height="100""#));
215 + }
216 +
217 + #[test]
218 + fn pixel_no_src_is_pixel() {
219 + assert!(img_is_pixel(r#"width="100" height="100""#));
220 + }
221 +
222 + #[test]
223 + fn pixel_empty_src_is_pixel() {
224 + assert!(img_is_pixel(r#"src="" width="100" height="100""#));
225 + }
226 +
227 + #[test]
228 + fn pixel_transparent_gif_data_uri_is_pixel() {
229 + assert!(img_is_pixel(
230 + r#"src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" width="100" height="100""#
231 + ));
232 + }
233 +
234 + // Each `||` arm in the style chain (L115–122) — each needs its own input
235 + // that triggers ONLY that arm. Catches `replace || with &&` mutants.
236 +
237 + #[test]
238 + fn pixel_style_width_1px() {
239 + assert!(img_is_pixel(r#"src="x" style="width:1px""#));
240 + }
241 +
242 + #[test]
243 + fn pixel_style_width_space_1px() {
244 + assert!(img_is_pixel(r#"src="x" style="width: 1px""#));
245 + }
246 +
247 + #[test]
248 + fn pixel_style_width_0() {
249 + assert!(img_is_pixel(r#"src="x" style="width:0""#));
250 + }
251 +
252 + #[test]
253 + fn pixel_style_height_1px() {
254 + assert!(img_is_pixel(r#"src="x" style="height:1px""#));
255 + }
256 +
257 + #[test]
258 + fn pixel_style_height_space_1px() {
259 + assert!(img_is_pixel(r#"src="x" style="height: 1px""#));
260 + }
261 +
262 + #[test]
263 + fn pixel_style_height_0() {
264 + assert!(img_is_pixel(r#"src="x" style="height:0""#));
265 + }
266 +
267 + #[test]
268 + fn pixel_style_display_none() {
269 + assert!(img_is_pixel(r#"src="x" style="display:none""#));
270 + }
271 +
272 + #[test]
273 + fn pixel_style_display_space_none() {
274 + assert!(img_is_pixel(r#"src="x" style="display: none""#));
275 + }
276 +
277 + #[test]
278 + fn pixel_normal_image_is_not_pixel() {
279 + assert!(!img_is_pixel(
280 + r#"src="https://example.com/cat.jpg" width="500" height="300""#
281 + ));
282 + }
283 +
284 + // -- is_hidden: each || arm with its own targeted test --
285 +
286 + #[test]
287 + fn hidden_display_none() {
288 + assert!(div_is_hidden(r#"style="display:none""#));
289 + }
290 +
291 + #[test]
292 + fn hidden_display_space_none() {
293 + assert!(div_is_hidden(r#"style="display: none""#));
294 + }
295 +
296 + #[test]
297 + fn hidden_visibility_hidden() {
298 + assert!(div_is_hidden(r#"style="visibility:hidden""#));
299 + }
300 +
301 + #[test]
302 + fn hidden_visibility_space_hidden() {
303 + assert!(div_is_hidden(r#"style="visibility: hidden""#));
304 + }
305 +
306 + #[test]
307 + fn hidden_font_size_0() {
308 + assert!(div_is_hidden(r#"style="font-size:0""#));
309 + }
310 +
311 + #[test]
312 + fn hidden_font_size_space_0() {
313 + assert!(div_is_hidden(r#"style="font-size: 0""#));
314 + }
315 +
316 + #[test]
317 + fn hidden_line_height_0() {
318 + assert!(div_is_hidden(r#"style="line-height:0""#));
319 + }
320 +
321 + #[test]
322 + fn hidden_line_height_space_0() {
323 + assert!(div_is_hidden(r#"style="line-height: 0""#));
324 + }
325 +
326 + // The (height:0 && overflow:hidden) and (height: 0 && overflow: hidden) arms
327 + // need both halves present to fire. Tests cover each form, plus the negative
328 + // case where height:0 alone is NOT hidden (catches && → || mutation on L146/147).
329 +
330 + #[test]
331 + fn hidden_height_0_with_overflow_no_spaces() {
332 + assert!(div_is_hidden(r#"style="height:0;overflow:hidden""#));
333 + }
334 +
335 + #[test]
336 + fn hidden_height_0_with_overflow_with_spaces() {
337 + assert!(div_is_hidden(r#"style="height: 0;overflow: hidden""#));
338 + }
339 +
340 + #[test]
341 + fn hidden_height_0_alone_is_not_hidden() {
342 + // Catches the L146 && → || mutation: with ||, this would erroneously be hidden.
343 + assert!(!div_is_hidden(r#"style="height:0""#));
344 + }
345 +
346 + #[test]
347 + fn hidden_height_space_0_alone_is_not_hidden() {
348 + // Same boundary check for the space variant — catches the && → || mutation
349 + // on the `(height: 0 && overflow: hidden)` arm specifically.
350 + assert!(!div_is_hidden(r#"style="height: 0""#));
351 + }
352 +
353 + #[test]
354 + fn hidden_max_height_0() {
355 + assert!(div_is_hidden(r#"style="max-height:0""#));
356 + }
357 +
358 + #[test]
359 + fn hidden_max_height_space_0() {
360 + assert!(div_is_hidden(r#"style="max-height: 0""#));
361 + }
362 +
363 + #[test]
364 + fn hidden_no_signal_in_style() {
365 + assert!(!div_is_hidden(r#"style="color:red;font-weight:bold""#));
366 + }
367 +
368 + #[test]
369 + fn hidden_no_style_attr_is_not_hidden() {
370 + assert!(!div_is_hidden(""));
371 + }
372 + }
373 +
131 374 /// Check if an element is hidden via inline style.
132 375 ///
133 376 /// Catches display:none, visibility:hidden, and spacer tricks
@@ -321,4 +321,297 @@ mod tests {
321 321 let el = doc.select(&sel).next().unwrap();
322 322 assert!(!is_outlook_separator(el));
323 323 }
324 +
325 + // -- Boundary tests for `is_attribution_text`: each arm needs both sides --
326 +
327 + #[test]
328 + fn attribution_on_without_wrote_is_false() {
329 + // "On ..." without "wrote:" — catches mutating && to ||
330 + assert!(!is_attribution_text("On the bright side, this is fine."));
331 + }
332 +
333 + #[test]
334 + fn attribution_wrote_without_on_is_false() {
335 + // "... wrote:" without leading "On " — catches mutating && to ||
336 + assert!(!is_attribution_text("Alice wrote:"));
337 + }
338 +
339 + #[test]
340 + fn attribution_french_le_with_colon_space() {
341 + assert!(is_attribution_text("Le lundi 5 janvier 2026, Alice a écrit :"));
342 + }
343 +
344 + #[test]
345 + fn attribution_french_le_no_space_before_colon() {
346 + // "écrit:" without space — covers L89 || mutation between the two ending forms
347 + assert!(is_attribution_text("Le lundi, Alice a écrit:"));
348 + }
349 +
350 + #[test]
351 + fn attribution_spanish_el_with_colon_space() {
352 + assert!(is_attribution_text("El lunes 5 de enero, Alice a escrit :"));
353 + }
354 +
355 + #[test]
356 + fn attribution_spanish_el_no_space_before_colon() {
357 + assert!(is_attribution_text("El lunes, Alice a escrit:"));
358 + }
359 +
360 + #[test]
361 + fn attribution_french_le_without_wrote_ending_is_false() {
362 + // "Le X" without "écrit" — catches L89 mutating || to &&
363 + assert!(!is_attribution_text("Le lundi, Alice est ici."));
364 + }
365 +
366 + #[test]
367 + fn attribution_starts_with_le_but_not_french_pattern() {
368 + // Word starts with "Le" but isn't the French attribution form.
369 + assert!(!is_attribution_text("Le sigh."));
370 + }
371 +
372 + #[test]
373 + fn attribution_german_am_with_colon() {
374 + assert!(is_attribution_text("Am Montag, 5. Januar 2026, schrieb:"));
375 + }
376 +
377 + #[test]
378 + fn attribution_german_am_with_space_colon() {
379 + assert!(is_attribution_text("Am Montag schrieb :"));
380 + }
381 +
382 + #[test]
383 + fn attribution_german_am_without_schrieb_is_false() {
384 + // "Am X" without "schrieb" — catches L93 && mutation
385 + assert!(!is_attribution_text("Am very fine, thanks."));
386 + }
387 +
388 + #[test]
389 + fn attribution_german_schrieb_without_am_is_false() {
390 + // "schrieb:" without leading "Am " — catches L93 && mutation
391 + assert!(!is_attribution_text("Bob schrieb:"));
392 + }
393 +
394 + #[test]
395 + fn attribution_begin_forwarded_only() {
396 + // Only "Begin forwarded message" present — catches the || chain mutating to &&
397 + assert!(is_attribution_text("Begin forwarded message"));
398 + }
399 +
400 + #[test]
401 + fn attribution_original_message_only() {
402 + // Only "Original Message" present — catches the || chain mutating to &&
403 + assert!(is_attribution_text("-----Original Message-----"));
404 + }
405 +
406 + // -- Boundary tests for `is_reply_id` --
407 +
408 + #[test]
409 + fn reply_id_reply_message() {
410 + assert!(is_reply_id("reply-message"));
411 + }
412 +
413 + #[test]
414 + fn reply_id_olk_src_body_section() {
415 + assert!(is_reply_id("OLK_SRC_BODY_SECTION"));
416 + }
417 +
418 + #[test]
419 + fn reply_id_unknown_is_false() {
420 + // Catches `replace is_reply_id -> bool with true` mutant
421 + assert!(!is_reply_id("main-content"));
422 + assert!(!is_reply_id(""));
423 + assert!(!is_reply_id("reply"));
424 + }
425 +
426 + // -- Boundary tests for `find_attribution` --
427 +
428 + #[test]
429 + fn find_attribution_in_leading_text() {
430 + let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
431 + let (doc, sel) = parse_and_select(html, "div");
432 + let el = doc.select(&sel).next().unwrap();
433 + let attr = find_attribution(el);
434 + assert!(attr.is_some());
435 + assert!(attr.unwrap().contains("wrote:"));
436 + }
437 +
438 + #[test]
439 + fn find_attribution_none_when_no_match() {
440 + let html = r#"<div>Just regular text here, nothing fancy.</div>"#;
441 + let (doc, sel) = parse_and_select(html, "div");
442 + let el = doc.select(&sel).next().unwrap();
443 + assert!(find_attribution(el).is_none());
444 + }
445 +
446 + #[test]
447 + fn find_attribution_stops_at_first_element_child() {
448 + // Element-then-text: the Text(_) arm should still match leading text BEFORE
449 + // hitting any element. With a leading element, the loop should `break`
450 + // out without inspecting later text. Catches "delete match arm Node::Element(_)".
451 + let html = r#"<div><span>hi</span>On Mon, Alice wrote:</div>"#;
452 + let (doc, sel) = parse_and_select(html, "div");
453 + let el = doc.select(&sel).next().unwrap();
454 + // Leading content is an element, not text — and the later text falls outside
455 + // the leading-text scan. So no attribution should be found from leading text.
456 + // Also, no preceding sibling. → None.
457 + assert!(find_attribution(el).is_none());
458 + }
459 +
460 + #[test]
461 + fn find_attribution_in_preceding_sibling() {
462 + let html = r#"<div><p>On Mon, Alice wrote:</p><div class="quote">body</div></div>"#;
463 + let (doc, sel) = parse_and_select(html, "div.quote");
464 + let el = doc.select(&sel).next().unwrap();
465 + let attr = find_attribution(el);
466 + assert!(attr.is_some(), "expected attribution from preceding <p>");
467 + }
468 +
469 + // -- Boundary tests for `has_attribution_then_quote` --
470 + // These exercise the function via `is_reply_boundary` since it's private.
471 +
472 + #[test]
473 + fn boundary_div_with_attribution_then_blockquote() {
474 + let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
475 + let (doc, sel) = parse_and_select(html, "div");
476 + let el = doc.select(&sel).next().unwrap();
477 + assert!(is_reply_boundary(el));
478 + }
479 +
480 + #[test]
481 + fn boundary_div_blockquote_without_attribution_is_false() {
482 + // A bare blockquote-wrapping div without attribution text is not a boundary.
483 + // Intended to catch "replace has_attribution_then_quote -> bool with true".
484 + // (The "with false" mutant still passes here; the positive case above kills it.)
485 + let html = r#"<div><blockquote>quoted</blockquote></div>"#;
486 + let (doc, sel) = parse_and_select(html, "div");
487 + let el = doc.select(&sel).next().unwrap();
488 + assert!(!is_reply_boundary(el));
489 + }
490 +
491 + #[test]
492 + fn boundary_div_attribution_no_blockquote_is_false() {
493 + // Attribution text but no blockquote → not a boundary.
494 + // Catches the L151 == mutation (would treat any element as blockquote).
495 + let html = r#"<div>On Mon, Alice wrote:<p>not a quote</p></div>"#;
496 + let (doc, sel) = parse_and_select(html, "div");
497 + let el = doc.select(&sel).next().unwrap();
498 + assert!(!is_reply_boundary(el));
499 + }
500 +
501 + #[test]
502 + fn boundary_div_attribution_br_blockquote() {
503 + // Attribution → <br> → blockquote. The <br> must be skipped.
504 + // Catches the L155 != mutation in br-handling.
505 + let html = r#"<div>On Mon, Alice wrote:<br><blockquote>quoted</blockquote></div>"#;
506 + let (doc, sel) = parse_and_select(html, "div");
507 + let el = doc.select(&sel).next().unwrap();
508 + assert!(is_reply_boundary(el));
509 + }
510 +
511 + #[test]
512 + fn boundary_div_non_br_element_before_attribution_is_false() {
513 + // Non-br element BEFORE finding attribution → early return false.
514 + // Catches the L157 `!` deletion.
515 + let html = r#"<div><p>preface</p>On Mon, Alice wrote:<blockquote>q</blockquote></div>"#;
516 + let (doc, sel) = parse_and_select(html, "div");
517 + let el = doc.select(&sel).next().unwrap();
518 + assert!(!is_reply_boundary(el));
519 + }
520 +
521 + // -- Boundary tests for `previous_sibling_text` --
522 + // Exercised via find_attribution since the function is private.
523 +
524 + #[test]
525 + fn prev_sibling_text_node() {
526 + // Raw Text node as preceding sibling. Inside a parent <div>, a leading
527 + // text run followed by a child <div class="q"> means the inner div's
528 + // `prev_sibling()` is a `Node::Text`. Catches `delete match arm Node::Text(text)`.
529 + let html = r#"<div>On Mon, Alice wrote:<div class="q">body</div></div>"#;
530 + let (doc, sel) = parse_and_select(html, "div.q");
531 + let el = doc.select(&sel).next().unwrap();
532 + assert!(find_attribution(el).is_some());
533 + }
534 +
535 + #[test]
536 + fn prev_sibling_inline_span_with_attribution() {
537 + let html = r#"<div><span>On Mon, Alice wrote:</span><div class="q">body</div></div>"#;
538 + let (doc, sel) = parse_and_select(html, "div.q");
539 + let el = doc.select(&sel).next().unwrap();
540 + assert!(find_attribution(el).is_some());
541 + }
542 +
543 + #[test]
544 + fn prev_sibling_inline_font_with_attribution() {
545 + // <font> is also inline-treated; covers a different arm in the matches!.
546 + let html = r#"<div><font>On Mon, Alice wrote:</font><div class="q">body</div></div>"#;
547 + let (doc, sel) = parse_and_select(html, "div.q");
548 + let el = doc.select(&sel).next().unwrap();
549 + assert!(find_attribution(el).is_some());
550 + }
551 +
552 + #[test]
553 + fn prev_sibling_non_inline_element_returns_none() {
554 + // <table> is not in the inline whitelist → preceding-sibling lookup fails.
555 + let html = r#"<div><table><tr><td>On Mon, Alice wrote:</td></tr></table><div class="q">body</div></div>"#;
556 + let (doc, sel) = parse_and_select(html, "div.q");
557 + let el = doc.select(&sel).next().unwrap();
558 + assert!(find_attribution(el).is_none());
559 + }
560 +
561 + #[test]
562 + fn prev_sibling_empty_inline_returns_none() {
563 + let html = r#"<div><span> </span><div class="q">body</div></div>"#;
564 + let (doc, sel) = parse_and_select(html, "div.q");
565 + let el = doc.select(&sel).next().unwrap();
566 + // Whitespace-only preceding span → no attribution match.
567 + assert!(find_attribution(el).is_none());
568 + }
569 +
570 + // -- Boundary tests for `is_outlook_separator` --
571 +
572 + #[test]
573 + fn outlook_from_date_subject_is_separator() {
574 + // Date instead of Sent → covers L206 || (Sent || Date) mutation
575 + let html = "<div>From: Alice\nDate: Monday\nSubject: Hello</div>";
576 + let (doc, sel) = parse_and_select(html, "div");
577 + let el = doc.select(&sel).next().unwrap();
578 + assert!(is_outlook_separator(el));
579 + }
580 +
581 + #[test]
582 + fn outlook_from_sent_no_subject_is_separator() {
583 + // From + Sent, no Subject → catches L209 mutating || to &&
584 + let html = "<div>From: Alice\nSent: Monday</div>";
585 + let (doc, sel) = parse_and_select(html, "div");
586 + let el = doc.select(&sel).next().unwrap();
587 + assert!(is_outlook_separator(el));
588 + }
589 +
590 + #[test]
591 + fn outlook_from_subject_no_sent_is_separator() {
592 + // From + Subject, no Sent/Date → catches L209 mutating || to &&
593 + let html = "<div>From: Alice\nSubject: Hello</div>";
594 + let (doc, sel) = parse_and_select(html, "div");
595 + let el = doc.select(&sel).next().unwrap();
596 + assert!(is_outlook_separator(el));
597 + }
598 +
599 + #[test]
600 + fn outlook_from_only_is_not_separator() {
601 + // From alone (no Sent/Date/Subject) → must be false.
602 + // Catches L209 && mutation to ||.
603 + let html = "<div>From: Alice</div>";
604 + let (doc, sel) = parse_and_select(html, "div");
605 + let el = doc.select(&sel).next().unwrap();
606 + assert!(!is_outlook_separator(el));
607 + }
608 +
609 + #[test]
610 + fn outlook_sent_subject_no_from_is_not_separator() {
611 + // No From → must be false regardless of Sent/Subject presence.
612 + let html = "<div>Sent: Monday\nSubject: Hello</div>";
613 + let (doc, sel) = parse_and_select(html, "div");
614 + let el = doc.select(&sel).next().unwrap();
615 + assert!(!is_outlook_separator(el));
616 + }
324 617 }
M src/tables.rs +139
@@ -294,4 +294,143 @@ mod tests {
294 294 assert_eq!(h, vec!["Name", "Val"]);
295 295 assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
296 296 }
297 +
298 + // -- Boundary tests for is_data_table role handling --
299 +
300 + #[test]
301 + fn role_none_is_layout() {
302 + // role="none" → explicit layout signal. Catches L22 `||` mutation
303 + // (presentation OR none); without the ||, "none" wouldn't short-circuit.
304 + let doc = parse_table(
305 + r#"<table role="none"><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>"#,
306 + );
307 + // Even with <th>, the explicit role="none" should win.
308 + assert!(!is_data_table(select_table(&doc)));
309 + }
310 +
311 + #[test]
312 + fn role_table_is_data() {
313 + // role="table" → data. Catches L22 == "grid" mutating to != (which would
314 + // make grid not match) AND covers the parallel `|| role == "table"` arm.
315 + let doc =
316 + parse_table(r#"<table role="table"><tr><td>a</td></tr></table>"#);
317 + assert!(is_data_table(select_table(&doc)));
318 + }
319 +
320 + #[test]
321 + fn role_unknown_falls_through_to_structural() {
322 + // Unknown role → no early decision; structural rules apply.
323 + // Single-cell single-row layout table → not data.
324 + let doc =
325 + parse_table(r#"<table role="banner"><tr><td>only one cell</td></tr></table>"#);
326 + assert!(!is_data_table(select_table(&doc)));
327 + }
328 +
329 + #[test]
330 + fn role_presentation_overrides_structure() {
331 + // role="presentation" → layout, even with multiple substantive rows.
332 + // Catches L22 == "presentation" mutating to != (which would skip this check).
333 + let doc = parse_table(
334 + r#"<table role="presentation"><tr><td>Alice</td><td>Engineer</td></tr>\
335 + <tr><td>Bob</td><td>Designer</td></tr></table>"#,
336 + );
337 + assert!(!is_data_table(select_table(&doc)));
338 + }
339 +
340 + // -- Boundary tests for has_substantive_text > 1 --
341 +
342 + #[test]
343 + fn single_char_cells_not_substantive() {
344 + // Two rows of single-char cells → not substantive → not a data table.
345 + // Catches L66 `>` mutating to `>=`: with >=, single chars become substantive
346 + // and these two rows would qualify as a data table.
347 + let doc = parse_table(
348 + "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>",
349 + );
350 + assert!(!is_data_table(select_table(&doc)));
351 + }
352 +
353 + #[test]
354 + fn two_char_cells_are_substantive() {
355 + let doc = parse_table(
356 + "<table><tr><td>ab</td><td>cd</td></tr><tr><td>ef</td><td>gh</td></tr></table>",
357 + );
358 + assert!(is_data_table(select_table(&doc)));
359 + }
360 +
361 + // -- Boundary tests for extract_table_data tbody handling --
362 +
363 + #[test]
364 + fn extract_with_tbody_no_thead() {
365 + // Catches L87 `== "tbody"` mutating to != (which would skip tbody).
366 + let doc = parse_table(
367 + "<table><tbody><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr></tbody></table>",
368 + );
369 + let (h, r) = extract_table_data(select_table(&doc));
370 + // First tbody row promoted to headers; second row is data.
371 + assert_eq!(h, vec!["Name", "Val"]);
372 + assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
373 + }
374 +
375 + // -- Boundary tests for the headers-vs-th-row decision (L104 &&) --
376 +
377 + #[test]
378 + fn thead_present_blocks_later_th_row_promotion() {
379 + // Headers already set by thead. A later th-row should NOT overwrite them.
380 + // Catches L104 `&&` mutating to `||`: with ||, has_th_cells alone would
381 + // re-promote, clobbering the thead headers.
382 + let doc = parse_table(
383 + "<table><thead><tr><th>A</th><th>B</th></tr></thead>\
384 + <tbody><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></tbody></table>",
385 + );
386 + let (h, r) = extract_table_data(select_table(&doc));
387 + assert_eq!(h, vec!["A", "B"], "thead headers must not be overwritten");
388 + // Both the th-row and the td-row become data rows.
389 + assert_eq!(r.len(), 2);
390 + }
391 +
392 + #[test]
393 + fn no_thead_th_row_promotes_to_headers() {
394 + // No thead, but a tr full of th cells → that tr's cells become headers.
395 + // Catches `has_th_cells -> bool` always-false mutation (which would
396 + // make this row become a data row instead).
397 + let doc = parse_table(
398 + "<table><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>",
399 + );
400 + let (h, r) = extract_table_data(select_table(&doc));
401 + assert_eq!(h, vec!["X", "Y"]);
402 + assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]);
403 + }
404 +
405 + #[test]
406 + fn all_td_rows_promote_first_to_headers() {
407 + // No th anywhere → has_th_cells is false for every row → first row promoted
408 + // by the `if headers.is_empty() && !rows.is_empty()` fallback.
409 + // Catches `has_th_cells -> bool` always-true mutation (which would promote
410 + // every row as headers, leaving rows empty after the first).
411 + let doc = parse_table(
412 + "<table><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr><tr><td>P</td><td>Q</td></tr></table>",
413 + );
414 + let (h, r) = extract_table_data(select_table(&doc));
415 + assert_eq!(h, vec!["Name", "Val"]);
416 + assert_eq!(r.len(), 2);
417 + }
418 +
419 + // -- Boundary test for has_th_cells (L139 == "th") --
420 +
421 + #[test]
422 + fn td_only_row_is_not_a_header_row() {
423 + // A tr with only <td> cells should NOT promote to headers when other
424 + // rows exist. Catches L139 `== "th"` mutating to `!=` (which would
425 + // match td cells and incorrectly treat every td row as a header row).
426 + let doc = parse_table(
427 + "<table><tr><td>data-1</td><td>data-2</td></tr>\
428 + <tr><td>data-3</td><td>data-4</td></tr>\
429 + <tr><td>data-5</td><td>data-6</td></tr></table>",
430 + );
431 + let (h, r) = extract_table_data(select_table(&doc));
432 + // First row is promoted (via the fallback at the end), leaving exactly two data rows.
433 + assert_eq!(h, vec!["data-1", "data-2"]);
434 + assert_eq!(r.len(), 2, "remaining rows should be data, not headers");
435 + }
297 436 }