max / pter

Add tests to kill surviving mutants in convert, elements, replies, tables

Targets specific mutation operators that previously survived: list-depth arithmetic in convert, classify/is_tracking_pixel/is_hidden_element in elements, locale-specific attribution-line matchers and outlook separator boundaries in replies, role-attribute and header-detection arms in tables.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-14 19:21 UTC

Commit: f595794411a25ae04f2c8555519fd4ae64bcefae

Parent: 3ed6a17

4 files changed, +731 insertions, -0 deletions

M src/convert.rs +56

			@@ -526,6 +526,62 @@ mod tests {
526	526		}
527	527
528	528		#[test]
	529	+	fn nested_list_exact_indent_depth_2() {
	530	+	// At depth 2, `list_indent` returns `"  "` (exactly two spaces, one indent level).
	531	+	// Catches `list_indent` mutations:
	532	+	// - `(depth - 1)` → `(depth + 1)`: would produce 3 indent levels (6 spaces).
	533	+	// - `(depth - 1)` → `(depth / 1)`: would produce 2 indent levels (4 spaces).
	534	+	// Either makes this exact-match assertion fail.
	535	+	// (The converter emits a blank line before each nested list — that's a
	536	+	// separate stylistic question; the indent is what we're pinning down here.)
	537	+	assert_eq!(
	538	+	convert("<ul><li>A<ul><li>B</li></ul></li></ul>"),
	539	+	"- A\n\n  - B"
	540	+	);
	541	+	}
	542	+
	543	+	#[test]
	544	+	fn triple_nested_list_exact_indent_depth_3() {
	545	+	// At depth 3, indent is exactly `"    "` (four spaces).
	546	+	assert_eq!(
	547	+	convert("<ul><li>A<ul><li>B<ul><li>C</li></ul></li></ul></li></ul>"),
	548	+	"- A\n\n  - B\n\n    - C"
	549	+	);
	550	+	}
	551	+
	552	+	#[test]
	553	+	fn sibling_top_level_lists_have_no_indent_after_nesting() {
	554	+	// After a nested <ul> closes, `list_depth -= 1` must execute to return
	555	+	// to outer scope. If mutated to `+= 1` or `/= 1`, list_depth stays
	556	+	// elevated and the SECOND top-level list ends up incorrectly indented.
	557	+	let md = convert(
	558	+	"<ul><li>A<ul><li>B</li></ul></li></ul><ul><li>C</li></ul>",
	559	+	);
	560	+	// The second list's "C" item must appear at column 0, not indented.
	561	+	// We check the exact substring "\n- C" (newline then no leading whitespace).
	562	+	assert!(
	563	+	md.contains("\n- C"),
	564	+	"second top-level list must not be indented after a nested list closes; got: {md:?}"
	565	+	);
	566	+	// And explicitly: it must NOT appear with leading spaces.
	567	+	assert!(
	568	+	!md.contains("\n - C"),
	569	+	"second list incorrectly indented; got: {md:?}"
	570	+	);
	571	+	}
	572	+
	573	+	#[test]
	574	+	fn ordered_list_decrements_depth_after_nesting() {
	575	+	// Same shape but with <ol> — exercises the L218 `-= 1` mutation in the
	576	+	// OrderedList block, distinct from UnorderedList's L208.
	577	+	let md = convert(
	578	+	"<ol><li>A<ol><li>B</li></ol></li></ol><ol><li>C</li></ol>",
	579	+	);
	580	+	assert!(md.contains("\n1. C"), "second ol must restart at depth 1: {md:?}");
	581	+	assert!(!md.contains("\n 1. C"), "second ol indented incorrectly: {md:?}");
	582	+	}
	583	+
	584	+	#[test]
529	585		fn blockquote() {
530	586		assert_eq!(convert("<blockquote>quoted text</blockquote>"), "> quoted text");
531	587		}

M src/elements.rs +243

			@@ -128,6 +128,249 @@ pub fn is_tracking_pixel(el: &Element) -> bool {
128	128		false
129	129		}
130	130
	131	+	#[cfg(test)]
	132	+	mod tests {
	133	+	use super::*;
	134	+	use scraper::{Html, Selector};
	135	+
	136	+	fn classify_tag(tag: &str) -> ElementAction {
	137	+	let html = format!("<{tag}></{tag}>");
	138	+	let doc = Html::parse_fragment(&html);
	139	+	let sel = Selector::parse(tag).unwrap();
	140	+	let el = doc.select(&sel).next().unwrap();
	141	+	classify(el.value())
	142	+	}
	143	+
	144	+	fn img_is_pixel(attrs: &str) -> bool {
	145	+	let html = format!("<div><img {attrs} ></div>");
	146	+	let doc = Html::parse_fragment(&html);
	147	+	let sel = Selector::parse("img").unwrap();
	148	+	let el = doc.select(&sel).next().unwrap();
	149	+	is_tracking_pixel(el.value())
	150	+	}
	151	+
	152	+	fn div_is_hidden(attrs: &str) -> bool {
	153	+	let html = format!("<div {attrs}></div>");
	154	+	let doc = Html::parse_fragment(&html);
	155	+	let sel = Selector::parse("div").unwrap();
	156	+	let el = doc.select(&sel).next().unwrap();
	157	+	is_hidden(el.value())
	158	+	}
	159	+
	160	+	// -- classify: heading levels (h4/h5/h6 arms) --
	161	+	// Without these arms, the elements fall through to `_ => Transparent`,
	162	+	// which differs from `Block(Heading(n))`. Tests catch the deletion.
	163	+
	164	+	#[test]
	165	+	fn classify_h1_is_heading_1() {
	166	+	assert!(matches!(classify_tag("h1"), ElementAction::Block(BlockKind::Heading(1))));
	167	+	}
	168	+
	169	+	#[test]
	170	+	fn classify_h4_is_heading_4() {
	171	+	assert!(matches!(classify_tag("h4"), ElementAction::Block(BlockKind::Heading(4))));
	172	+	}
	173	+
	174	+	#[test]
	175	+	fn classify_h5_is_heading_5() {
	176	+	assert!(matches!(classify_tag("h5"), ElementAction::Block(BlockKind::Heading(5))));
	177	+	}
	178	+
	179	+	#[test]
	180	+	fn classify_h6_is_heading_6() {
	181	+	assert!(matches!(classify_tag("h6"), ElementAction::Block(BlockKind::Heading(6))));
	182	+	}
	183	+
	184	+	#[test]
	185	+	fn classify_script_is_skip() {
	186	+	assert!(matches!(classify_tag("script"), ElementAction::Skip));
	187	+	}
	188	+
	189	+	#[test]
	190	+	fn classify_table_is_block_table() {
	191	+	assert!(matches!(classify_tag("table"), ElementAction::Block(BlockKind::Table)));
	192	+	}
	193	+
	194	+	#[test]
	195	+	fn classify_strong_is_inline_bold() {
	196	+	assert!(matches!(classify_tag("strong"), ElementAction::Inline(InlineKind::Bold)));
	197	+	}
	198	+
	199	+	// -- is_tracking_pixel: each || arm needs its own positive test --
	200	+
	201	+	#[test]
	202	+	fn pixel_width_1_only() {
	203	+	assert!(img_is_pixel(r#"src="x" width="1" height="100""#));
	204	+	}
	205	+
	206	+	#[test]
	207	+	fn pixel_height_1_only() {
	208	+	// Catches L95 mutating || to && (width OR height; not AND)
	209	+	assert!(img_is_pixel(r#"src="x" width="100" height="1""#));
	210	+	}
	211	+
	212	+	#[test]
	213	+	fn pixel_width_0_only() {
	214	+	assert!(img_is_pixel(r#"src="x" width="0" height="100""#));
	215	+	}
	216	+
	217	+	#[test]
	218	+	fn pixel_no_src_is_pixel() {
	219	+	assert!(img_is_pixel(r#"width="100" height="100""#));
	220	+	}
	221	+
	222	+	#[test]
	223	+	fn pixel_empty_src_is_pixel() {
	224	+	assert!(img_is_pixel(r#"src="" width="100" height="100""#));
	225	+	}
	226	+
	227	+	#[test]
	228	+	fn pixel_transparent_gif_data_uri_is_pixel() {
	229	+	assert!(img_is_pixel(
	230	+	r#"src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" width="100" height="100""#
	231	+	));
	232	+	}
	233	+
	234	+	// Each `||` arm in the style chain (L115–122) — each needs its own input
	235	+	// that triggers ONLY that arm. Catches `replace || with &&` mutants.
	236	+
	237	+	#[test]
	238	+	fn pixel_style_width_1px() {
	239	+	assert!(img_is_pixel(r#"src="x" style="width:1px""#));
	240	+	}
	241	+
	242	+	#[test]
	243	+	fn pixel_style_width_space_1px() {
	244	+	assert!(img_is_pixel(r#"src="x" style="width: 1px""#));
	245	+	}
	246	+
	247	+	#[test]
	248	+	fn pixel_style_width_0() {
	249	+	assert!(img_is_pixel(r#"src="x" style="width:0""#));
	250	+	}
	251	+
	252	+	#[test]
	253	+	fn pixel_style_height_1px() {
	254	+	assert!(img_is_pixel(r#"src="x" style="height:1px""#));
	255	+	}
	256	+
	257	+	#[test]
	258	+	fn pixel_style_height_space_1px() {
	259	+	assert!(img_is_pixel(r#"src="x" style="height: 1px""#));
	260	+	}
	261	+
	262	+	#[test]
	263	+	fn pixel_style_height_0() {
	264	+	assert!(img_is_pixel(r#"src="x" style="height:0""#));
	265	+	}
	266	+
	267	+	#[test]
	268	+	fn pixel_style_display_none() {
	269	+	assert!(img_is_pixel(r#"src="x" style="display:none""#));
	270	+	}
	271	+
	272	+	#[test]
	273	+	fn pixel_style_display_space_none() {
	274	+	assert!(img_is_pixel(r#"src="x" style="display: none""#));
	275	+	}
	276	+
	277	+	#[test]
	278	+	fn pixel_normal_image_is_not_pixel() {
	279	+	assert!(!img_is_pixel(
	280	+	r#"src="https://example.com/cat.jpg" width="500" height="300""#
	281	+	));
	282	+	}
	283	+
	284	+	// -- is_hidden: each || arm with its own targeted test --
	285	+
	286	+	#[test]
	287	+	fn hidden_display_none() {
	288	+	assert!(div_is_hidden(r#"style="display:none""#));
	289	+	}
	290	+
	291	+	#[test]
	292	+	fn hidden_display_space_none() {
	293	+	assert!(div_is_hidden(r#"style="display: none""#));
	294	+	}
	295	+
	296	+	#[test]
	297	+	fn hidden_visibility_hidden() {
	298	+	assert!(div_is_hidden(r#"style="visibility:hidden""#));
	299	+	}
	300	+
	301	+	#[test]
	302	+	fn hidden_visibility_space_hidden() {
	303	+	assert!(div_is_hidden(r#"style="visibility: hidden""#));
	304	+	}
	305	+
	306	+	#[test]
	307	+	fn hidden_font_size_0() {
	308	+	assert!(div_is_hidden(r#"style="font-size:0""#));
	309	+	}
	310	+
	311	+	#[test]
	312	+	fn hidden_font_size_space_0() {
	313	+	assert!(div_is_hidden(r#"style="font-size: 0""#));
	314	+	}
	315	+
	316	+	#[test]
	317	+	fn hidden_line_height_0() {
	318	+	assert!(div_is_hidden(r#"style="line-height:0""#));
	319	+	}
	320	+
	321	+	#[test]
	322	+	fn hidden_line_height_space_0() {
	323	+	assert!(div_is_hidden(r#"style="line-height: 0""#));
	324	+	}
	325	+
	326	+	// The (height:0 && overflow:hidden) and (height: 0 && overflow: hidden) arms
	327	+	// need both halves present to fire. Tests cover each form, plus the negative
	328	+	// case where height:0 alone is NOT hidden (catches && → || mutation on L146/147).
	329	+
	330	+	#[test]
	331	+	fn hidden_height_0_with_overflow_no_spaces() {
	332	+	assert!(div_is_hidden(r#"style="height:0;overflow:hidden""#));
	333	+	}
	334	+
	335	+	#[test]
	336	+	fn hidden_height_0_with_overflow_with_spaces() {
	337	+	assert!(div_is_hidden(r#"style="height: 0;overflow: hidden""#));
	338	+	}
	339	+
	340	+	#[test]
	341	+	fn hidden_height_0_alone_is_not_hidden() {
	342	+	// Catches the L146 && → || mutation: with ||, this would erroneously be hidden.
	343	+	assert!(!div_is_hidden(r#"style="height:0""#));
	344	+	}
	345	+
	346	+	#[test]
	347	+	fn hidden_height_space_0_alone_is_not_hidden() {
	348	+	// Same boundary check for the space variant — catches the && → || mutation
	349	+	// on the `(height: 0 && overflow: hidden)` arm specifically.
	350	+	assert!(!div_is_hidden(r#"style="height: 0""#));
	351	+	}
	352	+
	353	+	#[test]
	354	+	fn hidden_max_height_0() {
	355	+	assert!(div_is_hidden(r#"style="max-height:0""#));
	356	+	}
	357	+
	358	+	#[test]
	359	+	fn hidden_max_height_space_0() {
	360	+	assert!(div_is_hidden(r#"style="max-height: 0""#));
	361	+	}
	362	+
	363	+	#[test]
	364	+	fn hidden_no_signal_in_style() {
	365	+	assert!(!div_is_hidden(r#"style="color:red;font-weight:bold""#));
	366	+	}
	367	+
	368	+	#[test]
	369	+	fn hidden_no_style_attr_is_not_hidden() {
	370	+	assert!(!div_is_hidden(""));
	371	+	}
	372	+	}
	373	+
131	374		/// Check if an element is hidden via inline style.
132	375		///
133	376		/// Catches display:none, visibility:hidden, and spacer tricks

M src/replies.rs +293

			@@ -321,4 +321,297 @@ mod tests {
321	321		let el = doc.select(&sel).next().unwrap();
322	322		assert!(!is_outlook_separator(el));
323	323		}
	324	+
	325	+	// -- Boundary tests for `is_attribution_text`: each arm needs both sides --
	326	+
	327	+	#[test]
	328	+	fn attribution_on_without_wrote_is_false() {
	329	+	// "On ..." without "wrote:" — catches mutating && to ||
	330	+	assert!(!is_attribution_text("On the bright side, this is fine."));
	331	+	}
	332	+
	333	+	#[test]
	334	+	fn attribution_wrote_without_on_is_false() {
	335	+	// "... wrote:" without leading "On " — catches mutating && to ||
	336	+	assert!(!is_attribution_text("Alice wrote:"));
	337	+	}
	338	+
	339	+	#[test]
	340	+	fn attribution_french_le_with_colon_space() {
	341	+	assert!(is_attribution_text("Le lundi 5 janvier 2026, Alice a écrit :"));
	342	+	}
	343	+
	344	+	#[test]
	345	+	fn attribution_french_le_no_space_before_colon() {
	346	+	// "écrit:" without space — covers L89 || mutation between the two ending forms
	347	+	assert!(is_attribution_text("Le lundi, Alice a écrit:"));
	348	+	}
	349	+
	350	+	#[test]
	351	+	fn attribution_spanish_el_with_colon_space() {
	352	+	assert!(is_attribution_text("El lunes 5 de enero, Alice a escrit :"));
	353	+	}
	354	+
	355	+	#[test]
	356	+	fn attribution_spanish_el_no_space_before_colon() {
	357	+	assert!(is_attribution_text("El lunes, Alice a escrit:"));
	358	+	}
	359	+
	360	+	#[test]
	361	+	fn attribution_french_le_without_wrote_ending_is_false() {
	362	+	// "Le X" without "écrit" — catches L89 mutating || to &&
	363	+	assert!(!is_attribution_text("Le lundi, Alice est ici."));
	364	+	}
	365	+
	366	+	#[test]
	367	+	fn attribution_starts_with_le_but_not_french_pattern() {
	368	+	// Word starts with "Le" but isn't the French attribution form.
	369	+	assert!(!is_attribution_text("Le sigh."));
	370	+	}
	371	+
	372	+	#[test]
	373	+	fn attribution_german_am_with_colon() {
	374	+	assert!(is_attribution_text("Am Montag, 5. Januar 2026, schrieb:"));
	375	+	}
	376	+
	377	+	#[test]
	378	+	fn attribution_german_am_with_space_colon() {
	379	+	assert!(is_attribution_text("Am Montag schrieb :"));
	380	+	}
	381	+
	382	+	#[test]
	383	+	fn attribution_german_am_without_schrieb_is_false() {
	384	+	// "Am X" without "schrieb" — catches L93 && mutation
	385	+	assert!(!is_attribution_text("Am very fine, thanks."));
	386	+	}
	387	+
	388	+	#[test]
	389	+	fn attribution_german_schrieb_without_am_is_false() {
	390	+	// "schrieb:" without leading "Am " — catches L93 && mutation
	391	+	assert!(!is_attribution_text("Bob schrieb:"));
	392	+	}
	393	+
	394	+	#[test]
	395	+	fn attribution_begin_forwarded_only() {
	396	+	// Only "Begin forwarded message" present — catches the || chain mutating to &&
	397	+	assert!(is_attribution_text("Begin forwarded message"));
	398	+	}
	399	+
	400	+	#[test]
	401	+	fn attribution_original_message_only() {
	402	+	// Only "Original Message" present — catches the || chain mutating to &&
	403	+	assert!(is_attribution_text("-----Original Message-----"));
	404	+	}
	405	+
	406	+	// -- Boundary tests for `is_reply_id` --
	407	+
	408	+	#[test]
	409	+	fn reply_id_reply_message() {
	410	+	assert!(is_reply_id("reply-message"));
	411	+	}
	412	+
	413	+	#[test]
	414	+	fn reply_id_olk_src_body_section() {
	415	+	assert!(is_reply_id("OLK_SRC_BODY_SECTION"));
	416	+	}
	417	+
	418	+	#[test]
	419	+	fn reply_id_unknown_is_false() {
	420	+	// Catches `replace is_reply_id -> bool with true` mutant
	421	+	assert!(!is_reply_id("main-content"));
	422	+	assert!(!is_reply_id(""));
	423	+	assert!(!is_reply_id("reply"));
	424	+	}
	425	+
	426	+	// -- Boundary tests for `find_attribution` --
	427	+
	428	+	#[test]
	429	+	fn find_attribution_in_leading_text() {
	430	+	let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
	431	+	let (doc, sel) = parse_and_select(html, "div");
	432	+	let el = doc.select(&sel).next().unwrap();
	433	+	let attr = find_attribution(el);
	434	+	assert!(attr.is_some());
	435	+	assert!(attr.unwrap().contains("wrote:"));
	436	+	}
	437	+
	438	+	#[test]
	439	+	fn find_attribution_none_when_no_match() {
	440	+	let html = r#"<div>Just regular text here, nothing fancy.</div>"#;
	441	+	let (doc, sel) = parse_and_select(html, "div");
	442	+	let el = doc.select(&sel).next().unwrap();
	443	+	assert!(find_attribution(el).is_none());
	444	+	}
	445	+
	446	+	#[test]
	447	+	fn find_attribution_stops_at_first_element_child() {
	448	+	// Element-then-text: the Text(_) arm should still match leading text BEFORE
	449	+	// hitting any element. With a leading element, the loop should `break`
	450	+	// out without inspecting later text. Catches "delete match arm Node::Element(_)".
	451	+	let html = r#"<div><span>hi</span>On Mon, Alice wrote:</div>"#;
	452	+	let (doc, sel) = parse_and_select(html, "div");
	453	+	let el = doc.select(&sel).next().unwrap();
	454	+	// Leading content is an element, not text — and the later text falls outside
	455	+	// the leading-text scan. So no attribution should be found from leading text.
	456	+	// Also, no preceding sibling. → None.
	457	+	assert!(find_attribution(el).is_none());
	458	+	}
	459	+
	460	+	#[test]
	461	+	fn find_attribution_in_preceding_sibling() {
	462	+	let html = r#"<div><p>On Mon, Alice wrote:</p><div class="quote">body</div></div>"#;
	463	+	let (doc, sel) = parse_and_select(html, "div.quote");
	464	+	let el = doc.select(&sel).next().unwrap();
	465	+	let attr = find_attribution(el);
	466	+	assert!(attr.is_some(), "expected attribution from preceding <p>");
	467	+	}
	468	+
	469	+	// -- Boundary tests for `has_attribution_then_quote` --
	470	+	// These exercise the function via `is_reply_boundary` since it's private.
	471	+
	472	+	#[test]
	473	+	fn boundary_div_with_attribution_then_blockquote() {
	474	+	let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
	475	+	let (doc, sel) = parse_and_select(html, "div");
	476	+	let el = doc.select(&sel).next().unwrap();
	477	+	assert!(is_reply_boundary(el));
	478	+	}
	479	+
	480	+	#[test]
	481	+	fn boundary_div_blockquote_without_attribution_is_false() {
	482	+	// A bare blockquote-wrapping div without attribution text is not a boundary.
	483	+	// Catches "replace has_attribution_then_quote -> bool with true". (The
	484	+	// "with false" mutant still passes here; the positive case above catches it.)
	485	+	let html = r#"<div><blockquote>quoted</blockquote></div>"#;
	486	+	let (doc, sel) = parse_and_select(html, "div");
	487	+	let el = doc.select(&sel).next().unwrap();
	488	+	assert!(!is_reply_boundary(el));
	489	+	}
	490	+
	491	+	#[test]
	492	+	fn boundary_div_attribution_no_blockquote_is_false() {
	493	+	// Attribution text but no blockquote → not a boundary.
	494	+	// Catches the L151 == mutation (would treat any element as blockquote).
	495	+	let html = r#"<div>On Mon, Alice wrote:<p>not a quote</p></div>"#;
	496	+	let (doc, sel) = parse_and_select(html, "div");
	497	+	let el = doc.select(&sel).next().unwrap();
	498	+	assert!(!is_reply_boundary(el));
	499	+	}
	500	+
	501	+	#[test]
	502	+	fn boundary_div_attribution_br_blockquote() {
	503	+	// Attribution → <br> → blockquote. The <br> must be skipped.
	504	+	// Catches the L155 != mutation in br-handling.
	505	+	let html = r#"<div>On Mon, Alice wrote:<br><blockquote>quoted</blockquote></div>"#;
	506	+	let (doc, sel) = parse_and_select(html, "div");
	507	+	let el = doc.select(&sel).next().unwrap();
	508	+	assert!(is_reply_boundary(el));
	509	+	}
	510	+
	511	+	#[test]
	512	+	fn boundary_div_non_br_element_before_attribution_is_false() {
	513	+	// Non-br element BEFORE finding attribution → early return false.
	514	+	// Catches the L157 `!` deletion.
	515	+	let html = r#"<div><p>preface</p>On Mon, Alice wrote:<blockquote>q</blockquote></div>"#;
	516	+	let (doc, sel) = parse_and_select(html, "div");
	517	+	let el = doc.select(&sel).next().unwrap();
	518	+	assert!(!is_reply_boundary(el));
	519	+	}
	520	+
	521	+	// -- Boundary tests for `previous_sibling_text` --
	522	+	// Exercised via find_attribution since the function is private.
	523	+
	524	+	#[test]
	525	+	fn prev_sibling_text_node() {
	526	+	// Raw Text node as preceding sibling. Inside a parent <div>, a leading
	527	+	// text run followed by a child <div class="q"> means the inner div's
	528	+	// `prev_sibling()` is a `Node::Text`. Catches `delete match arm Node::Text(text)`.
	529	+	let html = r#"<div>On Mon, Alice wrote:<div class="q">body</div></div>"#;
	530	+	let (doc, sel) = parse_and_select(html, "div.q");
	531	+	let el = doc.select(&sel).next().unwrap();
	532	+	assert!(find_attribution(el).is_some());
	533	+	}
	534	+
	535	+	#[test]
	536	+	fn prev_sibling_inline_span_with_attribution() {
	537	+	let html = r#"<div><span>On Mon, Alice wrote:</span><div class="q">body</div></div>"#;
	538	+	let (doc, sel) = parse_and_select(html, "div.q");
	539	+	let el = doc.select(&sel).next().unwrap();
	540	+	assert!(find_attribution(el).is_some());
	541	+	}
	542	+
	543	+	#[test]
	544	+	fn prev_sibling_inline_font_with_attribution() {
	545	+	// <font> is also inline-treated; covers a different arm in the matches!.
	546	+	let html = r#"<div><font>On Mon, Alice wrote:</font><div class="q">body</div></div>"#;
	547	+	let (doc, sel) = parse_and_select(html, "div.q");
	548	+	let el = doc.select(&sel).next().unwrap();
	549	+	assert!(find_attribution(el).is_some());
	550	+	}
	551	+
	552	+	#[test]
	553	+	fn prev_sibling_non_inline_element_returns_none() {
	554	+	// <table> is not in the inline whitelist → preceding-sibling lookup fails.
	555	+	let html = r#"<div><table><tr><td>On Mon, Alice wrote:</td></tr></table><div class="q">body</div></div>"#;
	556	+	let (doc, sel) = parse_and_select(html, "div.q");
	557	+	let el = doc.select(&sel).next().unwrap();
	558	+	assert!(find_attribution(el).is_none());
	559	+	}
	560	+
	561	+	#[test]
	562	+	fn prev_sibling_empty_inline_returns_none() {
	563	+	let html = r#"<div><span> </span><div class="q">body</div></div>"#;
	564	+	let (doc, sel) = parse_and_select(html, "div.q");
	565	+	let el = doc.select(&sel).next().unwrap();
	566	+	// Whitespace-only preceding span → no attribution match.
	567	+	assert!(find_attribution(el).is_none());
	568	+	}
	569	+
	570	+	// -- Boundary tests for `is_outlook_separator` --
	571	+
	572	+	#[test]
	573	+	fn outlook_from_date_subject_is_separator() {
	574	+	// Date instead of Sent → covers L206 || (Sent || Date) mutation
	575	+	let html = "<div>From: Alice\nDate: Monday\nSubject: Hello</div>";
	576	+	let (doc, sel) = parse_and_select(html, "div");
	577	+	let el = doc.select(&sel).next().unwrap();
	578	+	assert!(is_outlook_separator(el));
	579	+	}
	580	+
	581	+	#[test]
	582	+	fn outlook_from_sent_no_subject_is_separator() {
	583	+	// From + Sent, no Subject → catches L209 mutating || to &&
	584	+	let html = "<div>From: Alice\nSent: Monday</div>";
	585	+	let (doc, sel) = parse_and_select(html, "div");
	586	+	let el = doc.select(&sel).next().unwrap();
	587	+	assert!(is_outlook_separator(el));
	588	+	}
	589	+
	590	+	#[test]
	591	+	fn outlook_from_subject_no_sent_is_separator() {
	592	+	// From + Subject, no Sent/Date → catches L209 mutating || to &&
	593	+	let html = "<div>From: Alice\nSubject: Hello</div>";
	594	+	let (doc, sel) = parse_and_select(html, "div");
	595	+	let el = doc.select(&sel).next().unwrap();
	596	+	assert!(is_outlook_separator(el));
	597	+	}
	598	+
	599	+	#[test]
	600	+	fn outlook_from_only_is_not_separator() {
	601	+	// From alone (no Sent/Date/Subject) → must be false.
	602	+	// Catches L209 && mutation to ||.
	603	+	let html = "<div>From: Alice</div>";
	604	+	let (doc, sel) = parse_and_select(html, "div");
	605	+	let el = doc.select(&sel).next().unwrap();
	606	+	assert!(!is_outlook_separator(el));
	607	+	}
	608	+
	609	+	#[test]
	610	+	fn outlook_sent_subject_no_from_is_not_separator() {
	611	+	// No From → must be false regardless of Sent/Subject presence.
	612	+	let html = "<div>Sent: Monday\nSubject: Hello</div>";
	613	+	let (doc, sel) = parse_and_select(html, "div");
	614	+	let el = doc.select(&sel).next().unwrap();
	615	+	assert!(!is_outlook_separator(el));
	616	+	}
324	617		}

M src/tables.rs +139

			@@ -294,4 +294,143 @@ mod tests {
294	294		assert_eq!(h, vec!["Name", "Val"]);
295	295		assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
296	296		}
	297	+
	298	+	// -- Boundary tests for is_data_table role handling --
	299	+
	300	+	#[test]
	301	+	fn role_none_is_layout() {
	302	+	// role="none" → explicit layout signal. Catches L22 `||` mutation
	303	+	// (presentation OR none); without the ||, "none" wouldn't short-circuit.
	304	+	let doc = parse_table(
	305	+	r#"<table role="none"><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>"#,
	306	+	);
	307	+	// Even with <th>, the explicit role="none" should win.
	308	+	assert!(!is_data_table(select_table(&doc)));
	309	+	}
	310	+
	311	+	#[test]
	312	+	fn role_table_is_data() {
	313	+	// role="table" → data. Catches L22 == "grid" mutating to != (which would
	314	+	// make grid not match) AND covers the parallel `|| role == "table"` arm.
	315	+	let doc =
	316	+	parse_table(r#"<table role="table"><tr><td>a</td></tr></table>"#);
	317	+	assert!(is_data_table(select_table(&doc)));
	318	+	}
	319	+
	320	+	#[test]
	321	+	fn role_unknown_falls_through_to_structural() {
	322	+	// Unknown role → no early decision; structural rules apply.
	323	+	// Single-cell single-row layout table → not data.
	324	+	let doc =
	325	+	parse_table(r#"<table role="banner"><tr><td>only one cell</td></tr></table>"#);
	326	+	assert!(!is_data_table(select_table(&doc)));
	327	+	}
	328	+
	329	+	#[test]
	330	+	fn role_presentation_overrides_structure() {
	331	+	// role="presentation" → layout, even with multiple substantive rows.
	332	+	// Catches L22 == "presentation" mutating to != (which would skip this check).
	333	+	let doc = parse_table(
	334	+	r#"<table role="presentation"><tr><td>Alice</td><td>Engineer</td></tr>\
	335	+	<tr><td>Bob</td><td>Designer</td></tr></table>"#,
	336	+	);
	337	+	assert!(!is_data_table(select_table(&doc)));
	338	+	}
	339	+
	340	+	// -- Boundary tests for has_substantive_text > 1 --
	341	+
	342	+	#[test]
	343	+	fn single_char_cells_not_substantive() {
	344	+	// Two rows of single-char cells → not substantive → not a data table.
	345	+	// Catches L66 `>` mutating to `>=`: with >=, single chars become substantive
	346	+	// and these two rows would qualify as a data table.
	347	+	let doc = parse_table(
	348	+	"<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>",
	349	+	);
	350	+	assert!(!is_data_table(select_table(&doc)));
	351	+	}
	352	+
	353	+	#[test]
	354	+	fn two_char_cells_are_substantive() {
	355	+	let doc = parse_table(
	356	+	"<table><tr><td>ab</td><td>cd</td></tr><tr><td>ef</td><td>gh</td></tr></table>",
	357	+	);
	358	+	assert!(is_data_table(select_table(&doc)));
	359	+	}
	360	+
	361	+	// -- Boundary tests for extract_table_data tbody handling --
	362	+
	363	+	#[test]
	364	+	fn extract_with_tbody_no_thead() {
	365	+	// Catches L87 `== "tbody"` mutating to != (which would skip tbody).
	366	+	let doc = parse_table(
	367	+	"<table><tbody><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr></tbody></table>",
	368	+	);
	369	+	let (h, r) = extract_table_data(select_table(&doc));
	370	+	// First tbody row promoted to headers; second row is data.
	371	+	assert_eq!(h, vec!["Name", "Val"]);
	372	+	assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
	373	+	}
	374	+
	375	+	// -- Boundary tests for the headers-vs-th-row decision (L104 &&) --
	376	+
	377	+	#[test]
	378	+	fn thead_present_blocks_later_th_row_promotion() {
	379	+	// Headers already set by thead. A later th-row should NOT overwrite them.
	380	+	// Catches L104 `&&` mutating to `||`: with ||, has_th_cells alone would
	381	+	// re-promote, clobbering the thead headers.
	382	+	let doc = parse_table(
	383	+	"<table><thead><tr><th>A</th><th>B</th></tr></thead>\
	384	+	<tbody><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></tbody></table>",
	385	+	);
	386	+	let (h, r) = extract_table_data(select_table(&doc));
	387	+	assert_eq!(h, vec!["A", "B"], "thead headers must not be overwritten");
	388	+	// Both the th-row and the td-row become data rows.
	389	+	assert_eq!(r.len(), 2);
	390	+	}
	391	+
	392	+	#[test]
	393	+	fn no_thead_th_row_promotes_to_headers() {
	394	+	// No thead, but a tr full of th cells → that tr's cells become headers.
	395	+	// Catches `has_th_cells -> bool` always-false mutation (which would
	396	+	// make this row become a data row instead).
	397	+	let doc = parse_table(
	398	+	"<table><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>",
	399	+	);
	400	+	let (h, r) = extract_table_data(select_table(&doc));
	401	+	assert_eq!(h, vec!["X", "Y"]);
	402	+	assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]);
	403	+	}
	404	+
	405	+	#[test]
	406	+	fn all_td_rows_promote_first_to_headers() {
	407	+	// No th anywhere → has_th_cells is false for every row → first row promoted
	408	+	// by the `if headers.is_empty() && !rows.is_empty()` fallback.
	409	+	// Catches `has_th_cells -> bool` always-true mutation (which would promote
	410	+	// every row as headers, leaving rows empty after the first).
	411	+	let doc = parse_table(
	412	+	"<table><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr><tr><td>P</td><td>Q</td></tr></table>",
	413	+	);
	414	+	let (h, r) = extract_table_data(select_table(&doc));
	415	+	assert_eq!(h, vec!["Name", "Val"]);
	416	+	assert_eq!(r.len(), 2);
	417	+	}
	418	+
	419	+	// -- Boundary test for has_th_cells (L139 == "th") --
	420	+
	421	+	#[test]
	422	+	fn td_only_row_is_not_a_header_row() {
	423	+	// A tr with only <td> cells should NOT promote to headers when other
	424	+	// rows exist. Catches L139 `== "th"` mutating to `!=` (which would
	425	+	// match td cells and incorrectly treat every td row as a header row).
	426	+	let doc = parse_table(
	427	+	"<table><tr><td>data-1</td><td>data-2</td></tr>\
	428	+	<tr><td>data-3</td><td>data-4</td></tr>\
	429	+	<tr><td>data-5</td><td>data-6</td></tr></table>",
	430	+	);
	431	+	let (h, r) = extract_table_data(select_table(&doc));
	432	+	// First row is promoted (via the fallback at the end), leaving exactly two data rows.
	433	+	assert_eq!(h, vec!["data-1", "data-2"]);
	434	+	assert_eq!(r.len(), 2, "remaining rows should be data, not headers");
	435	+	}
297	436		}