max / pter
4 files changed,
+731 insertions,
-0 deletions
| @@ -526,6 +526,62 @@ mod tests { | |||
| 526 | 526 | } | |
| 527 | 527 | ||
| 528 | 528 | #[test] | |
| 529 | + | fn nested_list_exact_indent_depth_2() { | |
| 530 | + | // At depth 2, `list_indent` returns `"  "` (exactly two spaces, one indent level). | |
| 531 | + | // Catches `list_indent` mutations: | |
| 532 | + | // - `(depth - 1)` → `(depth + 1)`: would produce 3 indent levels (6 spaces). | |
| 533 | + | // - `(depth - 1)` → `(depth / 1)`: would produce 2 indent levels (4 spaces). | |
| 534 | + | // Either makes this exact-match assertion fail. | |
| 535 | + | // (The converter emits a blank line before each nested list — that's a | |
| 536 | + | // separate stylistic question; the *indent* is what we're pinning down here.) | |
| 537 | + | assert_eq!( | |
| 538 | + | convert("<ul><li>A<ul><li>B</li></ul></li></ul>"), | |
| 539 | + | "- A\n\n  - B" | |
| 540 | + | ); | |
| 541 | + | } | |
| 542 | + | ||
| 543 | + | #[test] | |
| 544 | + | fn triple_nested_list_exact_indent_depth_3() { | |
| 545 | + | // At depth 3, indent is exactly `"    "` (four spaces); depth 2 stays at two spaces. | |
| 546 | + | assert_eq!( | |
| 547 | + | convert("<ul><li>A<ul><li>B<ul><li>C</li></ul></li></ul></li></ul>"), | |
| 548 | + | "- A\n\n  - B\n\n    - C" | |
| 549 | + | ); | |
| 550 | + | } | |
| 551 | + | ||
| 552 | + | #[test] | |
| 553 | + | fn sibling_top_level_lists_have_no_indent_after_nesting() { | |
| 554 | + | // After a nested <ul> closes, `list_depth -= 1` must execute to return | |
| 555 | + | // to outer scope. If mutated to `+= 1` or `/= 1`, list_depth stays | |
| 556 | + | // elevated and the SECOND top-level list ends up incorrectly indented. | |
| 557 | + | let md = convert( | |
| 558 | + | "<ul><li>A<ul><li>B</li></ul></li></ul><ul><li>C</li></ul>", | |
| 559 | + | ); | |
| 560 | + | // The second list's "C" item must appear at column 0, not indented. | |
| 561 | + | // We check the exact substring "\n- C" (newline then no leading whitespace). | |
| 562 | + | assert!( | |
| 563 | + | md.contains("\n- C"), | |
| 564 | + | "second top-level list must not be indented after a nested list closes; got: {md:?}" | |
| 565 | + | ); | |
| 566 | + | // And explicitly: it must NOT appear with leading spaces. | |
| 567 | + | assert!( | |
| 568 | + | !md.contains("\n - C"), | |
| 569 | + | "second list incorrectly indented; got: {md:?}" | |
| 570 | + | ); | |
| 571 | + | } | |
| 572 | + | ||
| 573 | + | #[test] | |
| 574 | + | fn ordered_list_decrements_depth_after_nesting() { | |
| 575 | + | // Same shape but with <ol> — exercises the L218 `-= 1` mutation in the | |
| 576 | + | // OrderedList block, distinct from UnorderedList's L208. | |
| 577 | + | let md = convert( | |
| 578 | + | "<ol><li>A<ol><li>B</li></ol></li></ol><ol><li>C</li></ol>", | |
| 579 | + | ); | |
| 580 | + | assert!(md.contains("\n1. C"), "second ol must restart at depth 1: {md:?}"); | |
| 581 | + | assert!(!md.contains("\n 1. C"), "second ol indented incorrectly: {md:?}"); | |
| 582 | + | } | |
| 583 | + | ||
| 584 | + | #[test] | |
| 529 | 585 | fn blockquote() { | |
| 530 | 586 | assert_eq!(convert("<blockquote>quoted text</blockquote>"), "> quoted text"); | |
| 531 | 587 | } |
| @@ -128,6 +128,249 @@ pub fn is_tracking_pixel(el: &Element) -> bool { | |||
| 128 | 128 | false | |
| 129 | 129 | } | |
| 130 | 130 | ||
| 131 | + | #[cfg(test)] | |
| 132 | + | mod tests { | |
| 133 | + | use super::*; | |
| 134 | + | use scraper::{Html, Selector}; | |
| 135 | + | ||
| 136 | + | fn classify_tag(tag: &str) -> ElementAction { | |
| 137 | + | let html = format!("<{tag}></{tag}>"); | |
| 138 | + | let doc = Html::parse_fragment(&html); | |
| 139 | + | let sel = Selector::parse(tag).unwrap(); | |
| 140 | + | let el = doc.select(&sel).next().unwrap(); | |
| 141 | + | classify(el.value()) | |
| 142 | + | } | |
| 143 | + | ||
| 144 | + | fn img_is_pixel(attrs: &str) -> bool { | |
| 145 | + | let html = format!("<div><img {attrs} ></div>"); | |
| 146 | + | let doc = Html::parse_fragment(&html); | |
| 147 | + | let sel = Selector::parse("img").unwrap(); | |
| 148 | + | let el = doc.select(&sel).next().unwrap(); | |
| 149 | + | is_tracking_pixel(el.value()) | |
| 150 | + | } | |
| 151 | + | ||
| 152 | + | fn div_is_hidden(attrs: &str) -> bool { | |
| 153 | + | let html = format!("<div {attrs}></div>"); | |
| 154 | + | let doc = Html::parse_fragment(&html); | |
| 155 | + | let sel = Selector::parse("div").unwrap(); | |
| 156 | + | let el = doc.select(&sel).next().unwrap(); | |
| 157 | + | is_hidden(el.value()) | |
| 158 | + | } | |
| 159 | + | ||
| 160 | + | // -- classify: heading levels (h4/h5/h6 arms) -- | |
| 161 | + | // Without these arms, the elements fall through to `_ => Transparent`, | |
| 162 | + | // which differs from `Block(Heading(n))`. Tests catch the deletion. | |
| 163 | + | ||
| 164 | + | #[test] | |
| 165 | + | fn classify_h1_is_heading_1() { | |
| 166 | + | assert!(matches!(classify_tag("h1"), ElementAction::Block(BlockKind::Heading(1)))); | |
| 167 | + | } | |
| 168 | + | ||
| 169 | + | #[test] | |
| 170 | + | fn classify_h4_is_heading_4() { | |
| 171 | + | assert!(matches!(classify_tag("h4"), ElementAction::Block(BlockKind::Heading(4)))); | |
| 172 | + | } | |
| 173 | + | ||
| 174 | + | #[test] | |
| 175 | + | fn classify_h5_is_heading_5() { | |
| 176 | + | assert!(matches!(classify_tag("h5"), ElementAction::Block(BlockKind::Heading(5)))); | |
| 177 | + | } | |
| 178 | + | ||
| 179 | + | #[test] | |
| 180 | + | fn classify_h6_is_heading_6() { | |
| 181 | + | assert!(matches!(classify_tag("h6"), ElementAction::Block(BlockKind::Heading(6)))); | |
| 182 | + | } | |
| 183 | + | ||
| 184 | + | #[test] | |
| 185 | + | fn classify_script_is_skip() { | |
| 186 | + | assert!(matches!(classify_tag("script"), ElementAction::Skip)); | |
| 187 | + | } | |
| 188 | + | ||
| 189 | + | #[test] | |
| 190 | + | fn classify_table_is_block_table() { | |
| 191 | + | assert!(matches!(classify_tag("table"), ElementAction::Block(BlockKind::Table))); | |
| 192 | + | } | |
| 193 | + | ||
| 194 | + | #[test] | |
| 195 | + | fn classify_strong_is_inline_bold() { | |
| 196 | + | assert!(matches!(classify_tag("strong"), ElementAction::Inline(InlineKind::Bold))); | |
| 197 | + | } | |
| 198 | + | ||
| 199 | + | // -- is_tracking_pixel: each || arm needs its own positive test -- | |
| 200 | + | ||
| 201 | + | #[test] | |
| 202 | + | fn pixel_width_1_only() { | |
| 203 | + | assert!(img_is_pixel(r#"src="x" width="1" height="100""#)); | |
| 204 | + | } | |
| 205 | + | ||
| 206 | + | #[test] | |
| 207 | + | fn pixel_height_1_only() { | |
| 208 | + | // Catches L95 mutating || to && (width OR height; not AND) | |
| 209 | + | assert!(img_is_pixel(r#"src="x" width="100" height="1""#)); | |
| 210 | + | } | |
| 211 | + | ||
| 212 | + | #[test] | |
| 213 | + | fn pixel_width_0_only() { | |
| 214 | + | assert!(img_is_pixel(r#"src="x" width="0" height="100""#)); | |
| 215 | + | } | |
| 216 | + | ||
| 217 | + | #[test] | |
| 218 | + | fn pixel_no_src_is_pixel() { | |
| 219 | + | assert!(img_is_pixel(r#"width="100" height="100""#)); | |
| 220 | + | } | |
| 221 | + | ||
| 222 | + | #[test] | |
| 223 | + | fn pixel_empty_src_is_pixel() { | |
| 224 | + | assert!(img_is_pixel(r#"src="" width="100" height="100""#)); | |
| 225 | + | } | |
| 226 | + | ||
| 227 | + | #[test] | |
| 228 | + | fn pixel_transparent_gif_data_uri_is_pixel() { | |
| 229 | + | assert!(img_is_pixel( | |
| 230 | + | r#"src="data:image/gif;base64,R0lGODlhAQABAAAAACH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==" width="100" height="100""# | |
| 231 | + | )); | |
| 232 | + | } | |
| 233 | + | ||
| 234 | + | // Each `||` arm in the style chain (L115–122) — each needs its own input | |
| 235 | + | // that triggers ONLY that arm. Catches `replace || with &&` mutants. | |
| 236 | + | ||
| 237 | + | #[test] | |
| 238 | + | fn pixel_style_width_1px() { | |
| 239 | + | assert!(img_is_pixel(r#"src="x" style="width:1px""#)); | |
| 240 | + | } | |
| 241 | + | ||
| 242 | + | #[test] | |
| 243 | + | fn pixel_style_width_space_1px() { | |
| 244 | + | assert!(img_is_pixel(r#"src="x" style="width: 1px""#)); | |
| 245 | + | } | |
| 246 | + | ||
| 247 | + | #[test] | |
| 248 | + | fn pixel_style_width_0() { | |
| 249 | + | assert!(img_is_pixel(r#"src="x" style="width:0""#)); | |
| 250 | + | } | |
| 251 | + | ||
| 252 | + | #[test] | |
| 253 | + | fn pixel_style_height_1px() { | |
| 254 | + | assert!(img_is_pixel(r#"src="x" style="height:1px""#)); | |
| 255 | + | } | |
| 256 | + | ||
| 257 | + | #[test] | |
| 258 | + | fn pixel_style_height_space_1px() { | |
| 259 | + | assert!(img_is_pixel(r#"src="x" style="height: 1px""#)); | |
| 260 | + | } | |
| 261 | + | ||
| 262 | + | #[test] | |
| 263 | + | fn pixel_style_height_0() { | |
| 264 | + | assert!(img_is_pixel(r#"src="x" style="height:0""#)); | |
| 265 | + | } | |
| 266 | + | ||
| 267 | + | #[test] | |
| 268 | + | fn pixel_style_display_none() { | |
| 269 | + | assert!(img_is_pixel(r#"src="x" style="display:none""#)); | |
| 270 | + | } | |
| 271 | + | ||
| 272 | + | #[test] | |
| 273 | + | fn pixel_style_display_space_none() { | |
| 274 | + | assert!(img_is_pixel(r#"src="x" style="display: none""#)); | |
| 275 | + | } | |
| 276 | + | ||
| 277 | + | #[test] | |
| 278 | + | fn pixel_normal_image_is_not_pixel() { | |
| 279 | + | assert!(!img_is_pixel( | |
| 280 | + | r#"src="https://example.com/cat.jpg" width="500" height="300""# | |
| 281 | + | )); | |
| 282 | + | } | |
| 283 | + | ||
| 284 | + | // -- is_hidden: each || arm with its own targeted test -- | |
| 285 | + | ||
| 286 | + | #[test] | |
| 287 | + | fn hidden_display_none() { | |
| 288 | + | assert!(div_is_hidden(r#"style="display:none""#)); | |
| 289 | + | } | |
| 290 | + | ||
| 291 | + | #[test] | |
| 292 | + | fn hidden_display_space_none() { | |
| 293 | + | assert!(div_is_hidden(r#"style="display: none""#)); | |
| 294 | + | } | |
| 295 | + | ||
| 296 | + | #[test] | |
| 297 | + | fn hidden_visibility_hidden() { | |
| 298 | + | assert!(div_is_hidden(r#"style="visibility:hidden""#)); | |
| 299 | + | } | |
| 300 | + | ||
| 301 | + | #[test] | |
| 302 | + | fn hidden_visibility_space_hidden() { | |
| 303 | + | assert!(div_is_hidden(r#"style="visibility: hidden""#)); | |
| 304 | + | } | |
| 305 | + | ||
| 306 | + | #[test] | |
| 307 | + | fn hidden_font_size_0() { | |
| 308 | + | assert!(div_is_hidden(r#"style="font-size:0""#)); | |
| 309 | + | } | |
| 310 | + | ||
| 311 | + | #[test] | |
| 312 | + | fn hidden_font_size_space_0() { | |
| 313 | + | assert!(div_is_hidden(r#"style="font-size: 0""#)); | |
| 314 | + | } | |
| 315 | + | ||
| 316 | + | #[test] | |
| 317 | + | fn hidden_line_height_0() { | |
| 318 | + | assert!(div_is_hidden(r#"style="line-height:0""#)); | |
| 319 | + | } | |
| 320 | + | ||
| 321 | + | #[test] | |
| 322 | + | fn hidden_line_height_space_0() { | |
| 323 | + | assert!(div_is_hidden(r#"style="line-height: 0""#)); | |
| 324 | + | } | |
| 325 | + | ||
| 326 | + | // The (height:0 && overflow:hidden) and (height: 0 && overflow: hidden) arms | |
| 327 | + | // need both halves present to fire. Tests cover each form, plus the negative | |
| 328 | + | // case where height:0 alone is NOT hidden (catches && → || mutation on L146/147). | |
| 329 | + | ||
| 330 | + | #[test] | |
| 331 | + | fn hidden_height_0_with_overflow_no_spaces() { | |
| 332 | + | assert!(div_is_hidden(r#"style="height:0;overflow:hidden""#)); | |
| 333 | + | } | |
| 334 | + | ||
| 335 | + | #[test] | |
| 336 | + | fn hidden_height_0_with_overflow_with_spaces() { | |
| 337 | + | assert!(div_is_hidden(r#"style="height: 0;overflow: hidden""#)); | |
| 338 | + | } | |
| 339 | + | ||
| 340 | + | #[test] | |
| 341 | + | fn hidden_height_0_alone_is_not_hidden() { | |
| 342 | + | // Catches the L146 && → || mutation: with ||, this would erroneously be hidden. | |
| 343 | + | assert!(!div_is_hidden(r#"style="height:0""#)); | |
| 344 | + | } | |
| 345 | + | ||
| 346 | + | #[test] | |
| 347 | + | fn hidden_height_space_0_alone_is_not_hidden() { | |
| 348 | + | // Same boundary check for the space variant — catches the && → || mutation | |
| 349 | + | // on the `(height: 0 && overflow: hidden)` arm specifically. | |
| 350 | + | assert!(!div_is_hidden(r#"style="height: 0""#)); | |
| 351 | + | } | |
| 352 | + | ||
| 353 | + | #[test] | |
| 354 | + | fn hidden_max_height_0() { | |
| 355 | + | assert!(div_is_hidden(r#"style="max-height:0""#)); | |
| 356 | + | } | |
| 357 | + | ||
| 358 | + | #[test] | |
| 359 | + | fn hidden_max_height_space_0() { | |
| 360 | + | assert!(div_is_hidden(r#"style="max-height: 0""#)); | |
| 361 | + | } | |
| 362 | + | ||
| 363 | + | #[test] | |
| 364 | + | fn hidden_no_signal_in_style() { | |
| 365 | + | assert!(!div_is_hidden(r#"style="color:red;font-weight:bold""#)); | |
| 366 | + | } | |
| 367 | + | ||
| 368 | + | #[test] | |
| 369 | + | fn hidden_no_style_attr_is_not_hidden() { | |
| 370 | + | assert!(!div_is_hidden("")); | |
| 371 | + | } | |
| 372 | + | } | |
| 373 | + | ||
| 131 | 374 | /// Check if an element is hidden via inline style. | |
| 132 | 375 | /// | |
| 133 | 376 | /// Catches display:none, visibility:hidden, and spacer tricks |
| @@ -321,4 +321,297 @@ mod tests { | |||
| 321 | 321 | let el = doc.select(&sel).next().unwrap(); | |
| 322 | 322 | assert!(!is_outlook_separator(el)); | |
| 323 | 323 | } | |
| 324 | + | ||
| 325 | + | // -- Boundary tests for `is_attribution_text`: each arm needs both sides -- | |
| 326 | + | ||
| 327 | + | #[test] | |
| 328 | + | fn attribution_on_without_wrote_is_false() { | |
| 329 | + | // "On ..." without "wrote:" — catches mutating && to || | |
| 330 | + | assert!(!is_attribution_text("On the bright side, this is fine.")); | |
| 331 | + | } | |
| 332 | + | ||
| 333 | + | #[test] | |
| 334 | + | fn attribution_wrote_without_on_is_false() { | |
| 335 | + | // "... wrote:" without leading "On " — catches mutating && to || | |
| 336 | + | assert!(!is_attribution_text("Alice wrote:")); | |
| 337 | + | } | |
| 338 | + | ||
| 339 | + | #[test] | |
| 340 | + | fn attribution_french_le_with_colon_space() { | |
| 341 | + | assert!(is_attribution_text("Le lundi 5 janvier 2026, Alice a écrit :")); | |
| 342 | + | } | |
| 343 | + | ||
| 344 | + | #[test] | |
| 345 | + | fn attribution_french_le_no_space_before_colon() { | |
| 346 | + | // "écrit:" without space — covers L89 || mutation between the two ending forms | |
| 347 | + | assert!(is_attribution_text("Le lundi, Alice a écrit:")); | |
| 348 | + | } | |
| 349 | + | ||
| 350 | + | #[test] | |
| 351 | + | fn attribution_spanish_el_with_colon_space() { | |
| 352 | + | assert!(is_attribution_text("El lunes 5 de enero, Alice a escrit :")); | |
| 353 | + | } | |
| 354 | + | ||
| 355 | + | #[test] | |
| 356 | + | fn attribution_spanish_el_no_space_before_colon() { | |
| 357 | + | assert!(is_attribution_text("El lunes, Alice a escrit:")); | |
| 358 | + | } | |
| 359 | + | ||
| 360 | + | #[test] | |
| 361 | + | fn attribution_french_le_without_wrote_ending_is_false() { | |
| 362 | + | // "Le X" without "écrit" — catches L89 mutating || to && | |
| 363 | + | assert!(!is_attribution_text("Le lundi, Alice est ici.")); | |
| 364 | + | } | |
| 365 | + | ||
| 366 | + | #[test] | |
| 367 | + | fn attribution_starts_with_le_but_not_french_pattern() { | |
| 368 | + | // Word starts with "Le" but isn't the French attribution form. | |
| 369 | + | assert!(!is_attribution_text("Le sigh.")); | |
| 370 | + | } | |
| 371 | + | ||
| 372 | + | #[test] | |
| 373 | + | fn attribution_german_am_with_colon() { | |
| 374 | + | assert!(is_attribution_text("Am Montag, 5. Januar 2026, schrieb:")); | |
| 375 | + | } | |
| 376 | + | ||
| 377 | + | #[test] | |
| 378 | + | fn attribution_german_am_with_space_colon() { | |
| 379 | + | assert!(is_attribution_text("Am Montag schrieb :")); | |
| 380 | + | } | |
| 381 | + | ||
| 382 | + | #[test] | |
| 383 | + | fn attribution_german_am_without_schrieb_is_false() { | |
| 384 | + | // "Am X" without "schrieb" — catches L93 && mutation | |
| 385 | + | assert!(!is_attribution_text("Am very fine, thanks.")); | |
| 386 | + | } | |
| 387 | + | ||
| 388 | + | #[test] | |
| 389 | + | fn attribution_german_schrieb_without_am_is_false() { | |
| 390 | + | // "schrieb:" without leading "Am " — catches L93 && mutation | |
| 391 | + | assert!(!is_attribution_text("Bob schrieb:")); | |
| 392 | + | } | |
| 393 | + | ||
| 394 | + | #[test] | |
| 395 | + | fn attribution_begin_forwarded_only() { | |
| 396 | + | // Only "Begin forwarded message" present — catches the || chain mutating to && | |
| 397 | + | assert!(is_attribution_text("Begin forwarded message")); | |
| 398 | + | } | |
| 399 | + | ||
| 400 | + | #[test] | |
| 401 | + | fn attribution_original_message_only() { | |
| 402 | + | // Only "Original Message" present — catches the || chain mutating to && | |
| 403 | + | assert!(is_attribution_text("-----Original Message-----")); | |
| 404 | + | } | |
| 405 | + | ||
| 406 | + | // -- Boundary tests for `is_reply_id` -- | |
| 407 | + | ||
| 408 | + | #[test] | |
| 409 | + | fn reply_id_reply_message() { | |
| 410 | + | assert!(is_reply_id("reply-message")); | |
| 411 | + | } | |
| 412 | + | ||
| 413 | + | #[test] | |
| 414 | + | fn reply_id_olk_src_body_section() { | |
| 415 | + | assert!(is_reply_id("OLK_SRC_BODY_SECTION")); | |
| 416 | + | } | |
| 417 | + | ||
| 418 | + | #[test] | |
| 419 | + | fn reply_id_unknown_is_false() { | |
| 420 | + | // Catches `replace is_reply_id -> bool with true` mutant | |
| 421 | + | assert!(!is_reply_id("main-content")); | |
| 422 | + | assert!(!is_reply_id("")); | |
| 423 | + | assert!(!is_reply_id("reply")); | |
| 424 | + | } | |
| 425 | + | ||
| 426 | + | // -- Boundary tests for `find_attribution` -- | |
| 427 | + | ||
| 428 | + | #[test] | |
| 429 | + | fn find_attribution_in_leading_text() { | |
| 430 | + | let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#; | |
| 431 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 432 | + | let el = doc.select(&sel).next().unwrap(); | |
| 433 | + | let attr = find_attribution(el); | |
| 434 | + | assert!(attr.is_some()); | |
| 435 | + | assert!(attr.unwrap().contains("wrote:")); | |
| 436 | + | } | |
| 437 | + | ||
| 438 | + | #[test] | |
| 439 | + | fn find_attribution_none_when_no_match() { | |
| 440 | + | let html = r#"<div>Just regular text here, nothing fancy.</div>"#; | |
| 441 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 442 | + | let el = doc.select(&sel).next().unwrap(); | |
| 443 | + | assert!(find_attribution(el).is_none()); | |
| 444 | + | } | |
| 445 | + | ||
| 446 | + | #[test] | |
| 447 | + | fn find_attribution_stops_at_first_element_child() { | |
| 448 | + | // Element-then-text: the Text(_) arm should still match leading text BEFORE | |
| 449 | + | // hitting any element. With a leading element, the loop should `break` | |
| 450 | + | // out without inspecting later text. Catches "delete match arm Node::Element(_)". | |
| 451 | + | let html = r#"<div><span>hi</span>On Mon, Alice wrote:</div>"#; | |
| 452 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 453 | + | let el = doc.select(&sel).next().unwrap(); | |
| 454 | + | // Leading content is an element, not text — and the later text falls outside | |
| 455 | + | // the leading-text scan. So no attribution should be found from leading text. | |
| 456 | + | // Also, no preceding sibling. → None. | |
| 457 | + | assert!(find_attribution(el).is_none()); | |
| 458 | + | } | |
| 459 | + | ||
| 460 | + | #[test] | |
| 461 | + | fn find_attribution_in_preceding_sibling() { | |
| 462 | + | let html = r#"<div><p>On Mon, Alice wrote:</p><div class="quote">body</div></div>"#; | |
| 463 | + | let (doc, sel) = parse_and_select(html, "div.quote"); | |
| 464 | + | let el = doc.select(&sel).next().unwrap(); | |
| 465 | + | let attr = find_attribution(el); | |
| 466 | + | assert!(attr.is_some(), "expected attribution from preceding <p>"); | |
| 467 | + | } | |
| 468 | + | ||
| 469 | + | // -- Boundary tests for `has_attribution_then_quote` -- | |
| 470 | + | // These exercise the function via `is_reply_boundary` since it's private. | |
| 471 | + | ||
| 472 | + | #[test] | |
| 473 | + | fn boundary_div_with_attribution_then_blockquote() { | |
| 474 | + | let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#; | |
| 475 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 476 | + | let el = doc.select(&sel).next().unwrap(); | |
| 477 | + | assert!(is_reply_boundary(el)); | |
| 478 | + | } | |
| 479 | + | ||
| 480 | + | #[test] | |
| 481 | + | fn boundary_div_blockquote_without_attribution_is_false() { | |
| 482 | + | // A bare blockquote-wrapping div without attribution text is not a boundary. | |
| 483 | + | // Catches "replace has_attribution_then_quote -> bool with true". (The "with false" | |
| 484 | + | // mutant leaves this test passing; the positive case above is what catches that one.) | |
| 485 | + | let html = r#"<div><blockquote>quoted</blockquote></div>"#; | |
| 486 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 487 | + | let el = doc.select(&sel).next().unwrap(); | |
| 488 | + | assert!(!is_reply_boundary(el)); | |
| 489 | + | } | |
| 490 | + | ||
| 491 | + | #[test] | |
| 492 | + | fn boundary_div_attribution_no_blockquote_is_false() { | |
| 493 | + | // Attribution text but no blockquote → not a boundary. | |
| 494 | + | // Catches the L151 == mutation (would treat any element as blockquote). | |
| 495 | + | let html = r#"<div>On Mon, Alice wrote:<p>not a quote</p></div>"#; | |
| 496 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 497 | + | let el = doc.select(&sel).next().unwrap(); | |
| 498 | + | assert!(!is_reply_boundary(el)); | |
| 499 | + | } | |
| 500 | + | ||
| 501 | + | #[test] | |
| 502 | + | fn boundary_div_attribution_br_blockquote() { | |
| 503 | + | // Attribution → <br> → blockquote. The <br> must be skipped. | |
| 504 | + | // Catches the L155 != mutation in br-handling. | |
| 505 | + | let html = r#"<div>On Mon, Alice wrote:<br><blockquote>quoted</blockquote></div>"#; | |
| 506 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 507 | + | let el = doc.select(&sel).next().unwrap(); | |
| 508 | + | assert!(is_reply_boundary(el)); | |
| 509 | + | } | |
| 510 | + | ||
| 511 | + | #[test] | |
| 512 | + | fn boundary_div_non_br_element_before_attribution_is_false() { | |
| 513 | + | // Non-br element BEFORE finding attribution → early return false. | |
| 514 | + | // Catches the L157 `!` deletion. | |
| 515 | + | let html = r#"<div><p>preface</p>On Mon, Alice wrote:<blockquote>q</blockquote></div>"#; | |
| 516 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 517 | + | let el = doc.select(&sel).next().unwrap(); | |
| 518 | + | assert!(!is_reply_boundary(el)); | |
| 519 | + | } | |
| 520 | + | ||
| 521 | + | // -- Boundary tests for `previous_sibling_text` -- | |
| 522 | + | // Exercised via find_attribution since the function is private. | |
| 523 | + | ||
| 524 | + | #[test] | |
| 525 | + | fn prev_sibling_text_node() { | |
| 526 | + | // Raw Text node as preceding sibling. Inside a parent <div>, a leading | |
| 527 | + | // text run followed by a child <div class="q"> means the inner div's | |
| 528 | + | // `prev_sibling()` is a `Node::Text`. Catches `delete match arm Node::Text(text)`. | |
| 529 | + | let html = r#"<div>On Mon, Alice wrote:<div class="q">body</div></div>"#; | |
| 530 | + | let (doc, sel) = parse_and_select(html, "div.q"); | |
| 531 | + | let el = doc.select(&sel).next().unwrap(); | |
| 532 | + | assert!(find_attribution(el).is_some()); | |
| 533 | + | } | |
| 534 | + | ||
| 535 | + | #[test] | |
| 536 | + | fn prev_sibling_inline_span_with_attribution() { | |
| 537 | + | let html = r#"<div><span>On Mon, Alice wrote:</span><div class="q">body</div></div>"#; | |
| 538 | + | let (doc, sel) = parse_and_select(html, "div.q"); | |
| 539 | + | let el = doc.select(&sel).next().unwrap(); | |
| 540 | + | assert!(find_attribution(el).is_some()); | |
| 541 | + | } | |
| 542 | + | ||
| 543 | + | #[test] | |
| 544 | + | fn prev_sibling_inline_font_with_attribution() { | |
| 545 | + | // <font> is also inline-treated; covers a different arm in the matches!. | |
| 546 | + | let html = r#"<div><font>On Mon, Alice wrote:</font><div class="q">body</div></div>"#; | |
| 547 | + | let (doc, sel) = parse_and_select(html, "div.q"); | |
| 548 | + | let el = doc.select(&sel).next().unwrap(); | |
| 549 | + | assert!(find_attribution(el).is_some()); | |
| 550 | + | } | |
| 551 | + | ||
| 552 | + | #[test] | |
| 553 | + | fn prev_sibling_non_inline_element_returns_none() { | |
| 554 | + | // <table> is not in the inline whitelist → preceding-sibling lookup fails. | |
| 555 | + | let html = r#"<div><table><tr><td>On Mon, Alice wrote:</td></tr></table><div class="q">body</div></div>"#; | |
| 556 | + | let (doc, sel) = parse_and_select(html, "div.q"); | |
| 557 | + | let el = doc.select(&sel).next().unwrap(); | |
| 558 | + | assert!(find_attribution(el).is_none()); | |
| 559 | + | } | |
| 560 | + | ||
| 561 | + | #[test] | |
| 562 | + | fn prev_sibling_empty_inline_returns_none() { | |
| 563 | + | let html = r#"<div><span> </span><div class="q">body</div></div>"#; | |
| 564 | + | let (doc, sel) = parse_and_select(html, "div.q"); | |
| 565 | + | let el = doc.select(&sel).next().unwrap(); | |
| 566 | + | // Whitespace-only preceding span → no attribution match. | |
| 567 | + | assert!(find_attribution(el).is_none()); | |
| 568 | + | } | |
| 569 | + | ||
| 570 | + | // -- Boundary tests for `is_outlook_separator` -- | |
| 571 | + | ||
| 572 | + | #[test] | |
| 573 | + | fn outlook_from_date_subject_is_separator() { | |
| 574 | + | // Date instead of Sent → covers L206 || (Sent || Date) mutation | |
| 575 | + | let html = "<div>From: Alice\nDate: Monday\nSubject: Hello</div>"; | |
| 576 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 577 | + | let el = doc.select(&sel).next().unwrap(); | |
| 578 | + | assert!(is_outlook_separator(el)); | |
| 579 | + | } | |
| 580 | + | ||
| 581 | + | #[test] | |
| 582 | + | fn outlook_from_sent_no_subject_is_separator() { | |
| 583 | + | // From + Sent, no Subject → catches L209 mutating || to && | |
| 584 | + | let html = "<div>From: Alice\nSent: Monday</div>"; | |
| 585 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 586 | + | let el = doc.select(&sel).next().unwrap(); | |
| 587 | + | assert!(is_outlook_separator(el)); | |
| 588 | + | } | |
| 589 | + | ||
| 590 | + | #[test] | |
| 591 | + | fn outlook_from_subject_no_sent_is_separator() { | |
| 592 | + | // From + Subject, no Sent/Date → catches L209 mutating || to && | |
| 593 | + | let html = "<div>From: Alice\nSubject: Hello</div>"; | |
| 594 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 595 | + | let el = doc.select(&sel).next().unwrap(); | |
| 596 | + | assert!(is_outlook_separator(el)); | |
| 597 | + | } | |
| 598 | + | ||
| 599 | + | #[test] | |
| 600 | + | fn outlook_from_only_is_not_separator() { | |
| 601 | + | // From alone (no Sent/Date/Subject) → must be false. | |
| 602 | + | // Catches L209 && mutation to ||. | |
| 603 | + | let html = "<div>From: Alice</div>"; | |
| 604 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 605 | + | let el = doc.select(&sel).next().unwrap(); | |
| 606 | + | assert!(!is_outlook_separator(el)); | |
| 607 | + | } | |
| 608 | + | ||
| 609 | + | #[test] | |
| 610 | + | fn outlook_sent_subject_no_from_is_not_separator() { | |
| 611 | + | // No From → must be false regardless of Sent/Subject presence. | |
| 612 | + | let html = "<div>Sent: Monday\nSubject: Hello</div>"; | |
| 613 | + | let (doc, sel) = parse_and_select(html, "div"); | |
| 614 | + | let el = doc.select(&sel).next().unwrap(); | |
| 615 | + | assert!(!is_outlook_separator(el)); | |
| 616 | + | } | |
| 324 | 617 | } |
| @@ -294,4 +294,143 @@ mod tests { | |||
| 294 | 294 | assert_eq!(h, vec!["Name", "Val"]); | |
| 295 | 295 | assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]); | |
| 296 | 296 | } | |
| 297 | + | ||
| 298 | + | // -- Boundary tests for is_data_table role handling -- | |
| 299 | + | ||
| 300 | + | #[test] | |
| 301 | + | fn role_none_is_layout() { | |
| 302 | + | // role="none" → explicit layout signal. Catches L22 `||` mutation | |
| 303 | + | // (presentation OR none); without the ||, "none" wouldn't short-circuit. | |
| 304 | + | let doc = parse_table( | |
| 305 | + | r#"<table role="none"><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>"#, | |
| 306 | + | ); | |
| 307 | + | // Even with <th>, the explicit role="none" should win. | |
| 308 | + | assert!(!is_data_table(select_table(&doc))); | |
| 309 | + | } | |
| 310 | + | ||
| 311 | + | #[test] | |
| 312 | + | fn role_table_is_data() { | |
| 313 | + | // role="table" → data. Catches L22 == "grid" mutating to != (which would | |
| 314 | + | // make grid not match) AND covers the parallel `|| role == "table"` arm. | |
| 315 | + | let doc = | |
| 316 | + | parse_table(r#"<table role="table"><tr><td>a</td></tr></table>"#); | |
| 317 | + | assert!(is_data_table(select_table(&doc))); | |
| 318 | + | } | |
| 319 | + | ||
| 320 | + | #[test] | |
| 321 | + | fn role_unknown_falls_through_to_structural() { | |
| 322 | + | // Unknown role → no early decision; structural rules apply. | |
| 323 | + | // Single-cell single-row layout table → not data. | |
| 324 | + | let doc = | |
| 325 | + | parse_table(r#"<table role="banner"><tr><td>only one cell</td></tr></table>"#); | |
| 326 | + | assert!(!is_data_table(select_table(&doc))); | |
| 327 | + | } | |
| 328 | + | ||
| 329 | + | #[test] | |
| 330 | + | fn role_presentation_overrides_structure() { | |
| 331 | + | // role="presentation" → layout, even with multiple substantive rows. | |
| 332 | + | // Catches L22 == "presentation" mutating to != (which would skip this check). | |
| 333 | + | let doc = parse_table( | |
| 334 | + | r#"<table role="presentation"><tr><td>Alice</td><td>Engineer</td></tr> | |
| 335 | + | <tr><td>Bob</td><td>Designer</td></tr></table>"#, // raw string: `\` is literal, not a line continuation, so the rows are split by a plain line break | |
| 336 | + | ); | |
| 337 | + | assert!(!is_data_table(select_table(&doc))); | |
| 338 | + | } | |
| 339 | + | ||
| 340 | + | // -- Boundary tests for has_substantive_text > 1 -- | |
| 341 | + | ||
| 342 | + | #[test] | |
| 343 | + | fn single_char_cells_not_substantive() { | |
| 344 | + | // Two rows of single-char cells → not substantive → not a data table. | |
| 345 | + | // Catches L66 `>` mutating to `>=`: with >=, single chars become substantive | |
| 346 | + | // and these two rows would qualify as a data table. | |
| 347 | + | let doc = parse_table( | |
| 348 | + | "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>", | |
| 349 | + | ); | |
| 350 | + | assert!(!is_data_table(select_table(&doc))); | |
| 351 | + | } | |
| 352 | + | ||
| 353 | + | #[test] | |
| 354 | + | fn two_char_cells_are_substantive() { | |
| 355 | + | let doc = parse_table( | |
| 356 | + | "<table><tr><td>ab</td><td>cd</td></tr><tr><td>ef</td><td>gh</td></tr></table>", | |
| 357 | + | ); | |
| 358 | + | assert!(is_data_table(select_table(&doc))); | |
| 359 | + | } | |
| 360 | + | ||
| 361 | + | // -- Boundary tests for extract_table_data tbody handling -- | |
| 362 | + | ||
| 363 | + | #[test] | |
| 364 | + | fn extract_with_tbody_no_thead() { | |
| 365 | + | // Catches L87 `== "tbody"` mutating to != (which would skip tbody). | |
| 366 | + | let doc = parse_table( | |
| 367 | + | "<table><tbody><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr></tbody></table>", | |
| 368 | + | ); | |
| 369 | + | let (h, r) = extract_table_data(select_table(&doc)); | |
| 370 | + | // First tbody row promoted to headers; second row is data. | |
| 371 | + | assert_eq!(h, vec!["Name", "Val"]); | |
| 372 | + | assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]); | |
| 373 | + | } | |
| 374 | + | ||
| 375 | + | // -- Boundary tests for the headers-vs-th-row decision (L104 &&) -- | |
| 376 | + | ||
| 377 | + | #[test] | |
| 378 | + | fn thead_present_blocks_later_th_row_promotion() { | |
| 379 | + | // Headers already set by thead. A later th-row should NOT overwrite them. | |
| 380 | + | // Catches L104 `&&` mutating to `||`: with ||, has_th_cells alone would | |
| 381 | + | // re-promote, clobbering the thead headers. | |
| 382 | + | let doc = parse_table( | |
| 383 | + | "<table><thead><tr><th>A</th><th>B</th></tr></thead>\ | |
| 384 | + | <tbody><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></tbody></table>", | |
| 385 | + | ); | |
| 386 | + | let (h, r) = extract_table_data(select_table(&doc)); | |
| 387 | + | assert_eq!(h, vec!["A", "B"], "thead headers must not be overwritten"); | |
| 388 | + | // Both the th-row and the td-row become data rows. | |
| 389 | + | assert_eq!(r.len(), 2); | |
| 390 | + | } | |
| 391 | + | ||
| 392 | + | #[test] | |
| 393 | + | fn no_thead_th_row_promotes_to_headers() { | |
| 394 | + | // No thead, but a tr full of th cells → that tr's cells become headers. | |
| 395 | + | // Catches `has_th_cells -> bool` always-false mutation (which would | |
| 396 | + | // make this row become a data row instead). | |
| 397 | + | let doc = parse_table( | |
| 398 | + | "<table><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>", | |
| 399 | + | ); | |
| 400 | + | let (h, r) = extract_table_data(select_table(&doc)); | |
| 401 | + | assert_eq!(h, vec!["X", "Y"]); | |
| 402 | + | assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]); | |
| 403 | + | } | |
| 404 | + | ||
| 405 | + | #[test] | |
| 406 | + | fn all_td_rows_promote_first_to_headers() { | |
| 407 | + | // No th anywhere → has_th_cells is false for every row → first row promoted | |
| 408 | + | // by the `if headers.is_empty() && !rows.is_empty()` fallback. | |
| 409 | + | // Catches `has_th_cells -> bool` always-true mutation (which would promote | |
| 410 | + | // every row as headers, leaving rows empty after the first). | |
| 411 | + | let doc = parse_table( | |
| 412 | + | "<table><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr><tr><td>P</td><td>Q</td></tr></table>", | |
| 413 | + | ); | |
| 414 | + | let (h, r) = extract_table_data(select_table(&doc)); | |
| 415 | + | assert_eq!(h, vec!["Name", "Val"]); | |
| 416 | + | assert_eq!(r.len(), 2); | |
| 417 | + | } | |
| 418 | + | ||
| 419 | + | // -- Boundary test for has_th_cells (L139 == "th") -- | |
| 420 | + | ||
| 421 | + | #[test] | |
| 422 | + | fn td_only_row_is_not_a_header_row() { | |
| 423 | + | // A tr with only <td> cells should NOT promote to headers when other | |
| 424 | + | // rows exist. Catches L139 `== "th"` mutating to `!=` (which would | |
| 425 | + | // match td cells and incorrectly treat every td row as a header row). | |
| 426 | + | let doc = parse_table( | |
| 427 | + | "<table><tr><td>data-1</td><td>data-2</td></tr>\ | |
| 428 | + | <tr><td>data-3</td><td>data-4</td></tr>\ | |
| 429 | + | <tr><td>data-5</td><td>data-6</td></tr></table>", | |
| 430 | + | ); | |
| 431 | + | let (h, r) = extract_table_data(select_table(&doc)); | |
| 432 | + | // First row is promoted (via the fallback at the end), leaving exactly two data rows. | |
| 433 | + | assert_eq!(h, vec!["data-1", "data-2"]); | |
| 434 | + | assert_eq!(r.len(), 2, "remaining rows should be data, not headers"); | |
| 435 | + | } | |
| 297 | 436 | } |