Skip to main content

max / pter

21.3 KB · 618 lines History Blame Raw
1 use scraper::node::Node;
2 use scraper::ElementRef;
3
4 /// Check if an element marks the beginning of a quoted reply.
5 ///
6 /// This is the central abstraction for reply detection. Rather than
7 /// building per-client logic throughout the converter, all client-specific
8 /// knowledge lives here behind a single predicate.
9 ///
10 /// An element is a reply boundary if it's a container that wraps quoted
11 /// content from a previous message in the thread. The converter treats
12 /// these identically to `<blockquote>` — children get `>` prefixed.
13 pub fn is_reply_boundary(el: ElementRef) -> bool {
14 let element = el.value();
15 let name = element.name();
16
17 // <blockquote> is already handled by the element classifier.
18 // This function catches non-blockquote reply wrappers.
19
20 // Structural: elements with type="cite" (Apple Mail, some webmail)
21 if element.attr("type") == Some("cite") {
22 return true;
23 }
24
25 // Class/ID-based detection — thin per-client checks
26 if element.attr("class").is_some_and(is_reply_class) {
27 return true;
28 }
29
30 if element.attr("id").is_some_and(is_reply_id) {
31 return true;
32 }
33
34 // Heuristic: a <div> whose first meaningful text child matches
35 // an attribution pattern ("On ... wrote:") followed by a blockquote
36 if name == "div" && has_attribution_then_quote(el) {
37 return true;
38 }
39
40 false
41 }
42
43 /// Extract an attribution line from just before or at the start of a reply boundary.
44 ///
45 /// Returns the attribution text (e.g. "On Mon, Jan 5, Alice wrote:") if found,
46 /// so the converter can render it above the quoted block.
47 pub fn find_attribution(el: ElementRef) -> Option<String> {
48 // Check the element's own leading text for attribution patterns
49 for child in el.children() {
50 match child.value() {
51 Node::Text(text) => {
52 let trimmed = text.text.trim();
53 if is_attribution_text(trimmed) {
54 return Some(trimmed.to_string());
55 }
56 }
57 Node::Element(_) => {
58 // Stop at the first child element — attribution is leading text
59 break;
60 }
61 _ => {}
62 }
63 }
64
65 // Check for a preceding sibling text node or element with attribution
66 if let Some(prev) = previous_sibling_text(el) {
67 let trimmed = prev.trim().to_string();
68 if is_attribution_text(&trimmed) {
69 return Some(trimmed);
70 }
71 }
72
73 None
74 }
75
/// Check if text matches common email attribution patterns.
///
/// The input is trimmed first. It is accepted when it is one of:
///
/// * English: starts with `"On "` and ends with `"wrote:"`
///   (Gmail, Apple Mail, Thunderbird, most clients).
/// * French: starts with `"Le "` and ends with `"crit"` + optional
///   whitespace + `":"` (e.g. `"Le ..., Alice a écrit :"`). The `"El "` prefix
///   is accepted with the same ending, as before.
/// * Spanish: starts with `"El "` and ends with `"escribió:"`.
/// * German: starts with `"Am "`, ends with `":"`, and either ends with
///   `"schrieb"` right before the colon or contains `" schrieb "` earlier
///   (e.g. `"Am ... schrieb Alice <alice@example.com>:"`).
/// * A forward/original-message separator: contains `"Forwarded message"`,
///   `"Begin forwarded message"` or `"Original Message"`.
///
/// The whitespace before the final colon is matched with `str::trim_end`, so
/// a no-break space (U+00A0) — the usual French typography — is handled as
/// well as a regular space.
fn is_attribution_text(text: &str) -> bool {
    let t = text.trim();

    // "On ... wrote:"
    if t.starts_with("On ") && t.ends_with("wrote:") {
        return true;
    }

    // Localized variants all end in a colon. Peel it off together with any
    // whitespace in front of it so "écrit:", "écrit :" and "écrit\u{a0}:"
    // are treated alike.
    if let Some(head) = t.strip_suffix(':') {
        let head = head.trim_end();

        let french = (t.starts_with("Le ") || t.starts_with("El ")) && head.ends_with("crit");
        let spanish = t.starts_with("El ") && head.ends_with("escribió");
        // German puts the author after the verb, so "schrieb" is usually
        // not the last word before the colon.
        let german =
            t.starts_with("Am ") && (head.ends_with("schrieb") || head.contains(" schrieb "));

        if french || spanish || german {
            return true;
        }
    }

    // Forwarded message separators. "Begin forwarded message" is listed
    // separately because its lowercase "forwarded" does not match the
    // capitalised "Forwarded message".
    [
        "Forwarded message",
        "Begin forwarded message",
        "Original Message",
    ]
    .iter()
    .any(|marker| t.contains(*marker))
}
107
/// Report whether a `class` attribute value names a known reply wrapper.
///
/// The attribute is split on whitespace and each class name is compared
/// exactly (case-sensitively) against the table below. Supporting another
/// client is a one-line addition to that table.
fn is_reply_class(class: &str) -> bool {
    const REPLY_CLASSES: [&str; 8] = [
        "gmail_quote",
        "gmail_extra",
        "yahoo_quoted",
        "protonmail_quote",
        "tutanota_quote",
        "moz-cite-prefix", // Thunderbird
        "zmail_extra",     // Zoho
        "WordSection1",    // Outlook (sometimes wraps replies)
    ];

    class
        .split_whitespace()
        .any(|token| REPLY_CLASSES.contains(&token))
}
125
/// Report whether an `id` attribute value is a known reply-wrapper id.
///
/// The comparison is exact and case-sensitive against the whole attribute
/// value.
fn is_reply_id(id: &str) -> bool {
    [
        "divRplyFwdMsg",        // Outlook
        "reply-message",        // Generic
        "OLK_SRC_BODY_SECTION", // Outlook Mac
    ]
    .iter()
    .any(|known| *known == id)
}
135
136 /// Check if a div contains attribution text followed by a blockquote.
137 ///
138 /// This catches the common pattern where no class/id is present but
139 /// the structure is: `<div>On ... wrote:<br><blockquote>...</blockquote></div>`
140 fn has_attribution_then_quote(el: ElementRef) -> bool {
141 let mut found_attribution = false;
142
143 for child in el.children() {
144 match child.value() {
145 Node::Text(text) => {
146 if is_attribution_text(text.text.trim()) {
147 found_attribution = true;
148 }
149 }
150 Node::Element(e) => {
151 if found_attribution && e.name() == "blockquote" {
152 return true;
153 }
154 // Skip <br> tags between attribution and blockquote
155 if e.name() != "br" {
156 // If we hit a non-br element before finding attribution, stop
157 if !found_attribution {
158 return false;
159 }
160 }
161 }
162 _ => {}
163 }
164 }
165
166 false
167 }
168
169 /// Get text from the previous sibling, if it exists and is a text or inline element.
170 fn previous_sibling_text(el: ElementRef) -> Option<String> {
171 let prev = el.prev_sibling()?;
172
173 match prev.value() {
174 Node::Text(text) => Some(text.text.to_string()),
175 Node::Element(e) => {
176 // Check inline elements like <span>, <font> that might wrap attribution
177 if matches!(e.name(), "span" | "font" | "b" | "i" | "div" | "p") {
178 let el_ref = ElementRef::wrap(prev)?;
179 let text: String = el_ref.text().collect();
180 if !text.trim().is_empty() {
181 return Some(text);
182 }
183 }
184 None
185 }
186 _ => None,
187 }
188 }
189
190 /// Check if a separator element marks the boundary between original
191 /// content and a forwarded/replied message.
192 ///
193 /// This catches `<hr>` or styled divs that act as visual separators
194 /// before reply content (common in Outlook "From: ... Sent: ..." blocks).
195 pub fn is_outlook_separator(el: ElementRef) -> bool {
196 let element = el.value();
197
198 // Outlook uses a specific pattern: a div containing
199 // "From: ... Sent: ... To: ... Subject: ..." as a reply header
200 if element.name() == "div" || element.name() == "p" {
201 let text: String = el.text().collect();
202 let t = text.trim();
203
204 // Must have at least From + Sent/Date or Subject
205 let has_from = t.contains("From:");
206 let has_sent = t.contains("Sent:") || t.contains("Date:");
207 let has_subject = t.contains("Subject:");
208
209 if has_from && (has_sent || has_subject) {
210 return true;
211 }
212 }
213
214 false
215 }
216
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{ElementRef, Html, Selector};

    /// Parse `html` as a document, take the first element matching
    /// `selector`, and return whatever `probe` computes from it.
    ///
    /// Panics when the selector is invalid or matches nothing, which fails
    /// the calling test.
    fn on_first<T>(html: &str, selector: &str, probe: impl FnOnce(ElementRef) -> T) -> T {
        let document = Html::parse_document(html);
        let matcher = Selector::parse(selector).unwrap();
        let target = document.select(&matcher).next().unwrap();
        probe(target)
    }

    // ---- is_attribution_text: basic patterns ----

    #[test]
    fn attribution_on_wrote() {
        let line = "On Mon, Jan 5, 2026 at 3:00 PM Alice <alice@example.com> wrote:";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_forwarded() {
        assert!(is_attribution_text("---------- Forwarded message ----------"));
    }

    #[test]
    fn attribution_original_message() {
        assert!(is_attribution_text("-----Original Message-----"));
    }

    #[test]
    fn attribution_begin_forwarded() {
        assert!(is_attribution_text("Begin forwarded message:"));
    }

    #[test]
    fn not_attribution() {
        for plain in ["Hello, how are you?", "On the other hand, this is fine."] {
            assert!(!is_attribution_text(plain));
        }
    }

    // ---- is_reply_class ----

    #[test]
    fn gmail_quote_class() {
        assert!(is_reply_class("gmail_quote"));
    }

    #[test]
    fn multiple_classes_with_reply() {
        assert!(is_reply_class("some-class gmail_quote another"));
    }

    #[test]
    fn non_reply_class() {
        assert!(!is_reply_class("regular-div content-wrapper"));
    }

    // ---- is_reply_boundary: attribute-driven detection ----

    #[test]
    fn type_cite_is_boundary() {
        let html = r#"<div type="cite"><p>quoted</p></div>"#;
        assert!(on_first(html, r#"div[type="cite"]"#, is_reply_boundary));
    }

    #[test]
    fn gmail_quote_is_boundary() {
        let html = r#"<div class="gmail_quote"><p>quoted</p></div>"#;
        assert!(on_first(html, "div.gmail_quote", is_reply_boundary));
    }

    #[test]
    fn outlook_id_is_boundary() {
        let html = r#"<div id="divRplyFwdMsg"><p>quoted</p></div>"#;
        assert!(on_first(html, "#divRplyFwdMsg", is_reply_boundary));
    }

    #[test]
    fn plain_div_not_boundary() {
        let html = r#"<div class="content"><p>not quoted</p></div>"#;
        assert!(!on_first(html, "div.content", is_reply_boundary));
    }

    // ---- is_outlook_separator: basics ----

    #[test]
    fn outlook_from_sent_subject() {
        let html = "<div>From: Alice\nSent: Monday\nTo: Bob\nSubject: Hello</div>";
        assert!(on_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn regular_div_not_separator() {
        let html = "<div>Just a normal paragraph.</div>";
        assert!(!on_first(html, "div", is_outlook_separator));
    }

    // ---- is_attribution_text: both sides of every condition ----

    #[test]
    fn attribution_on_without_wrote_is_false() {
        // Prefix alone must not be enough (guards the prefix-AND-suffix test).
        assert!(!is_attribution_text("On the bright side, this is fine."));
    }

    #[test]
    fn attribution_wrote_without_on_is_false() {
        // Suffix alone must not be enough either.
        assert!(!is_attribution_text("Alice wrote:"));
    }

    #[test]
    fn attribution_french_le_with_colon_space() {
        assert!(is_attribution_text("Le lundi 5 janvier 2026, Alice a écrit :"));
    }

    #[test]
    fn attribution_french_le_no_space_before_colon() {
        // The colon may follow "écrit" directly; exercises the second of the
        // two accepted endings.
        assert!(is_attribution_text("Le lundi, Alice a écrit:"));
    }

    #[test]
    fn attribution_spanish_el_with_colon_space() {
        assert!(is_attribution_text("El lunes 5 de enero, Alice a escrit :"));
    }

    #[test]
    fn attribution_spanish_el_no_space_before_colon() {
        assert!(is_attribution_text("El lunes, Alice a escrit:"));
    }

    #[test]
    fn attribution_french_le_without_wrote_ending_is_false() {
        // "Le ..." prefix with an unrelated ending is plain prose.
        assert!(!is_attribution_text("Le lundi, Alice est ici."));
    }

    #[test]
    fn attribution_starts_with_le_but_not_french_pattern() {
        // Begins with "Le " yet is not the French attribution form.
        assert!(!is_attribution_text("Le sigh."));
    }

    #[test]
    fn attribution_german_am_with_colon() {
        assert!(is_attribution_text("Am Montag, 5. Januar 2026, schrieb:"));
    }

    #[test]
    fn attribution_german_am_with_space_colon() {
        assert!(is_attribution_text("Am Montag schrieb :"));
    }

    #[test]
    fn attribution_german_am_without_schrieb_is_false() {
        // "Am ..." prefix without the German verb is not an attribution.
        assert!(!is_attribution_text("Am very fine, thanks."));
    }

    #[test]
    fn attribution_german_schrieb_without_am_is_false() {
        // The German ending without the "Am " prefix is not an attribution.
        assert!(!is_attribution_text("Bob schrieb:"));
    }

    #[test]
    fn attribution_begin_forwarded_only() {
        // One separator phrase on its own suffices; the phrases are
        // alternatives, not joint requirements.
        assert!(is_attribution_text("Begin forwarded message"));
    }

    #[test]
    fn attribution_original_message_only() {
        // Same as above for the "Original Message" phrase.
        assert!(is_attribution_text("-----Original Message-----"));
    }

    // ---- is_reply_id ----

    #[test]
    fn reply_id_reply_message() {
        assert!(is_reply_id("reply-message"));
    }

    #[test]
    fn reply_id_olk_src_body_section() {
        assert!(is_reply_id("OLK_SRC_BODY_SECTION"));
    }

    #[test]
    fn reply_id_unknown_is_false() {
        // Guards against the predicate degenerating into "always true".
        for unknown in ["main-content", "", "reply"] {
            assert!(!is_reply_id(unknown));
        }
    }

    // ---- find_attribution ----

    #[test]
    fn find_attribution_in_leading_text() {
        let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
        let found = on_first(html, "div", find_attribution);
        assert!(found.is_some());
        assert!(found.unwrap().contains("wrote:"));
    }

    #[test]
    fn find_attribution_none_when_no_match() {
        let html = r#"<div>Just regular text here, nothing fancy.</div>"#;
        assert!(on_first(html, "div", find_attribution).is_none());
    }

    #[test]
    fn find_attribution_stops_at_first_element_child() {
        // The leading-text scan must give up at the first child element, so
        // attribution-looking text that comes *after* an element is ignored.
        // The selected div has no preceding sibling either, hence `None`.
        let html = r#"<div><span>hi</span>On Mon, Alice wrote:</div>"#;
        assert!(on_first(html, "div", find_attribution).is_none());
    }

    #[test]
    fn find_attribution_in_preceding_sibling() {
        let html = r#"<div><p>On Mon, Alice wrote:</p><div class="quote">body</div></div>"#;
        let found = on_first(html, "div.quote", find_attribution);
        assert!(found.is_some(), "expected attribution from preceding <p>");
    }

    // ---- has_attribution_then_quote (private; driven via is_reply_boundary) ----

    #[test]
    fn boundary_div_with_attribution_then_blockquote() {
        let html = r#"<div>On Mon, Alice wrote:<blockquote>quoted</blockquote></div>"#;
        assert!(on_first(html, "div", is_reply_boundary));
    }

    #[test]
    fn boundary_div_blockquote_without_attribution_is_false() {
        // A div that merely wraps a blockquote, with no attribution text,
        // is not a boundary. Paired with the positive case above.
        let html = r#"<div><blockquote>quoted</blockquote></div>"#;
        assert!(!on_first(html, "div", is_reply_boundary));
    }

    #[test]
    fn boundary_div_attribution_no_blockquote_is_false() {
        // Attribution followed by something other than a blockquote: the
        // element-name comparison must really require "blockquote".
        let html = r#"<div>On Mon, Alice wrote:<p>not a quote</p></div>"#;
        assert!(!on_first(html, "div", is_reply_boundary));
    }

    #[test]
    fn boundary_div_attribution_br_blockquote() {
        // A <br> between attribution and blockquote has to be stepped over.
        let html = r#"<div>On Mon, Alice wrote:<br><blockquote>quoted</blockquote></div>"#;
        assert!(on_first(html, "div", is_reply_boundary));
    }

    #[test]
    fn boundary_div_non_br_element_before_attribution_is_false() {
        // A non-<br> element ahead of the attribution aborts the scan early.
        let html = r#"<div><p>preface</p>On Mon, Alice wrote:<blockquote>q</blockquote></div>"#;
        assert!(!on_first(html, "div", is_reply_boundary));
    }

    // ---- previous_sibling_text (private; driven via find_attribution) ----

    #[test]
    fn prev_sibling_text_node() {
        // The inner div is preceded by a bare text run inside the parent,
        // so its previous sibling is a text node.
        let html = r#"<div>On Mon, Alice wrote:<div class="q">body</div></div>"#;
        assert!(on_first(html, "div.q", find_attribution).is_some());
    }

    #[test]
    fn prev_sibling_inline_span_with_attribution() {
        let html = r#"<div><span>On Mon, Alice wrote:</span><div class="q">body</div></div>"#;
        assert!(on_first(html, "div.q", find_attribution).is_some());
    }

    #[test]
    fn prev_sibling_inline_font_with_attribution() {
        // <font> is another accepted wrapper; covers a second list entry.
        let html = r#"<div><font>On Mon, Alice wrote:</font><div class="q">body</div></div>"#;
        assert!(on_first(html, "div.q", find_attribution).is_some());
    }

    #[test]
    fn prev_sibling_non_inline_element_returns_none() {
        // <table> is not an accepted wrapper, so its text is never read.
        let html = r#"<div><table><tr><td>On Mon, Alice wrote:</td></tr></table><div class="q">body</div></div>"#;
        assert!(on_first(html, "div.q", find_attribution).is_none());
    }

    #[test]
    fn prev_sibling_empty_inline_returns_none() {
        // A whitespace-only span carries no attribution.
        let html = r#"<div><span> </span><div class="q">body</div></div>"#;
        assert!(on_first(html, "div.q", find_attribution).is_none());
    }

    // ---- is_outlook_separator: label combinations ----

    #[test]
    fn outlook_from_date_subject_is_separator() {
        // "Date:" is accepted as an alternative to "Sent:".
        let html = "<div>From: Alice\nDate: Monday\nSubject: Hello</div>";
        assert!(on_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn outlook_from_sent_no_subject_is_separator() {
        // "From:" plus "Sent:" suffices; "Subject:" is not also required.
        let html = "<div>From: Alice\nSent: Monday</div>";
        assert!(on_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn outlook_from_subject_no_sent_is_separator() {
        // "From:" plus "Subject:" suffices; "Sent:"/"Date:" is not required.
        let html = "<div>From: Alice\nSubject: Hello</div>";
        assert!(on_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn outlook_from_only_is_not_separator() {
        // "From:" alone, with no companion label, must be rejected.
        let html = "<div>From: Alice</div>";
        assert!(!on_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn outlook_sent_subject_no_from_is_not_separator() {
        // Without "From:" the other labels do not count.
        let html = "<div>Sent: Monday\nSubject: Hello</div>";
        assert!(!on_first(html, "div", is_outlook_separator));
    }
}
618