Skip to main content

max / goingson

Replace hand-rolled HTML stripping with pter

Use pter::convert() for HTML-to-markdown email body conversion, replacing ~230 lines of custom strip_html code and 3 helper functions.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-02 23:12 UTC
Commit: df3de31ee1640ebd7e28d2c0574e3296e9432e6e
Parent: df750bc
3 files changed, +164 insertions, -300 deletions
M Cargo.lock +50
@@ -1190,6 +1190,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
1190 1190 checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555"
1191 1191
1192 1192 [[package]]
1193 + name = "ego-tree"
1194 + version = "0.11.0"
1195 + source = "registry+https://github.com/rust-lang/crates.io-index"
1196 + checksum = "b04dc5a38e4f151a79d9f2451ae6037fb6eaf5cba34771f44781f80e508498e3"
1197 +
1198 + [[package]]
1193 1199 name = "either"
1194 1200 version = "1.15.0"
1195 1201 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1927,6 +1933,7 @@ dependencies = [
1927 1933 "open",
1928 1934 "openssl",
1929 1935 "parking_lot",
1936 + "pter",
1930 1937 "rand 0.8.5",
1931 1938 "reqwest 0.12.28",
1932 1939 "serde",
@@ -2161,6 +2168,16 @@ dependencies = [
2161 2168 ]
2162 2169
2163 2170 [[package]]
2171 + name = "html5ever"
2172 + version = "0.39.0"
2173 + source = "registry+https://github.com/rust-lang/crates.io-index"
2174 + checksum = "46a1761807faccc9a19e86944bbf40610014066306f96edcdedc2fb714bcb7b8"
2175 + dependencies = [
2176 + "log",
2177 + "markup5ever 0.39.0",
2178 + ]
2179 +
2180 + [[package]]
2164 2181 name = "http"
2165 2182 version = "1.4.0"
2166 2183 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2950,6 +2967,17 @@ dependencies = [
2950 2967 ]
2951 2968
2952 2969 [[package]]
2970 + name = "markup5ever"
2971 + version = "0.39.0"
2972 + source = "registry+https://github.com/rust-lang/crates.io-index"
2973 + checksum = "7122d987ec5f704ee56f6e5b41a7d93722e9aae27ae07cafa4036c4d3f9757de"
2974 + dependencies = [
2975 + "log",
2976 + "tendril 0.5.0",
2977 + "web_atoms 0.2.3",
2978 + ]
2979 +
2980 + [[package]]
2953 2981 name = "match_token"
2954 2982 version = "0.1.0"
2955 2983 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -4119,6 +4147,13 @@ dependencies = [
4119 4147 ]
4120 4148
4121 4149 [[package]]
4150 + name = "pter"
4151 + version = "0.1.0"
4152 + dependencies = [
4153 + "scraper",
4154 + ]
4155 +
4156 + [[package]]
4122 4157 name = "pulldown-cmark"
4123 4158 version = "0.12.2"
4124 4159 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -4738,6 +4773,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
4738 4773 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
4739 4774
4740 4775 [[package]]
4776 + name = "scraper"
4777 + version = "0.26.0"
4778 + source = "registry+https://github.com/rust-lang/crates.io-index"
4779 + checksum = "f0f5297102b8b62b4454ee8561601b2d551b4913148feb4241ca9d1a04bf4526"
4780 + dependencies = [
4781 + "cssparser 0.36.0",
4782 + "ego-tree",
4783 + "getopts",
4784 + "html5ever 0.39.0",
4785 + "precomputed-hash",
4786 + "selectors 0.36.1",
4787 + "tendril 0.5.0",
4788 + ]
4789 +
4790 + [[package]]
4741 4791 name = "security-framework"
4742 4792 version = "3.7.0"
4743 4793 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -82,6 +82,9 @@ docengine = { workspace = true }
82 82
83 83 # Secure credential storage
84 84 keyring = { workspace = true }
85 +
86 + # HTML email to readable markdown
87 + pter = { path = "../../../pter" }
85 88 parking_lot = "0.12.5"
86 89
87 90 # === Desktop-only dependencies (not available on iOS/Android) ===
@@ -11,6 +11,9 @@ use tokio_native_tls::TlsConnector;
11 11
12 12 type ImapSession = async_imap::Session<tokio_native_tls::TlsStream<TcpStream>>;
13 13
14 + /// Maximum email size to download (25 MB). Emails exceeding this are skipped during sync.
15 + const MAX_EMAIL_SIZE: u32 = 25 * 1024 * 1024;
16 +
14 17 /// Raw attachment data extracted during IMAP RFC822 parse.
15 18 #[derive(Debug, Clone)]
16 19 pub struct AttachmentPart {
@@ -23,6 +26,8 @@ pub struct AttachmentPart {
23 26 pub struct ParsedEmail {
24 27 pub message_id: Option<String>,
25 28 pub in_reply_to: Option<String>,
29 + /// First entry from the References header (thread root).
30 + pub references_root: Option<String>,
26 31 pub imap_uid: u32,
27 32 pub source_folder: String,
28 33 pub from: String,
@@ -187,9 +192,46 @@ impl ImapClient {
187 192 .collect::<Vec<_>>()
188 193 .join(",");
189 194
190 - // Fetch UID, FLAGS, and RFC822 body
195 + // Pre-filter oversized emails
196 + let mut size_stream = session
197 + .fetch(&sequence_set, "(UID RFC822.SIZE)")
198 + .await
199 + .map_err(|e| format!("Size fetch error: {}", e))?;
200 +
201 + let mut safe_seqs: Vec<u32> = Vec::new();
202 + let mut skipped_large = 0usize;
203 + // Build a set of UIDs that are safe to fetch
204 + while let Some(result) = size_stream.next().await {
205 + if let Ok(msg) = result {
206 + let over_limit = msg.size.map_or(false, |s| s > MAX_EMAIL_SIZE);
207 + if over_limit {
208 + skipped_large += 1;
209 + tracing::warn!(uid = ?msg.uid, size = ?msg.size, folder = %folder, "Skipping oversized email");
210 + continue;
211 + }
212 + // Record the UID of each message within the size limit. Despite its name,
213 + // `safe_seqs` holds UIDs (not sequence numbers); the bodies are re-fetched by UID below.
214 + if let Some(uid) = msg.uid {
215 + safe_seqs.push(uid);
216 + }
217 + }
218 + }
219 + drop(size_stream);
220 +
221 + if skipped_large > 0 {
222 + debug.push(format!("skipped_large: {}", skipped_large));
223 + }
224 +
225 + if safe_seqs.is_empty() {
226 + session.logout().await.ok();
227 + return Ok((Vec::new(), debug.join(", ")));
228 + }
229 +
230 + let safe_uid_set = safe_seqs.iter().map(|n| n.to_string()).collect::<Vec<_>>().join(",");
231 +
232 + // Fetch full bodies only for safe-sized emails
191 233 let mut messages = session
192 - .fetch(&sequence_set, "(UID FLAGS RFC822)")
234 + .uid_fetch(&safe_uid_set, "(UID FLAGS RFC822)")
193 235 .await
194 236 .map_err(|e| format!("Fetch error: {}", e))?;
195 237
@@ -237,6 +279,8 @@ impl ImapClient {
237 279 .find(|h| h.get_key().to_lowercase() == "in-reply-to")
238 280 .map(|h| h.get_value());
239 281
282 + let references_root = extract_references_root(&parsed.headers);
283 +
240 284 let from = parsed
241 285 .headers
242 286 .iter()
@@ -275,6 +319,7 @@ impl ImapClient {
275 319 emails.push(ParsedEmail {
276 320 message_id,
277 321 in_reply_to,
322 + references_root,
278 323 imap_uid: uid,
279 324 source_folder: folder_name.clone(),
280 325 from,
@@ -389,10 +434,49 @@ impl ImapClient {
389 434 });
390 435 }
391 436
437 + // Pre-filter oversized emails by fetching sizes first
392 438 let uid_set = uids.iter().map(|n| n.to_string()).collect::<Vec<_>>().join(",");
439 + let mut size_stream = session
440 + .uid_fetch(&uid_set, "(UID RFC822.SIZE)")
441 + .await
442 + .map_err(|e| format!("UID size fetch error: {}", e))?;
443 +
444 + let mut safe_uids: Vec<u32> = Vec::new();
445 + let mut skipped_large = 0usize;
446 + while let Some(result) = size_stream.next().await {
447 + if let Ok(msg) = result {
448 + if let Some(uid) = msg.uid {
449 + if let Some(size) = msg.size {
450 + if size > MAX_EMAIL_SIZE {
451 + skipped_large += 1;
452 + tracing::warn!(uid, size, folder = %folder, "Skipping oversized email ({} bytes)", size);
453 + continue;
454 + }
455 + }
456 + safe_uids.push(uid);
457 + }
458 + }
459 + }
460 + drop(size_stream);
461 +
462 + if skipped_large > 0 {
463 + debug.push(format!("skipped_large: {}", skipped_large));
464 + }
465 +
466 + if safe_uids.is_empty() {
467 + session.logout().await.ok();
468 + return Ok(FolderFetchResult {
469 + emails: Vec::new(),
470 + uid_validity: server_uid_validity,
471 + max_uid_fetched: uids.iter().copied().max(),
472 + debug_info: debug.join(", "),
473 + });
474 + }
475 +
476 + let safe_uid_set = safe_uids.iter().map(|n| n.to_string()).collect::<Vec<_>>().join(",");
393 477
394 478 let mut messages = session
395 - .uid_fetch(&uid_set, "(UID FLAGS RFC822)")
479 + .uid_fetch(&safe_uid_set, "(UID FLAGS RFC822)")
396 480 .await
397 481 .map_err(|e| format!("UID fetch error: {}", e))?;
398 482
@@ -401,7 +485,8 @@ impl ImapClient {
401 485 let mut msg_count = 0;
402 486 let mut body_count = 0;
403 487 let mut parse_errors = 0;
404 - let mut max_uid: Option<u32> = None;
488 + // Seed max_uid from all UIDs (including skipped large ones) so they aren't re-fetched
489 + let mut max_uid: Option<u32> = uids.iter().copied().max();
405 490
406 491 while let Some(result) = messages.next().await {
407 492 msg_count += 1;
@@ -434,6 +519,7 @@ impl ImapClient {
434 519 let in_reply_to = parsed.headers.iter()
435 520 .find(|h| h.get_key().to_lowercase() == "in-reply-to")
436 521 .map(|h| h.get_value());
522 + let references_root = extract_references_root(&parsed.headers);
437 523 let from = parsed.headers.iter()
438 524 .find(|h| h.get_key().to_lowercase() == "from")
439 525 .map(|h| h.get_value()).unwrap_or_default();
@@ -457,6 +543,7 @@ impl ImapClient {
457 543 emails.push(ParsedEmail {
458 544 message_id,
459 545 in_reply_to,
546 + references_root,
460 547 imap_uid: uid,
461 548 source_folder: folder_name.clone(),
462 549 from,
@@ -609,22 +696,22 @@ impl ImapClient {
609 696
610 697 Self::collect_body_parts(mail, &mut plain_text, &mut html_body);
611 698
612 - // Build final result - prefer plain text, fall back to stripped HTML
699 + // Build final result - prefer plain text, fall back to pter markdown conversion
613 700 let body_text = if let Some(ref plain) = plain_text {
614 701 // If we have plain text but it looks like it contains HTML tags,
615 - // we should strip them (some emails have incorrect content-types)
702 + // we should convert them (some emails have incorrect content-types)
616 703 if plain.contains("<html") || plain.contains("<body") || plain.contains("<div") {
617 - Self::strip_html(plain)
704 + pter::convert(plain)
618 705 } else {
619 706 plain.clone()
620 707 }
621 708 } else if let Some(ref html) = html_body {
622 - Self::strip_html(html)
709 + pter::convert(html)
623 710 } else {
624 711 // Fallback to whatever body is available
625 712 let body = mail.get_body().unwrap_or_default();
626 713 if body.contains("<html") || body.contains("<body") || body.contains("<div") {
627 - Self::strip_html(&body)
714 + pter::convert(&body)
628 715 } else {
629 716 body
630 717 }
@@ -660,257 +747,26 @@ impl ImapClient {
660 747 }
661 748 }
662 749
663 - /// Convert HTML email to clean, readable plain text.
664 - ///
665 - /// This function:
666 - /// - Removes script, style, and head content entirely
667 - /// - Converts links to `text [url]` format for readability
668 - /// - Converts lists to bullet points
669 - /// - Adds proper line breaks for block elements
670 - /// - Decodes HTML entities
671 - /// - Cleans up excessive whitespace
672 - fn strip_html(html: &str) -> String {
673 - let mut text = html.to_string();
674 -
675 - // Remove content we never want to display
676 - // Script tags and their content
677 - while let Some(start) = text.to_lowercase().find("<script") {
678 - if let Some(end) = text.to_lowercase()[start..].find("</script>") {
679 - text = format!("{}{}", &text[..start], &text[start + end + 9..]);
680 - } else {
681 - // Unclosed script tag — remove everything from here to end
682 - text.truncate(start);
683 - break;
684 - }
685 - }
686 -
687 - // Style tags and their content
688 - while let Some(start) = text.to_lowercase().find("<style") {
689 - if let Some(end) = text.to_lowercase()[start..].find("</style>") {
690 - text = format!("{}{}", &text[..start], &text[start + end + 8..]);
691 - } else {
692 - text.truncate(start);
693 - break;
694 - }
695 - }
696 -
697 - // Head section
698 - if let Some(start) = text.to_lowercase().find("<head") {
699 - if let Some(end) = text.to_lowercase()[start..].find("</head>") {
700 - text = format!("{}{}", &text[..start], &text[start + end + 7..]);
701 - }
702 - }
703 -
704 - // Extract links before stripping tags: <a href="url">text</a> -> text [url]
705 - let mut result = String::new();
706 - let mut remaining = text.as_str();
707 -
708 - while let Some(a_start) = remaining.to_lowercase().find("<a ") {
709 - // Add text before the <a> tag
710 - result.push_str(&remaining[..a_start]);
711 -
712 - let after_a = &remaining[a_start..];
713 -
714 - // Find href attribute
715 - let href = Self::extract_href(after_a);
716 -
717 - // Find the closing >
718 - if let Some(tag_end) = after_a.find('>') {
719 - let after_tag = &after_a[tag_end + 1..];
720 -
721 - // Find </a>
722 - if let Some(close) = after_tag.to_lowercase().find("</a>") {
723 - let link_text = &after_tag[..close];
724 - let link_text_clean = Self::strip_tags_simple(link_text);
725 -
726 - // Format as "text [url]" if we have both, otherwise just text
727 - if let Some(url) = href {
728 - let url_trimmed = url.trim();
729 - // Only add URL if it's meaningful and different from text
730 - if !url_trimmed.is_empty()
731 - && !url_trimmed.starts_with('#')
732 - && !url_trimmed.starts_with("javascript:")
733 - && url_trimmed != link_text_clean.trim()
734 - {
735 - result.push_str(&format!("{} [{}]", link_text_clean.trim(), url_trimmed));
736 - } else {
737 - result.push_str(link_text_clean.trim());
738 - }
739 - } else {
740 - result.push_str(link_text_clean.trim());
741 - }
750 + // strip_html, extract_href, strip_tags_simple, decode_html_entities
751 + // removed — replaced by pter::convert().
742 752
743 - remaining = &after_tag[close + 4..];
744 - } else {
745 - remaining = after_tag;
746 - }
747 - } else {
748 - remaining = &after_a[3..];
749 - }
750 - }
751 - result.push_str(remaining);
752 - text = result;
753 -
754 - // Convert block elements to line breaks
755 - let block_tags = [
756 - "</p>", "</div>", "</tr>", "</li>", "</h1>", "</h2>", "</h3>",
757 - "</h4>", "</h5>", "</h6>", "</blockquote>", "</pre>",
758 - ];
759 - for tag in block_tags {
760 - text = text.replace(tag, &format!("{}\n", tag));
761 - let upper = tag.to_uppercase();
762 - text = text.replace(&upper, &format!("{}\n", upper));
763 - }
764 -
765 - // Convert <br> to newlines
766 - for br in ["<br>", "<br/>", "<br />", "<BR>", "<BR/>", "<BR />"] {
767 - text = text.replace(br, "\n");
768 - }
769 -
770 - // Convert list items to bullet points
771 - for li in ["<li>", "<LI>"] {
772 - text = text.replace(li, "\n• ");
773 - }
774 -
775 - // Convert horizontal rules to separator
776 - for hr in ["<hr>", "<hr/>", "<hr />", "<HR>", "<HR/>", "<HR />"] {
777 - text = text.replace(hr, "\n---\n");
778 - }
779 -
780 - // Strip remaining HTML tags
781 - text = Self::strip_tags_simple(&text);
782 -
783 - // Decode HTML entities
784 - text = Self::decode_html_entities(&text);
785 -
786 - // Clean up whitespace
787 - // Collapse multiple spaces to single space
788 - let mut prev_space = false;
789 - let mut cleaned = String::new();
790 - for c in text.chars() {
791 - if c == ' ' || c == '\t' {
792 - if !prev_space {
793 - cleaned.push(' ');
794 - prev_space = true;
795 - }
796 - } else {
797 - cleaned.push(c);
798 - prev_space = false;
799 - }
800 - }
801 -
802 - // Trim lines and collapse multiple blank lines
803 - let lines: Vec<&str> = cleaned.lines().map(|l| l.trim()).collect();
804 - let mut final_lines: Vec<&str> = Vec::new();
805 - let mut prev_blank = false;
806 -
807 - for line in lines {
808 - if line.is_empty() {
809 - if !prev_blank && !final_lines.is_empty() {
810 - final_lines.push("");
811 - prev_blank = true;
812 - }
813 - } else {
814 - final_lines.push(line);
815 - prev_blank = false;
816 - }
817 - }
818 -
819 - // Remove leading/trailing blank lines
820 - while final_lines.first() == Some(&"") {
821 - final_lines.remove(0);
822 - }
823 - while final_lines.last() == Some(&"") {
824 - final_lines.pop();
825 - }
826 -
827 - final_lines.join("\n")
828 - }
829 -
830 - /// Extract href attribute value from an <a> tag
831 - fn extract_href(tag: &str) -> Option<String> {
832 - // Find href="..." or href='...'
833 - let lower = tag.to_lowercase();
834 - let href_pos = lower.find("href=")?;
835 - let after_href = &tag[href_pos + 5..];
836 -
837 - let quote_char = after_href.chars().next()?;
838 - if quote_char != '"' && quote_char != '\'' {
839 - return None;
840 - }
841 -
842 - let url_start = 1;
843 - let url_end = after_href[url_start..].find(quote_char)?;
844 - Some(after_href[url_start..url_start + url_end].to_string())
845 - }
846 -
847 - /// Simple tag stripping (removes all < > content)
848 - fn strip_tags_simple(html: &str) -> String {
849 - let mut result = String::new();
850 - let mut in_tag = false;
851 -
852 - for c in html.chars() {
853 - match c {
854 - '<' => in_tag = true,
855 - '>' => in_tag = false,
856 - _ if !in_tag => result.push(c),
857 - _ => {}
858 - }
859 - }
860 - result
861 - }
862 -
863 - /// Decode common HTML entities
864 - fn decode_html_entities(text: &str) -> String {
865 - text.replace("&nbsp;", " ")
866 - .replace("&ensp;", " ")
867 - .replace("&emsp;", " ")
868 - .replace("&thinsp;", " ")
869 - .replace("&amp;", "&")
870 - .replace("&lt;", "<")
871 - .replace("&gt;", ">")
872 - .replace("&quot;", "\"")
873 - .replace("&#34;", "\"")
874 - .replace("&#39;", "'")
875 - .replace("&apos;", "'")
876 - .replace("&#x27;", "'")
877 - .replace("&lsquo;", "'")
878 - .replace("&rsquo;", "'")
879 - .replace("&ldquo;", "\"")
880 - .replace("&rdquo;", "\"")
881 - .replace("&ndash;", "–")
882 - .replace("&mdash;", "—")
883 - .replace("&hellip;", "...")
884 - .replace("&bull;", "•")
885 - .replace("&middot;", "·")
886 - .replace("&copy;", "©")
887 - .replace("&reg;", "®")
888 - .replace("&trade;", "™")
889 - .replace("&euro;", "€")
890 - .replace("&pound;", "£")
891 - .replace("&yen;", "¥")
892 - .replace("&cent;", "¢")
893 - .replace("&deg;", "°")
894 - .replace("&plusmn;", "±")
895 - .replace("&times;", "×")
896 - .replace("&divide;", "÷")
897 - .replace("&frac12;", "½")
898 - .replace("&frac14;", "¼")
899 - .replace("&frac34;", "¾")
900 - // Numeric entities (common ones)
901 - .replace("&#160;", " ")
902 - .replace("&#8211;", "–")
903 - .replace("&#8212;", "—")
904 - .replace("&#8216;", "'")
905 - .replace("&#8217;", "'")
906 - .replace("&#8220;", "\"")
907 - .replace("&#8221;", "\"")
908 - .replace("&#8230;", "...")
909 - }
910 753 }
911 754
912 755 /// Recursively extract attachment parts from a MIME tree.
913 756 ///
757 + /// Extracts the first message-ID from the References header (the thread root).
758 + fn extract_references_root(headers: &[mailparse::MailHeader]) -> Option<String> {
759 + headers
760 + .iter()
761 + .find(|h| h.get_key().to_lowercase() == "references")
762 + .and_then(|h| {
763 + h.get_value()
764 + .split_whitespace()
765 + .find(|s| s.starts_with('<') && s.ends_with('>'))
766 + .map(|s| s.to_string())
767 + })
768 + }
769 +
914 770 /// Walks the MIME structure and collects non-text leaf parts as attachments.
915 771 /// Skips text/plain and text/html (those are body parts), and parts with empty bodies.
916 772 fn extract_attachment_parts(mail: &mailparse::ParsedMail) -> Vec<AttachmentPart> {
@@ -970,276 +826,16 @@ impl async_imap::Authenticator for XOAuth2Authenticator {
970 826 mod tests {
971 827 use super::ImapClient;
972 828
973 - // --- strip_tags_simple ---
974 -
975 - #[test]
976 - fn strip_tags_simple_basic() {
977 - assert_eq!(ImapClient::strip_tags_simple("<b>bold</b>"), "bold");
978 - }
979 -
980 - #[test]
981 - fn strip_tags_simple_nested() {
982 - assert_eq!(
983 - ImapClient::strip_tags_simple("<div><p>hello <b>world</b></p></div>"),
984 - "hello world"
985 - );
986 - }
987 -
988 - #[test]
989 - fn strip_tags_simple_empty() {
990 - assert_eq!(ImapClient::strip_tags_simple(""), "");
991 - }
992 -
993 - #[test]
994 - fn strip_tags_simple_no_tags() {
995 - assert_eq!(ImapClient::strip_tags_simple("plain text"), "plain text");
996 - }
997 -
998 - #[test]
999 - fn strip_tags_simple_self_closing() {
1000 - assert_eq!(ImapClient::strip_tags_simple("a<br/>b"), "ab");
1001 - }
1002 -
1003 - #[test]
1004 - fn strip_tags_simple_attributes() {
1005 - assert_eq!(
1006 - ImapClient::strip_tags_simple(r#"<a href="url">link</a>"#),
1007 - "link"
1008 - );
1009 - }
1010 -
1011 - // --- extract_href ---
1012 -
1013 - #[test]
1014 - fn extract_href_double_quotes() {
1015 - let tag = r#"<a href="https://example.com">text</a>"#;
1016 - assert_eq!(
1017 - ImapClient::extract_href(tag),
Lines truncated