Skip to main content

max / pter

Initial implementation of pter (plain-text-email-renderer) HTML email to readable markdown converter. Handles element conversion, table layout unwrapping, reply chain detection, and tracking pixel stripping. 116 tests, criterion benchmarks, MIT licensed. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-05-02 23:11 UTC
Commit: ef77e5f3ee2407db8d631c4200570f55c9342f54
17 files changed, +2890 insertions, -0 deletions
A .gitignore +1
@@ -0,0 +1 @@
1 + /target
A Cargo.lock +500
@@ -0,0 +1,1390 @@
1 + # This file is automatically @generated by Cargo.
2 + # It is not intended for manual editing.
3 + version = 4
4 +
5 + [[package]]
6 + name = "aho-corasick"
7 + version = "1.1.4"
8 + source = "registry+https://github.com/rust-lang/crates.io-index"
9 + checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301"
10 + dependencies = [
11 + "memchr",
12 + ]
13 +
14 + [[package]]
15 + name = "anes"
16 + version = "0.1.6"
17 + source = "registry+https://github.com/rust-lang/crates.io-index"
18 + checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
19 +
20 + [[package]]
21 + name = "anstyle"
22 + version = "1.0.14"
23 + source = "registry+https://github.com/rust-lang/crates.io-index"
24 + checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000"
25 +
26 + [[package]]
27 + name = "anyhow"
28 + version = "1.0.102"
29 + source = "registry+https://github.com/rust-lang/crates.io-index"
30 + checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c"
31 +
32 + [[package]]
33 + name = "autocfg"
34 + version = "1.5.0"
35 + source = "registry+https://github.com/rust-lang/crates.io-index"
36 + checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8"
37 +
38 + [[package]]
39 + name = "bit-set"
40 + version = "0.8.0"
41 + source = "registry+https://github.com/rust-lang/crates.io-index"
42 + checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3"
43 + dependencies = [
44 + "bit-vec",
45 + ]
46 +
47 + [[package]]
48 + name = "bit-vec"
49 + version = "0.8.0"
50 + source = "registry+https://github.com/rust-lang/crates.io-index"
51 + checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
52 +
53 + [[package]]
54 + name = "bitflags"
55 + version = "2.11.1"
56 + source = "registry+https://github.com/rust-lang/crates.io-index"
57 + checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
58 +
59 + [[package]]
60 + name = "bumpalo"
61 + version = "3.20.2"
62 + source = "registry+https://github.com/rust-lang/crates.io-index"
63 + checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb"
64 +
65 + [[package]]
66 + name = "cast"
67 + version = "0.3.0"
68 + source = "registry+https://github.com/rust-lang/crates.io-index"
69 + checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
70 +
71 + [[package]]
72 + name = "cfg-if"
73 + version = "1.0.4"
74 + source = "registry+https://github.com/rust-lang/crates.io-index"
75 + checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
76 +
77 + [[package]]
78 + name = "ciborium"
79 + version = "0.2.2"
80 + source = "registry+https://github.com/rust-lang/crates.io-index"
81 + checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
82 + dependencies = [
83 + "ciborium-io",
84 + "ciborium-ll",
85 + "serde",
86 + ]
87 +
88 + [[package]]
89 + name = "ciborium-io"
90 + version = "0.2.2"
91 + source = "registry+https://github.com/rust-lang/crates.io-index"
92 + checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
93 +
94 + [[package]]
95 + name = "ciborium-ll"
96 + version = "0.2.2"
97 + source = "registry+https://github.com/rust-lang/crates.io-index"
98 + checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
99 + dependencies = [
100 + "ciborium-io",
101 + "half",
102 + ]
103 +
104 + [[package]]
105 + name = "clap"
106 + version = "4.6.1"
107 + source = "registry+https://github.com/rust-lang/crates.io-index"
108 + checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51"
109 + dependencies = [
110 + "clap_builder",
111 + ]
112 +
113 + [[package]]
114 + name = "clap_builder"
115 + version = "4.6.0"
116 + source = "registry+https://github.com/rust-lang/crates.io-index"
117 + checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f"
118 + dependencies = [
119 + "anstyle",
120 + "clap_lex",
121 + ]
122 +
123 + [[package]]
124 + name = "clap_lex"
125 + version = "1.1.0"
126 + source = "registry+https://github.com/rust-lang/crates.io-index"
127 + checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9"
128 +
129 + [[package]]
130 + name = "criterion"
131 + version = "0.5.1"
132 + source = "registry+https://github.com/rust-lang/crates.io-index"
133 + checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
134 + dependencies = [
135 + "anes",
136 + "cast",
137 + "ciborium",
138 + "clap",
139 + "criterion-plot",
140 + "is-terminal",
141 + "itertools",
142 + "num-traits",
143 + "once_cell",
144 + "oorandom",
145 + "plotters",
146 + "rayon",
147 + "regex",
148 + "serde",
149 + "serde_derive",
150 + "serde_json",
151 + "tinytemplate",
152 + "walkdir",
153 + ]
154 +
155 + [[package]]
156 + name = "criterion-plot"
157 + version = "0.5.0"
158 + source = "registry+https://github.com/rust-lang/crates.io-index"
159 + checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
160 + dependencies = [
161 + "cast",
162 + "itertools",
163 + ]
164 +
165 + [[package]]
166 + name = "crossbeam-deque"
167 + version = "0.8.6"
168 + source = "registry+https://github.com/rust-lang/crates.io-index"
169 + checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
170 + dependencies = [
171 + "crossbeam-epoch",
172 + "crossbeam-utils",
173 + ]
174 +
175 + [[package]]
176 + name = "crossbeam-epoch"
177 + version = "0.9.18"
178 + source = "registry+https://github.com/rust-lang/crates.io-index"
179 + checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
180 + dependencies = [
181 + "crossbeam-utils",
182 + ]
183 +
184 + [[package]]
185 + name = "crossbeam-utils"
186 + version = "0.8.21"
187 + source = "registry+https://github.com/rust-lang/crates.io-index"
188 + checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
189 +
190 + [[package]]
191 + name = "crunchy"
192 + version = "0.2.4"
193 + source = "registry+https://github.com/rust-lang/crates.io-index"
194 + checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
195 +
196 + [[package]]
197 + name = "cssparser"
198 + version = "0.36.0"
199 + source = "registry+https://github.com/rust-lang/crates.io-index"
200 + checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2"
201 + dependencies = [
202 + "cssparser-macros",
203 + "dtoa-short",
204 + "itoa",
205 + "phf",
206 + "smallvec",
207 + ]
208 +
209 + [[package]]
210 + name = "cssparser-macros"
211 + version = "0.6.1"
212 + source = "registry+https://github.com/rust-lang/crates.io-index"
213 + checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
214 + dependencies = [
215 + "quote",
216 + "syn",
217 + ]
218 +
219 + [[package]]
220 + name = "derive_more"
221 + version = "2.1.1"
222 + source = "registry+https://github.com/rust-lang/crates.io-index"
223 + checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134"
224 + dependencies = [
225 + "derive_more-impl",
226 + ]
227 +
228 + [[package]]
229 + name = "derive_more-impl"
230 + version = "2.1.1"
231 + source = "registry+https://github.com/rust-lang/crates.io-index"
232 + checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb"
233 + dependencies = [
234 + "proc-macro2",
235 + "quote",
236 + "rustc_version",
237 + "syn",
238 + ]
239 +
240 + [[package]]
241 + name = "dtoa"
242 + version = "1.0.11"
243 + source = "registry+https://github.com/rust-lang/crates.io-index"
244 + checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590"
245 +
246 + [[package]]
247 + name = "dtoa-short"
248 + version = "0.3.5"
249 + source = "registry+https://github.com/rust-lang/crates.io-index"
250 + checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
251 + dependencies = [
252 + "dtoa",
253 + ]
254 +
255 + [[package]]
256 + name = "ego-tree"
257 + version = "0.11.0"
258 + source = "registry+https://github.com/rust-lang/crates.io-index"
259 + checksum = "b04dc5a38e4f151a79d9f2451ae6037fb6eaf5cba34771f44781f80e508498e3"
260 +
261 + [[package]]
262 + name = "either"
263 + version = "1.15.0"
264 + source = "registry+https://github.com/rust-lang/crates.io-index"
265 + checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
266 +
267 + [[package]]
268 + name = "equivalent"
269 + version = "1.0.2"
270 + source = "registry+https://github.com/rust-lang/crates.io-index"
271 + checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
272 +
273 + [[package]]
274 + name = "errno"
275 + version = "0.3.14"
276 + source = "registry+https://github.com/rust-lang/crates.io-index"
277 + checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
278 + dependencies = [
279 + "libc",
280 + "windows-sys",
281 + ]
282 +
283 + [[package]]
284 + name = "fastrand"
285 + version = "2.4.1"
286 + source = "registry+https://github.com/rust-lang/crates.io-index"
287 + checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6"
288 +
289 + [[package]]
290 + name = "fnv"
291 + version = "1.0.7"
292 + source = "registry+https://github.com/rust-lang/crates.io-index"
293 + checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
294 +
295 + [[package]]
296 + name = "foldhash"
297 + version = "0.1.5"
298 + source = "registry+https://github.com/rust-lang/crates.io-index"
299 + checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
300 +
301 + [[package]]
302 + name = "futures-core"
303 + version = "0.3.32"
304 + source = "registry+https://github.com/rust-lang/crates.io-index"
305 + checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
306 +
307 + [[package]]
308 + name = "futures-task"
309 + version = "0.3.32"
310 + source = "registry+https://github.com/rust-lang/crates.io-index"
311 + checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
312 +
313 + [[package]]
314 + name = "futures-util"
315 + version = "0.3.32"
316 + source = "registry+https://github.com/rust-lang/crates.io-index"
317 + checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
318 + dependencies = [
319 + "futures-core",
320 + "futures-task",
321 + "pin-project-lite",
322 + "slab",
323 + ]
324 +
325 + [[package]]
326 + name = "getopts"
327 + version = "0.2.24"
328 + source = "registry+https://github.com/rust-lang/crates.io-index"
329 + checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
330 + dependencies = [
331 + "unicode-width",
332 + ]
333 +
334 + [[package]]
335 + name = "getrandom"
336 + version = "0.3.4"
337 + source = "registry+https://github.com/rust-lang/crates.io-index"
338 + checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
339 + dependencies = [
340 + "cfg-if",
341 + "libc",
342 + "r-efi 5.3.0",
343 + "wasip2",
344 + ]
345 +
346 + [[package]]
347 + name = "getrandom"
348 + version = "0.4.2"
349 + source = "registry+https://github.com/rust-lang/crates.io-index"
350 + checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555"
351 + dependencies = [
352 + "cfg-if",
353 + "libc",
354 + "r-efi 6.0.0",
355 + "wasip2",
356 + "wasip3",
357 + ]
358 +
359 + [[package]]
360 + name = "half"
361 + version = "2.7.1"
362 + source = "registry+https://github.com/rust-lang/crates.io-index"
363 + checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b"
364 + dependencies = [
365 + "cfg-if",
366 + "crunchy",
367 + "zerocopy",
368 + ]
369 +
370 + [[package]]
371 + name = "hashbrown"
372 + version = "0.15.5"
373 + source = "registry+https://github.com/rust-lang/crates.io-index"
374 + checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
375 + dependencies = [
376 + "foldhash",
377 + ]
378 +
379 + [[package]]
380 + name = "hashbrown"
381 + version = "0.17.0"
382 + source = "registry+https://github.com/rust-lang/crates.io-index"
383 + checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51"
384 +
385 + [[package]]
386 + name = "heck"
387 + version = "0.5.0"
388 + source = "registry+https://github.com/rust-lang/crates.io-index"
389 + checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
390 +
391 + [[package]]
392 + name = "hermit-abi"
393 + version = "0.5.2"
394 + source = "registry+https://github.com/rust-lang/crates.io-index"
395 + checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
396 +
397 + [[package]]
398 + name = "html5ever"
399 + version = "0.39.0"
400 + source = "registry+https://github.com/rust-lang/crates.io-index"
401 + checksum = "46a1761807faccc9a19e86944bbf40610014066306f96edcdedc2fb714bcb7b8"
402 + dependencies = [
403 + "log",
404 + "markup5ever",
405 + ]
406 +
407 + [[package]]
408 + name = "id-arena"
409 + version = "2.3.0"
410 + source = "registry+https://github.com/rust-lang/crates.io-index"
411 + checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954"
412 +
413 + [[package]]
414 + name = "indexmap"
415 + version = "2.14.0"
416 + source = "registry+https://github.com/rust-lang/crates.io-index"
417 + checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
418 + dependencies = [
419 + "equivalent",
420 + "hashbrown 0.17.0",
421 + "serde",
422 + "serde_core",
423 + ]
424 +
425 + [[package]]
426 + name = "is-terminal"
427 + version = "0.4.17"
428 + source = "registry+https://github.com/rust-lang/crates.io-index"
429 + checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46"
430 + dependencies = [
431 + "hermit-abi",
432 + "libc",
433 + "windows-sys",
434 + ]
435 +
436 + [[package]]
437 + name = "itertools"
438 + version = "0.10.5"
439 + source = "registry+https://github.com/rust-lang/crates.io-index"
440 + checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
441 + dependencies = [
442 + "either",
443 + ]
444 +
445 + [[package]]
446 + name = "itoa"
447 + version = "1.0.18"
448 + source = "registry+https://github.com/rust-lang/crates.io-index"
449 + checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
450 +
451 + [[package]]
452 + name = "js-sys"
453 + version = "0.3.97"
454 + source = "registry+https://github.com/rust-lang/crates.io-index"
455 + checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf"
456 + dependencies = [
457 + "cfg-if",
458 + "futures-util",
459 + "once_cell",
460 + "wasm-bindgen",
461 + ]
462 +
463 + [[package]]
464 + name = "leb128fmt"
465 + version = "0.1.0"
466 + source = "registry+https://github.com/rust-lang/crates.io-index"
467 + checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2"
468 +
469 + [[package]]
470 + name = "libc"
471 + version = "0.2.186"
472 + source = "registry+https://github.com/rust-lang/crates.io-index"
473 + checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
474 +
475 + [[package]]
476 + name = "linux-raw-sys"
477 + version = "0.12.1"
478 + source = "registry+https://github.com/rust-lang/crates.io-index"
479 + checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
480 +
481 + [[package]]
482 + name = "lock_api"
483 + version = "0.4.14"
484 + source = "registry+https://github.com/rust-lang/crates.io-index"
485 + checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
486 + dependencies = [
487 + "scopeguard",
488 + ]
489 +
490 + [[package]]
491 + name = "log"
492 + version = "0.4.29"
493 + source = "registry+https://github.com/rust-lang/crates.io-index"
494 + checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
495 +
496 + [[package]]
497 + name = "markup5ever"
498 + version = "0.39.0"
499 + source = "registry+https://github.com/rust-lang/crates.io-index"
500 + checksum = "7122d987ec5f704ee56f6e5b41a7d93722e9aae27ae07cafa4036c4d3f9757de"
Lines truncated
A Cargo.toml +20
@@ -0,0 +1,20 @@
1 + [package]
2 + name = "pter"
3 + version = "0.1.0"
4 + edition = "2024"
5 + description = "Plain Text Email Renderer — convert HTML email bodies into readable markdown"
6 + license = "MIT"
7 + repository = "https://github.com/maxjacobson/pter"
8 + keywords = ["email", "html", "markdown", "plaintext", "converter"]
9 + categories = ["email", "text-processing", "parser-implementations"]
10 +
11 + [dependencies]
12 + scraper = "0.26"
13 +
14 + [dev-dependencies]
15 + proptest = "1"
16 + criterion = { version = "0.5", features = ["html_reports"] }
17 +
18 + [[bench]]
19 + name = "convert_bench"
20 + harness = false
A LICENSE +21
@@ -0,0 +1,21 @@
1 + MIT License
2 +
3 + Copyright (c) 2026 Max Jacobson
4 +
5 + Permission is hereby granted, free of charge, to any person obtaining a copy
6 + of this software and associated documentation files (the "Software"), to deal
7 + in the Software without restriction, including without limitation the rights
8 + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 + copies of the Software, and to permit persons to whom the Software is
10 + furnished to do so, subject to the following conditions:
11 +
12 + The above copyright notice and this permission notice shall be included in all
13 + copies or substantial portions of the Software.
14 +
15 + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 + SOFTWARE.
A README.md +33
@@ -0,0 +1,33 @@
1 + # pter
2 +
3 + **Plain Text Email Renderer** — convert HTML email bodies into readable markdown.
4 +
5 + Email HTML is a hostile environment: table-based layouts from 2004, tracking pixels,
6 + Outlook conditional comments, deeply nested reply chains with inconsistent quoting.
7 + pter converts all of it into clean, readable markdown.
8 +
9 + ## Usage
10 +
11 + ```rust
12 + let html = r#"<h1>Hello</h1><p>This is an <strong>email</strong>.</p>"#;
13 + let markdown = pter::convert(html);
14 + assert_eq!(markdown, "# Hello\n\nThis is an **email**.");
15 + ```
16 +
17 + ## What it does
18 +
19 + - Converts HTML elements to markdown equivalents (headings, links, lists, emphasis, code, images)
20 + - Unwraps table-based email layouts (single-cell tables become content, multi-column linearizes)
21 + - Detects and normalizes reply chains into `>` quoted markdown
22 + - Strips tracking pixels and invisible elements (dedicated Outlook conditional comment stripping is deferred; see `docs/todo.md`)
23 + - Produces output that is readable as plain text and renderable by any markdown toolchain
24 +
25 + ## What it does not do
26 +
27 + - Parse MIME email structure (use `mailparse` or `mail-parser` for that)
28 + - Extract article content from marketing templates (compose with a separate extractor)
29 + - Render markdown to a display format (use `pulldown-cmark`, `comrak`, etc.)
30 +
31 + ## License
32 +
33 + MIT
@@ -0,0 +1,97 @@
1 + use criterion::{Criterion, black_box, criterion_group, criterion_main};
2 +
3 + fn simple_email() -> &'static str { // fixture: short plain message (heading, paragraphs, bold, one link, <br>)
4 + r#"<html><body>
5 + <h1>Meeting Tomorrow</h1>
6 + <p>Hi Max,</p>
7 + <p>Just confirming our meeting tomorrow at <strong>2pm</strong>.
8 + Please review the <a href="https://example.com/doc">document</a> beforehand.</p>
9 + <p>Best,<br>Alice</p>
10 + </body></html>"#
11 + }
12 +
13 + fn newsletter_email() -> &'static str { // fixture: nested layout tables, a list, <hr>, and a 1x1 tracking <img>
14 + r#"<html><body>
15 + <table width="100%" cellpadding="0" cellspacing="0" role="presentation">
16 + <tr><td align="center">
17 + <table width="600" cellpadding="0" cellspacing="0">
18 + <tr><td>
19 + <h2>Weekly Digest</h2>
20 + <p>Here are your updates:</p>
21 + <ul>
22 + <li>New feature: <strong>Dark mode</strong> is now available</li>
23 + <li>Bug fix: Resolved <a href="https://example.com/issue/123">issue #123</a></li>
24 + <li>Update: API v2 documentation published</li>
25 + </ul>
26 + <p>Thanks for reading!</p>
27 + <hr>
28 + <p><small>Unsubscribe: <a href="https://example.com/unsub">click here</a></small></p>
29 + </td></tr>
30 + </table>
31 + </td></tr>
32 + </table>
33 + <img src="https://track.example.com/open.gif" width="1" height="1">
34 + </body></html>"#
35 + }
36 +
37 + fn reply_chain() -> &'static str { // fixture: two levels of nested gmail_quote blockquotes, each with an attribution line
38 + r#"<html><body>
39 + <div dir="ltr"><p>Got it, thanks!</p></div>
40 + <div class="gmail_quote">
41 + <div class="gmail_attr">On Tue, Jan 6, Bob wrote:</div>
42 + <blockquote class="gmail_quote">
43 + <div dir="ltr"><p>Here's the update you requested.</p></div>
44 + <div class="gmail_quote">
45 + <div class="gmail_attr">On Mon, Jan 5, Alice wrote:</div>
46 + <blockquote class="gmail_quote">
47 + <div dir="ltr"><p>What's the status on the deployment?</p></div>
48 + </blockquote>
49 + </div>
50 + </blockquote>
51 + </div>
52 + </body></html>"#
53 + }
54 +
55 + fn large_email() -> String { // synthetic document: 100 repeated <h3> + <p> sections
56 +     let body_paragraph = "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. \
57 +         Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
58 +         Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.</p>";
59 +     let mut doc = String::from("<html><body>");
60 +     (0..100).for_each(|section| {
61 +         doc.push_str(&format!("<h3>Section {}</h3>", section));
62 +         doc.push_str(body_paragraph);
63 +     });
64 +     doc.push_str("</body></html>");
65 +     doc
66 + }
67 +
68 + fn bench_simple(c: &mut Criterion) {
69 +     let input = simple_email(); // built once, outside the timed closure
70 +     c.bench_function("simple_email", |bencher| {
71 +         bencher.iter(|| pter::convert(black_box(input)))
72 +     });
73 + }
74 +
75 + fn bench_newsletter(c: &mut Criterion) {
76 +     let input = newsletter_email(); // built once, outside the timed closure
77 +     c.bench_function("newsletter_layout_tables", |bencher| {
78 +         bencher.iter(|| pter::convert(black_box(input)))
79 +     });
80 + }
81 +
82 + fn bench_reply_chain(c: &mut Criterion) {
83 +     let input = reply_chain(); // built once, outside the timed closure
84 +     c.bench_function("reply_chain_nested", |bencher| {
85 +         bencher.iter(|| pter::convert(black_box(input)))
86 +     });
87 + }
88 +
89 + fn bench_large(c: &mut Criterion) {
90 +     let input = large_email(); // owned String, generated once before timing starts
91 +     c.bench_function("large_100_sections", |bencher| {
92 +         bencher.iter(|| pter::convert(black_box(input.as_str())))
93 +     });
94 + }
95 +
96 + criterion_group!(benches, bench_simple, bench_newsletter, bench_reply_chain, bench_large); // all four benchmarks form one group
97 + criterion_main!(benches); // entry point for the bench binary; Cargo.toml sets `harness = false` for this target
@@ -0,0 +1,47 @@
1 + # pter Architecture
2 +
3 + ## Overview
4 +
5 + pter converts HTML email bodies into readable markdown. It takes an HTML string and returns a markdown string. It does not handle MIME parsing, content extraction, or markdown rendering.
6 +
7 + ## Pipeline
8 +
9 + ```
10 + html: &str
11 + → scraper::Html::parse_document() # html5ever DOM tree
12 + → walk_children(root) # depth-first traversal
13 + → handle_text() # whitespace collapsing, entity decoding
14 + → handle_element() # classify → skip / transparent / block / inline
15 + → handle_block() # paragraphs, headings, lists, blockquotes, pre, hr
16 + → handle_inline() # bold, italic, links, images, code, br
17 + → whitespace::normalize() # collapse blank lines, trim
18 + → String
19 + ```
20 +
21 + ## Module Responsibilities
22 +
23 + | Module | Responsibility |
24 + |--------|---------------|
25 + | `lib.rs` | Public API (`convert`), re-exports |
26 + | `convert.rs` | DOM walker, `Context` state, element dispatch |
27 + | `elements.rs` | Element classification, tracking pixel / hidden detection |
28 + | `whitespace.rs` | Output normalization |
29 + | `tables.rs` | Table layout detection and unwrapping (Phase 2) |
30 + | `replies.rs` | Reply chain detection and quoting (Phase 3) |
31 +
32 + ## Design Decisions
33 +
34 + **scraper over html5ever directly**: We need tree traversal (parent/child/sibling access) for layout table unwrapping and reply chain detection. scraper provides this via ego-tree on top of html5ever's spec-compliant parsing.
35 +
36 + **Markdown output**: Markdown is readable as plain text and renderable by any toolchain. It preserves structural information (headings, links, lists) that plain text loses.
37 +
38 + **Faithful conversion**: pter converts what's there. Content extraction (stripping marketing wrappers) and post-processing (trimming signatures) are separate concerns, composable before or after pter.
39 +
40 + **Blockquote rendering**: Blockquotes render children into a temporary buffer, then prefix each line with `> `. This handles nested blockquotes naturally — inner quotes produce `> ` lines, outer quote prefixes them again to get `> > `.
41 +
42 + ## Dependencies
43 +
44 + | Crate | Purpose |
45 + |-------|---------|
46 + | `scraper` | HTML parsing + DOM tree + CSS selectors |
47 + | `proptest` (dev) | Property-based testing |
A docs/todo.md +95
@@ -0,0 +1,95 @@
1 + # pter - Todo
2 +
3 + Done: Phases 1-5 (except publish). Active: None. Next: cargo publish when ready.
4 +
5 + v0.1.0. 116 tests.
6 +
7 + ---
8 +
9 + ## Phase 1: Core Conversion
10 +
11 + ### Done
12 + - [x] Crate scaffold (Cargo.toml, MIT license, README)
13 + - [x] HTML element to markdown conversion (p, h1-h6, strong, em, a, img, ul/ol/li, blockquote, pre/code, hr, br, del, sup, sub)
14 + - [x] Tracking pixel detection (1x1 img, empty src, data URI, inline style)
15 + - [x] Hidden element skipping (display:none, visibility:hidden)
16 + - [x] Whitespace normalization (collapse blank lines, trim)
17 + - [x] Script/style/head stripping
18 + - [x] Entity decoding (via html5ever)
19 + - [x] Link deduplication (text matches URL)
20 + - [x] Nested list indentation
21 + - [x] Nested blockquote rendering
22 + - [x] Pre/code block rendering (no double-wrap)
23 +
24 + ---
25 +
26 + ## Phase 2: Email Layout Unwrapping
27 +
28 + ### Done
29 + - [x] Layout table detection heuristic (layout vs data table)
30 + - [x] Single-cell table unwrapping
31 + - [x] Multi-column table linearization
32 + - [x] Data table rendering as markdown table
33 + - [x] Nested layout table recursion
34 + - [x] font-size:0 / line-height:0 / height:0+overflow:hidden spacer detection
35 + - [x] role="presentation" detection
36 +
37 + ### Deferred
38 + - [ ] Outlook conditional comment stripping (client-specific, low cross-platform value)
39 +
40 + ---
41 +
42 + ## Phase 3: Reply Chain Detection
43 +
44 + ### Done
45 + - [x] Reply boundary abstraction (`is_reply_boundary` predicate)
46 + - [x] Structural markers (type=cite)
47 + - [x] CSS class markers (gmail_quote, divRplyFwdMsg, yahoo_quoted, protonmail_quote, tutanota_quote, moz-cite-prefix, zmail_extra)
48 + - [x] Attribution text detection (On ... wrote:, Forwarded message, Original Message, Begin forwarded message, French/German variants)
49 + - [x] Attribution line preservation above quote blocks
50 + - [x] Quote depth rendering via temporary buffer + `>` prefix
51 + - [x] Outlook separator detection (From/Sent/To/Subject blocks)
52 + - [x] Heuristic: div with attribution text followed by blockquote
53 + - [x] Previous sibling text scanning for attribution
54 +
55 + ---
56 +
57 + ## Phase 4: Integration
58 +
59 + ### Done
60 + - [x] GoingsOn: pter::convert() replaces strip_html in imap_client.rs extract_body_with_html()
61 + - [x] GoingsOn: removed ~230 lines of hand-rolled HTML stripping code + 30 tests (covered by pter)
62 + - [x] GoingsOn: path dep added to src-tauri/Cargo.toml
63 + - [x] Balanced Breakfast: pter::convert() replaces html2text in html_to_text + extract_article Rhai host functions
64 + - [x] Balanced Breakfast: html2text dependency removed from bb-core/Cargo.toml
65 + - [x] Both projects compile clean, BB tests pass (153 tests)
66 +
67 + ---
68 +
69 + ## Phase 5: Polish + Publish
70 +
71 + ### Done
72 + - [x] Property-based testing with proptest (7 fuzz strategies: never panics, no HTML leak, valid UTF-8, no triple newlines, no trailing whitespace, arbitrary bytes, whitespace-only)
73 + - [x] Edge case hardening (24 tests: empty, whitespace-only, deeply nested divs/blockquotes/lists, malformed HTML, unicode, large input, empty table cells, nested link formatting)
74 + - [x] Benchmarks with criterion (simple: 4µs, newsletter: 15µs, reply chain: 10µs, 100 sections: 101µs)
75 +
76 + ### Remaining
77 + - [ ] cargo publish to crates.io
78 + - [ ] Update GO and BB to crates.io version
79 +
80 + ---
81 +
82 + ## Key Paths
83 +
84 + | What | Where |
85 + |------|-------|
86 + | Public API | `src/lib.rs` |
87 + | Conversion pipeline | `src/convert.rs` |
88 + | Element classification | `src/elements.rs` |
89 + | Table handling | `src/tables.rs` |
90 + | Reply detection | `src/replies.rs` |
91 + | Whitespace normalization | `src/whitespace.rs` |
92 + | Integration tests | `tests/integration.rs` |
93 + | Edge case tests | `tests/edge_cases.rs` |
94 + | Property-based tests | `tests/proptest.rs` |
95 + | Benchmarks | `benches/convert_bench.rs` |
@@ -0,0 +1,689 @@
1 + use scraper::node::Node;
2 + use scraper::{ElementRef, Html};
3 +
4 + use crate::elements::{self, BlockKind, ElementAction, InlineKind};
5 + use crate::replies;
6 + use crate::tables;
7 + use crate::whitespace;
8 +
9 + /// Convert an HTML email body into readable markdown.
10 + ///
11 + /// This is the main entry point for pter. Pass in an HTML string
12 + /// (just the body, not MIME structure) and get back clean markdown.
13 + ///
14 + /// ```
15 + /// let md = pter::convert("<p>Hello <strong>world</strong></p>");
16 + /// assert_eq!(md, "Hello **world**");
17 + /// ```
18 + pub fn convert(html: &str) -> String {
19 + if html.is_empty() {
20 + return String::new();
21 + }
22 +
23 + let document = Html::parse_document(html);
24 + let mut ctx = Context::new();
25 + walk_children(document.root_element(), &mut ctx);
26 + whitespace::normalize(&ctx.output)
27 + }
28 +
/// Conversion state threaded through the tree walk.
struct Context {
    /// Markdown accumulated so far.
    output: String,
    /// Current list nesting depth (for indentation).
    list_depth: u32,
    /// Whether we're inside a <pre> block (preserve whitespace).
    in_pre: bool,
    /// Whether we're inside an <a> tag (don't nest links).
    in_link: bool,
    /// Stack of list types for proper ordered/unordered rendering.
    list_stack: Vec<ListType>,
}

/// Kind of list an item belongs to.
#[derive(Clone, Copy)]
enum ListType {
    Unordered,
    /// Ordered list; the payload is the number of the most recently emitted item.
    Ordered(u32),
}

impl Context {
    /// Create an empty context with a pre-sized output buffer.
    fn new() -> Self {
        Self {
            output: String::with_capacity(4096),
            list_depth: 0,
            in_pre: false,
            in_link: false,
            list_stack: Vec::new(),
        }
    }

    /// Append a string slice to the output.
    fn push(&mut self, s: &str) {
        self.output.push_str(s);
    }

    /// Append a single character to the output.
    fn push_char(&mut self, c: char) {
        self.output.push(c);
    }

    /// Make sure the output ends with exactly one blank line.
    ///
    /// Trailing spaces are ignored when inspecting the tail. Nothing is
    /// written when the output is empty (or only spaces) or already ends in
    /// a blank line. Otherwise the trailing spaces are dropped and only as
    /// many newlines as are missing get appended, so a single trailing
    /// newline is completed to `"\n\n"` rather than grown to `"\n\n\n"`.
    fn ensure_blank_line(&mut self) {
        let body = self.output.trim_end_matches(' ');
        if body.is_empty() || body.ends_with("\n\n") {
            return;
        }
        let keep = body.len();
        let missing = if body.ends_with('\n') { "\n" } else { "\n\n" };
        self.output.truncate(keep);
        self.output.push_str(missing);
    }

    /// Terminate the current line unless the output is empty or already
    /// ends with a newline.
    fn ensure_newline(&mut self) {
        if !self.output.is_empty() && !self.output.ends_with('\n') {
            self.output.push('\n');
        }
    }

    /// Indentation prefix for a list item at the current nesting depth.
    ///
    /// Top-level items (depth 0 or 1) get no indentation; each further level
    /// adds one repetition of the indent unit.
    fn list_indent(&self) -> String {
        // NOTE(review): the indent unit is a single space as shown in this
        // view of the source — confirm against the repository, since nested
        // markdown lists normally need at least two spaces per level.
        " ".repeat(self.list_depth.saturating_sub(1) as usize)
    }
}
92 +
93 + /// Walk all children of a node, converting each to markdown.
94 + fn walk_children(parent: ElementRef, ctx: &mut Context) {
95 + for child in parent.children() {
96 + match child.value() {
97 + Node::Text(text) => {
98 + handle_text(&text.text, ctx);
99 + }
100 + Node::Element(_) => {
101 + if let Some(el_ref) = ElementRef::wrap(child) {
102 + handle_element(el_ref, ctx);
103 + }
104 + }
105 + _ => {}
106 + }
107 + }
108 + }
109 +
110 + /// Handle a text node.
111 + fn handle_text(text: &str, ctx: &mut Context) {
112 + if ctx.in_pre {
113 + ctx.push(text);
114 + return;
115 + }
116 +
117 + // Collapse whitespace in normal flow
118 + let mut last_was_space = ctx.output.ends_with(' ') || ctx.output.ends_with('\n');
119 + for ch in text.chars() {
120 + if ch.is_ascii_whitespace() {
121 + if !last_was_space {
122 + ctx.push_char(' ');
123 + last_was_space = true;
124 + }
125 + } else {
126 + ctx.push_char(ch);
127 + last_was_space = false;
128 + }
129 + }
130 + }
131 +
132 + /// Handle an element node — classify it and render accordingly.
133 + fn handle_element(el: ElementRef, ctx: &mut Context) {
134 + let element = el.value();
135 +
136 + // Check hidden elements
137 + if elements::is_hidden(element) {
138 + return;
139 + }
140 +
141 + // Check for reply boundaries before normal classification.
142 + // Reply boundaries (gmail_quote, type=cite, etc.) get rendered
143 + // as blockquotes regardless of their actual element type.
144 + if replies::is_reply_boundary(el) {
145 + render_reply_block(el, ctx);
146 + return;
147 + }
148 +
149 + // Check for Outlook-style "From: ... Sent: ..." separator blocks.
150 + // These introduce quoted content that follows them.
151 + if replies::is_outlook_separator(el) {
152 + ctx.ensure_blank_line();
153 + // Render the separator header as attribution
154 + let text: String = el.text().collect();
155 + let trimmed = text.split_whitespace().collect::<Vec<_>>().join(" ");
156 + ctx.push(&trimmed);
157 + ctx.ensure_blank_line();
158 + return;
159 + }
160 +
161 + match elements::classify(element) {
162 + ElementAction::Skip => {}
163 + ElementAction::Transparent => walk_children(el, ctx),
164 + ElementAction::Block(kind) => handle_block(el, ctx, kind),
165 + ElementAction::Inline(kind) => handle_inline(el, ctx, kind),
166 + }
167 + }
168 +
169 + fn handle_block(el: ElementRef, ctx: &mut Context, kind: BlockKind) {
170 + match kind {
171 + BlockKind::Paragraph => {
172 + ctx.ensure_blank_line();
173 + walk_children(el, ctx);
174 + ctx.ensure_blank_line();
175 + }
176 +
177 + BlockKind::Heading(level) => {
178 + ctx.ensure_blank_line();
179 + let prefix = "#".repeat(level as usize);
180 + ctx.push(&prefix);
181 + ctx.push_char(' ');
182 + walk_children(el, ctx);
183 + ctx.ensure_blank_line();
184 + }
185 +
186 + BlockKind::Blockquote => {
187 + ctx.ensure_blank_line();
188 + // Render children into a temporary buffer, then prefix each line with >
189 + let mut inner_ctx = Context::new();
190 + inner_ctx.in_pre = ctx.in_pre;
191 + inner_ctx.in_link = ctx.in_link;
192 + walk_children(el, &mut inner_ctx);
193 + let inner = whitespace::normalize(&inner_ctx.output);
194 + for line in inner.lines() {
195 + ctx.push("> ");
196 + ctx.push(line);
197 + ctx.push_char('\n');
198 + }
199 + ctx.push_char('\n');
200 + }
201 +
202 + BlockKind::UnorderedList => {
203 + ctx.ensure_blank_line();
204 + ctx.list_depth += 1;
205 + ctx.list_stack.push(ListType::Unordered);
206 + walk_children(el, ctx);
207 + ctx.list_stack.pop();
208 + ctx.list_depth -= 1;
209 + ctx.ensure_blank_line();
210 + }
211 +
212 + BlockKind::OrderedList => {
213 + ctx.ensure_blank_line();
214 + ctx.list_depth += 1;
215 + ctx.list_stack.push(ListType::Ordered(0));
216 + walk_children(el, ctx);
217 + ctx.list_stack.pop();
218 + ctx.list_depth -= 1;
219 + ctx.ensure_blank_line();
220 + }
221 +
222 + BlockKind::ListItem => {
223 + ctx.ensure_newline();
224 + let indent = ctx.list_indent();
225 + ctx.push(&indent);
226 +
227 + // Determine bullet or number
228 + let marker = match ctx.list_stack.last_mut() {
229 + Some(ListType::Unordered) => "- ".to_string(),
230 + Some(ListType::Ordered(n)) => {
231 + *n += 1;
232 + format!("{}. ", *n)
233 + }
234 + None => "- ".to_string(),
235 + };
236 + ctx.push(&marker);
237 + walk_children(el, ctx);
238 + ctx.ensure_newline();
239 + }
240 +
241 + BlockKind::PreFormatted => {
242 + ctx.ensure_blank_line();
243 + ctx.push("```\n");
244 + ctx.in_pre = true;
245 + walk_children(el, ctx);
246 + ctx.in_pre = false;
247 + ctx.ensure_newline();
248 + ctx.push("```");
249 + ctx.ensure_blank_line();
250 + }
251 +
252 + BlockKind::HorizontalRule => {
253 + ctx.ensure_blank_line();
254 + ctx.push("---");
255 + ctx.ensure_blank_line();
256 + }
257 +
258 + BlockKind::Table => {
259 + ctx.ensure_blank_line();
260 + if tables::is_data_table(el) {
261 + let (headers, rows) = tables::extract_table_data(el);
262 + let md = tables::render_markdown_table(&headers, &rows);
263 + if !md.is_empty() {
264 + ctx.push(&md);
265 + }
266 + } else {
267 + // Layout table — unwrap and render cell contents directly
268 + render_layout_table(el, ctx);
269 + }
270 + ctx.ensure_blank_line();
271 + }
272 +
273 + BlockKind::Div => {
274 + // Divs act as block separators but don't add their own markup
275 + ctx.ensure_blank_line();
276 + walk_children(el, ctx);
277 + ctx.ensure_blank_line();
278 + }
279 + }
280 + }
281 +
282 + fn handle_inline(el: ElementRef, ctx: &mut Context, kind: InlineKind) {
283 + match kind {
284 + InlineKind::Bold => {
285 + ctx.push("**");
286 + walk_children(el, ctx);
287 + ctx.push("**");
288 + }
289 +
290 + InlineKind::Italic => {
291 + ctx.push("*");
292 + walk_children(el, ctx);
293 + ctx.push("*");
294 + }
295 +
296 + InlineKind::Strikethrough => {
297 + ctx.push("~~");
298 + walk_children(el, ctx);
299 + ctx.push("~~");
300 + }
301 +
302 + InlineKind::Code => {
303 + if ctx.in_pre {
304 + // Inside a <pre>, don't double-wrap
305 + walk_children(el, ctx);
306 + } else {
307 + ctx.push("`");
308 + walk_children(el, ctx);
309 + ctx.push("`");
310 + }
311 + }
312 +
313 + InlineKind::Link => {
314 + if ctx.in_link {
315 + // Don't nest links
316 + walk_children(el, ctx);
317 + return;
318 + }
319 +
320 + let href = el.value().attr("href").unwrap_or("");
321 +
322 + if href.is_empty() || href == "#" {
323 + walk_children(el, ctx);
324 + return;
325 + }
326 +
327 + // Collect the link text
328 + let mut text_ctx = Context::new();
329 + text_ctx.in_link = true;
330 + walk_children(el, &mut text_ctx);
331 + let text = text_ctx.output.trim().to_string();
332 +
333 + if text.is_empty() {
334 + // Link with no text — just show the URL
335 + ctx.push(href);
336 + } else if text == href {
337 + // Link text matches URL — no need for markdown link syntax
338 + ctx.push(href);
339 + } else {
340 + ctx.push("[");
341 + ctx.push(&text);
342 + ctx.push("](");
343 + ctx.push(href);
344 + ctx.push(")");
345 + }
346 + }
347 +
348 + InlineKind::Image => {
349 + let element = el.value();
350 + if elements::is_tracking_pixel(element) {
351 + return;
352 + }
353 +
354 + let alt = element.attr("alt").unwrap_or("");
355 + let src = element.attr("src").unwrap_or("");
356 +
357 + if src.is_empty() {
358 + return;
359 + }
360 +
361 + ctx.push("![");
362 + ctx.push(alt);
363 + ctx.push("](");
364 + ctx.push(src);
365 + ctx.push(")");
366 + }
367 +
368 + InlineKind::LineBreak => {
369 + ctx.push_char('\n');
370 + }
371 +
372 + InlineKind::Superscript => {
373 + ctx.push("^");
374 + walk_children(el, ctx);
375 + }
376 +
377 + InlineKind::Subscript => {
378 + ctx.push("~");
379 + walk_children(el, ctx);
380 + }
381 + }
382 + }
383 +
384 + /// Render a reply boundary as a quoted block.
385 + ///
386 + /// This is the same rendering logic as `<blockquote>` — children are
387 + /// rendered into a temporary buffer and each line gets `> ` prefixed.
388 + /// Attribution lines (e.g. "On ... wrote:") are rendered above the quote.
389 + fn render_reply_block(el: ElementRef, ctx: &mut Context) {
390 + ctx.ensure_blank_line();
391 +
392 + // Look for attribution text
393 + if let Some(attribution) = replies::find_attribution(el) {
394 + ctx.push(&attribution);
395 + ctx.push_char('\n');
396 + }
397 +
398 + // Render children into temp buffer, then prefix with >
399 + let mut inner_ctx = Context::new();
400 + inner_ctx.in_pre = ctx.in_pre;
401 + inner_ctx.in_link = ctx.in_link;
402 + walk_children(el, &mut inner_ctx);
403 + let inner = whitespace::normalize(&inner_ctx.output);
404 +
405 + if !inner.is_empty() {
406 + for line in inner.lines() {
407 + ctx.push("> ");
408 + ctx.push(line);
409 + ctx.push_char('\n');
410 + }
411 + ctx.push_char('\n');
412 + }
413 + }
414 +
415 + /// Unwrap a layout table by rendering cell contents sequentially.
416 + ///
417 + /// Walks through rows and cells, rendering each cell's content as if
418 + /// the table wrapper didn't exist. This handles the common email pattern
419 + /// of wrapping everything in `<table><tr><td>...</td></tr></table>`.
420 + fn render_layout_table(table: ElementRef, ctx: &mut Context) {
421 + for descendant in table.descendants() {
422 + if let Some(el_ref) = ElementRef::wrap(descendant) {
423 + let name = el_ref.value().name();
424 + if name == "td" || name == "th" {
425 + // Check if the cell itself is hidden
426 + if !elements::is_hidden(el_ref.value()) {
427 + walk_children(el_ref, ctx);
428 + ctx.ensure_blank_line();
429 + }
430 + }
431 + }
432 + }
433 + }
434 +
435 + #[cfg(test)]
436 + mod tests {
437 + use super::*;
438 +
439 + // -- Basic elements --
440 +
441 + #[test]
442 + fn empty_input() {
443 + assert_eq!(convert(""), "");
444 + }
445 +
446 + #[test]
447 + fn plain_text() {
448 + assert_eq!(convert("hello world"), "hello world");
449 + }
450 +
451 + #[test]
452 + fn paragraph() {
453 + assert_eq!(convert("<p>one</p><p>two</p>"), "one\n\ntwo");
454 + }
455 +
456 + #[test]
457 + fn headings() {
458 + assert_eq!(convert("<h1>Title</h1>"), "# Title");
459 + assert_eq!(convert("<h3>Sub</h3>"), "### Sub");
460 + }
461 +
462 + #[test]
463 + fn bold_and_italic() {
464 + assert_eq!(
465 + convert("<p><strong>bold</strong> and <em>italic</em></p>"),
466 + "**bold** and *italic*"
467 + );
468 + }
469 +
470 + #[test]
471 + fn link() {
472 + assert_eq!(
473 + convert(r#"<a href="https://example.com">click</a>"#),
474 + "[click](https://example.com)"
475 + );
476 + }
477 +
478 + #[test]
479 + fn link_text_matches_url() {
480 + assert_eq!(
481 + convert(r#"<a href="https://example.com">https://example.com</a>"#),
482 + "https://example.com"
483 + );
484 + }
485 +
486 + #[test]
487 + fn link_empty_href() {
488 + assert_eq!(convert(r#"<a href="">click</a>"#), "click");
489 + }
490 +
491 + #[test]
492 + fn image() {
493 + assert_eq!(
494 + convert(r#"<img src="photo.jpg" alt="A photo">"#),
495 + "![A photo](photo.jpg)"
496 + );
497 + }
498 +
499 + #[test]
500 + fn tracking_pixel_skipped() {
Lines truncated
@@ -0,0 +1,156 @@
1 + use scraper::node::Element;
2 +
/// What the converter should do with an element.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ElementAction {
    /// Skip this element and all its children entirely.
    Skip,
    /// Render children only, no wrapper (transparent element).
    Transparent,
    /// Block element with specific rendering.
    Block(BlockKind),
    /// Inline element with specific rendering.
    Inline(InlineKind),
}

/// Block-level constructs the converter knows how to render.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    Paragraph,
    /// Heading with its level.
    Heading(u8),
    Blockquote,
    UnorderedList,
    OrderedList,
    ListItem,
    PreFormatted,
    HorizontalRule,
    Table,
    /// Generic container that separates blocks but adds no markup.
    Div,
}

/// Inline constructs the converter knows how to render.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InlineKind {
    Bold,
    Italic,
    Strikethrough,
    Code,
    Link,
    Image,
    LineBreak,
    Superscript,
    Subscript,
}
39 +
40 + /// Classify an HTML element into the action pter should take.
41 + pub fn classify(el: &Element) -> ElementAction {
42 + match el.name() {
43 + // Skip entirely
44 + "script" | "style" | "head" | "meta" | "link" | "title" | "noscript" => {
45 + ElementAction::Skip
46 + }
47 +
48 + // Block elements
49 + "p" => ElementAction::Block(BlockKind::Paragraph),
50 + "h1" => ElementAction::Block(BlockKind::Heading(1)),
51 + "h2" => ElementAction::Block(BlockKind::Heading(2)),
52 + "h3" => ElementAction::Block(BlockKind::Heading(3)),
53 + "h4" => ElementAction::Block(BlockKind::Heading(4)),
54 + "h5" => ElementAction::Block(BlockKind::Heading(5)),
55 + "h6" => ElementAction::Block(BlockKind::Heading(6)),
56 + "blockquote" => ElementAction::Block(BlockKind::Blockquote),
57 + "ul" | "menu" => ElementAction::Block(BlockKind::UnorderedList),
58 + "ol" => ElementAction::Block(BlockKind::OrderedList),
59 + "li" => ElementAction::Block(BlockKind::ListItem),
60 + "pre" => ElementAction::Block(BlockKind::PreFormatted),
61 + "hr" => ElementAction::Block(BlockKind::HorizontalRule),
62 + "table" => ElementAction::Block(BlockKind::Table),
63 + // Table sub-elements are handled by the Table block handler, not individually
64 + "thead" | "tbody" | "tfoot" | "tr" | "td" | "th" | "caption" | "colgroup" | "col" => {
65 + ElementAction::Transparent
66 + }
67 + "div" | "section" | "article" | "main" | "header" | "footer" | "nav" | "aside"
68 + | "figure" | "figcaption" | "details" | "summary" => {
69 + ElementAction::Block(BlockKind::Div)
70 + }
71 +
72 + // Inline elements
73 + "strong" | "b" => ElementAction::Inline(InlineKind::Bold),
74 + "em" | "i" => ElementAction::Inline(InlineKind::Italic),
75 + "del" | "s" | "strike" => ElementAction::Inline(InlineKind::Strikethrough),
76 + "code" | "tt" => ElementAction::Inline(InlineKind::Code),
77 + "a" => ElementAction::Inline(InlineKind::Link),
78 + "img" => ElementAction::Inline(InlineKind::Image),
79 + "br" => ElementAction::Inline(InlineKind::LineBreak),
80 + "sup" => ElementAction::Inline(InlineKind::Superscript),
81 + "sub" => ElementAction::Inline(InlineKind::Subscript),
82 +
83 + // Everything else: transparent (render children)
84 + _ => ElementAction::Transparent,
85 + }
86 + }
87 +
88 + /// Check if an <img> element is a tracking pixel.
89 + /// Returns true if it should be skipped.
90 + pub fn is_tracking_pixel(el: &Element) -> bool {
91 + let width = el.attr("width");
92 + let height = el.attr("height");
93 +
94 + // 1x1 or 0x0 images
95 + if matches!(width, Some("1" | "0")) || matches!(height, Some("1" | "0")) {
96 + return true;
97 + }
98 +
99 + // No src attribute
100 + let Some(src) = el.attr("src") else {
101 + return true;
102 + };
103 +
104 + // Empty or data:image/gif (common transparent pixel)
105 + if src.is_empty() {
106 + return true;
107 + }
108 + if src.starts_with("data:image/gif;base64,R0lGOD") {
109 + return true;
110 + }
111 +
112 + // Check inline style for tiny dimensions
113 + if let Some(style) = el.attr("style") {
114 + let style_lower = style.to_lowercase();
115 + if style_lower.contains("width:1px")
116 + || style_lower.contains("width: 1px")
117 + || style_lower.contains("width:0")
118 + || style_lower.contains("height:1px")
119 + || style_lower.contains("height: 1px")
120 + || style_lower.contains("height:0")
121 + || style_lower.contains("display:none")
122 + || style_lower.contains("display: none")
123 + {
124 + return true;
125 + }
126 + }
127 +
128 + false
129 + }
130 +
131 + /// Check if an element is hidden via inline style.
132 + ///
133 + /// Catches display:none, visibility:hidden, and spacer tricks
134 + /// like font-size:0 or line-height:0 (commonly used in email templates).
135 + pub fn is_hidden(el: &Element) -> bool {
136 + if let Some(style) = el.attr("style") {
137 + let s = style.to_lowercase();
138 + if s.contains("display:none")
139 + || s.contains("display: none")
140 + || s.contains("visibility:hidden")
141 + || s.contains("visibility: hidden")
142 + || s.contains("font-size:0")
143 + || s.contains("font-size: 0")
144 + || s.contains("line-height:0")
145 + || s.contains("line-height: 0")
146 + || (s.contains("height:0") && s.contains("overflow:hidden"))
147 + || (s.contains("height: 0") && s.contains("overflow: hidden"))
148 + || s.contains("max-height:0")
149 + || s.contains("max-height: 0")
150 + {
151 + return true;
152 + }
153 + }
154 + false
155 + }
156 +
A src/lib.rs +11
@@ -0,0 +1,11 @@
1 + //! pter — Plain Text Email Renderer
2 + //!
3 + //! Converts HTML email bodies into readable markdown.
4 +
5 + mod convert;
6 + mod elements;
7 + mod replies;
8 + mod tables;
9 + mod whitespace;
10 +
11 + pub use convert::convert;
@@ -0,0 +1,324 @@
1 + use scraper::node::Node;
2 + use scraper::ElementRef;
3 +
4 + /// Check if an element marks the beginning of a quoted reply.
5 + ///
6 + /// This is the central abstraction for reply detection. Rather than
7 + /// building per-client logic throughout the converter, all client-specific
8 + /// knowledge lives here behind a single predicate.
9 + ///
10 + /// An element is a reply boundary if it's a container that wraps quoted
11 + /// content from a previous message in the thread. The converter treats
12 + /// these identically to `<blockquote>` — children get `>` prefixed.
13 + pub fn is_reply_boundary(el: ElementRef) -> bool {
14 + let element = el.value();
15 + let name = element.name();
16 +
17 + // <blockquote> is already handled by the element classifier.
18 + // This function catches non-blockquote reply wrappers.
19 +
20 + // Structural: elements with type="cite" (Apple Mail, some webmail)
21 + if element.attr("type") == Some("cite") {
22 + return true;
23 + }
24 +
25 + // Class/ID-based detection — thin per-client checks
26 + if element.attr("class").is_some_and(is_reply_class) {
27 + return true;
28 + }
29 +
30 + if element.attr("id").is_some_and(is_reply_id) {
31 + return true;
32 + }
33 +
34 + // Heuristic: a <div> whose first meaningful text child matches
35 + // an attribution pattern ("On ... wrote:") followed by a blockquote
36 + if name == "div" && has_attribution_then_quote(el) {
37 + return true;
38 + }
39 +
40 + false
41 + }
42 +
43 + /// Extract an attribution line from just before or at the start of a reply boundary.
44 + ///
45 + /// Returns the attribution text (e.g. "On Mon, Jan 5, Alice wrote:") if found,
46 + /// so the converter can render it above the quoted block.
47 + pub fn find_attribution(el: ElementRef) -> Option<String> {
48 + // Check the element's own leading text for attribution patterns
49 + for child in el.children() {
50 + match child.value() {
51 + Node::Text(text) => {
52 + let trimmed = text.text.trim();
53 + if is_attribution_text(trimmed) {
54 + return Some(trimmed.to_string());
55 + }
56 + }
57 + Node::Element(_) => {
58 + // Stop at the first child element — attribution is leading text
59 + break;
60 + }
61 + _ => {}
62 + }
63 + }
64 +
65 + // Check for a preceding sibling text node or element with attribution
66 + if let Some(prev) = previous_sibling_text(el) {
67 + let trimmed = prev.trim().to_string();
68 + if is_attribution_text(&trimmed) {
69 + return Some(trimmed);
70 + }
71 + }
72 +
73 + None
74 + }
75 +
/// Check if text matches common email attribution patterns.
///
/// These patterns are cross-client — every email client generates some
/// variant of "On [date], [person] wrote:" or "--- Forwarded message ---".
///
/// Recognized forms:
/// - English: `On ... wrote:`
/// - French: `Le ... a écrit :`
/// - Spanish: `El ... escribió:`
/// - German: `Am ... schrieb:` and `Am ... schrieb <name>:` (the sender
///   name follows the verb in German attributions)
/// - Forward separators containing "Forwarded message",
///   "Begin forwarded message" or "Original Message"
///
/// No-break spaces are treated as ordinary spaces, so a French attribution
/// with a no-break space before the colon still matches.
fn is_attribution_text(text: &str) -> bool {
    let normalized = text.replace(|c: char| c == '\u{a0}' || c == '\u{202f}', " ");
    let t = normalized.trim();

    // "On ... wrote:" (Gmail, Apple Mail, Thunderbird, most clients)
    if t.starts_with("On ") && t.ends_with("wrote:") {
        return true;
    }

    // French "Le ... a écrit :" and Spanish "El ... escribió:"
    let romance_opening = t.starts_with("Le ") || t.starts_with("El ");
    let romance_closing = ["crit :", "crit:", "escribió:", "escribió :"]
        .iter()
        .any(|ending| t.ends_with(*ending));
    if romance_opening && romance_closing {
        return true;
    }

    // German: "Am ... schrieb:" or "Am ... schrieb Alice <a@example.com>:"
    if t.starts_with("Am ")
        && (t.ends_with("schrieb:")
            || t.ends_with("schrieb :")
            || (t.ends_with(':') && t.contains(" schrieb ")))
    {
        return true;
    }

    // Forwarded message separators
    t.contains("Forwarded message")
        || t.contains("Begin forwarded message")
        || t.contains("Original Message")
}
107 +
/// Check whether a `class` attribute contains a class name that a known
/// mail client puts on quoted-reply wrappers.
///
/// The attribute is split on whitespace and each class name is compared
/// exactly. Supporting a new client is a one-line addition to the table.
fn is_reply_class(class: &str) -> bool {
    const REPLY_CLASSES: [&str; 8] = [
        "gmail_quote",
        "gmail_extra",
        "yahoo_quoted",
        "protonmail_quote",
        "tutanota_quote",
        "moz-cite-prefix", // Thunderbird
        "zmail_extra",     // Zoho
        // Outlook (sometimes wraps replies).
        // NOTE(review): Word/Outlook may also put this class on the wrapper
        // of a whole message body — confirm it does not over-quote.
        "WordSection1",
    ];

    class
        .split_whitespace()
        .any(|token| REPLY_CLASSES.contains(&token))
}
125 +
/// Check whether an `id` attribute is one that a known mail client puts on
/// quoted-reply wrappers. The comparison is exact and case-sensitive.
fn is_reply_id(id: &str) -> bool {
    const REPLY_IDS: [&str; 3] = [
        "divRplyFwdMsg",        // Outlook
        "reply-message",        // Generic
        "OLK_SRC_BODY_SECTION", // Outlook Mac
    ];

    REPLY_IDS.contains(&id)
}
135 +
136 + /// Check if a div contains attribution text followed by a blockquote.
137 + ///
138 + /// This catches the common pattern where no class/id is present but
139 + /// the structure is: `<div>On ... wrote:<br><blockquote>...</blockquote></div>`
140 + fn has_attribution_then_quote(el: ElementRef) -> bool {
141 + let mut found_attribution = false;
142 +
143 + for child in el.children() {
144 + match child.value() {
145 + Node::Text(text) => {
146 + if is_attribution_text(text.text.trim()) {
147 + found_attribution = true;
148 + }
149 + }
150 + Node::Element(e) => {
151 + if found_attribution && e.name() == "blockquote" {
152 + return true;
153 + }
154 + // Skip <br> tags between attribution and blockquote
155 + if e.name() != "br" {
156 + // If we hit a non-br element before finding attribution, stop
157 + if !found_attribution {
158 + return false;
159 + }
160 + }
161 + }
162 + _ => {}
163 + }
164 + }
165 +
166 + false
167 + }
168 +
169 + /// Get text from the previous sibling, if it exists and is a text or inline element.
170 + fn previous_sibling_text(el: ElementRef) -> Option<String> {
171 + let prev = el.prev_sibling()?;
172 +
173 + match prev.value() {
174 + Node::Text(text) => Some(text.text.to_string()),
175 + Node::Element(e) => {
176 + // Check inline elements like <span>, <font> that might wrap attribution
177 + if matches!(e.name(), "span" | "font" | "b" | "i" | "div" | "p") {
178 + let el_ref = ElementRef::wrap(prev)?;
179 + let text: String = el_ref.text().collect();
180 + if !text.trim().is_empty() {
181 + return Some(text);
182 + }
183 + }
184 + None
185 + }
186 + _ => None,
187 + }
188 + }
189 +
190 + /// Check if a separator element marks the boundary between original
191 + /// content and a forwarded/replied message.
192 + ///
193 + /// This catches `<hr>` or styled divs that act as visual separators
194 + /// before reply content (common in Outlook "From: ... Sent: ..." blocks).
195 + pub fn is_outlook_separator(el: ElementRef) -> bool {
196 + let element = el.value();
197 +
198 + // Outlook uses a specific pattern: a div containing
199 + // "From: ... Sent: ... To: ... Subject: ..." as a reply header
200 + if element.name() == "div" || element.name() == "p" {
201 + let text: String = el.text().collect();
202 + let t = text.trim();
203 +
204 + // Must have at least From + Sent/Date or Subject
205 + let has_from = t.contains("From:");
206 + let has_sent = t.contains("Sent:") || t.contains("Date:");
207 + let has_subject = t.contains("Subject:");
208 +
209 + if has_from && (has_sent || has_subject) {
210 + return true;
211 + }
212 + }
213 +
214 + false
215 + }
216 +
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parse `html`, take the first element matching `selector`, and return
    /// what `predicate` says about it.
    fn check_first(html: &str, selector: &str, predicate: fn(ElementRef) -> bool) -> bool {
        let doc = Html::parse_document(html);
        let sel = Selector::parse(selector).unwrap();
        let el = doc.select(&sel).next().unwrap();
        predicate(el)
    }

    // -- Attribution detection --

    #[test]
    fn attribution_on_wrote() {
        let line = "On Mon, Jan 5, 2026 at 3:00 PM Alice <alice@example.com> wrote:";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_forwarded() {
        assert!(is_attribution_text("---------- Forwarded message ----------"));
    }

    #[test]
    fn attribution_original_message() {
        assert!(is_attribution_text("-----Original Message-----"));
    }

    #[test]
    fn attribution_begin_forwarded() {
        assert!(is_attribution_text("Begin forwarded message:"));
    }

    #[test]
    fn not_attribution() {
        for line in ["Hello, how are you?", "On the other hand, this is fine."] {
            assert!(!is_attribution_text(line));
        }
    }

    // -- Reply class detection --

    #[test]
    fn gmail_quote_class() {
        assert!(is_reply_class("gmail_quote"));
    }

    #[test]
    fn multiple_classes_with_reply() {
        assert!(is_reply_class("some-class gmail_quote another"));
    }

    #[test]
    fn non_reply_class() {
        assert!(!is_reply_class("regular-div content-wrapper"));
    }

    // -- Reply boundary detection --

    #[test]
    fn type_cite_is_boundary() {
        assert!(check_first(
            r#"<div type="cite"><p>quoted</p></div>"#,
            r#"div[type="cite"]"#,
            is_reply_boundary,
        ));
    }

    #[test]
    fn gmail_quote_is_boundary() {
        assert!(check_first(
            r#"<div class="gmail_quote"><p>quoted</p></div>"#,
            "div.gmail_quote",
            is_reply_boundary,
        ));
    }

    #[test]
    fn outlook_id_is_boundary() {
        assert!(check_first(
            r#"<div id="divRplyFwdMsg"><p>quoted</p></div>"#,
            "#divRplyFwdMsg",
            is_reply_boundary,
        ));
    }

    #[test]
    fn plain_div_not_boundary() {
        assert!(!check_first(
            r#"<div class="content"><p>not quoted</p></div>"#,
            "div.content",
            is_reply_boundary,
        ));
    }

    // -- Outlook separator --

    #[test]
    fn outlook_from_sent_subject() {
        assert!(check_first(
            "<div>From: Alice\nSent: Monday\nTo: Bob\nSubject: Hello</div>",
            "div",
            is_outlook_separator,
        ));
    }

    #[test]
    fn regular_div_not_separator() {
        assert!(!check_first(
            "<div>Just a normal paragraph.</div>",
            "div",
            is_outlook_separator,
        ));
    }
}
A src/tables.rs +297
@@ -0,0 +1,297 @@
1 + use scraper::ElementRef;
2 +
3 + /// Determine whether a `<table>` element is a data table or a layout table.
4 + ///
5 + /// Email HTML overwhelmingly uses tables for layout. A table is considered
6 + /// a **data table** if it has structural indicators of tabular data:
7 + /// - Contains `<th>` elements
8 + /// - Has a `<caption>` child
9 + /// - Has `role="grid"` or `role="table"`
10 + /// - Has multiple rows where multiple cells contain substantive text
11 + ///
12 + /// Everything else is treated as a layout table and unwrapped.
13 + pub fn is_data_table(table: ElementRef) -> bool {
14 + let el = table.value();
15 +
16 + // role attribute
17 + if let Some(role) = el.attr("role") {
18 + if role == "grid" || role == "table" {
19 + return true;
20 + }
21 + // role="presentation" is an explicit layout signal
22 + if role == "presentation" || role == "none" {
23 + return false;
24 + }
25 + }
26 +
27 + let mut has_th = false;
28 + let mut has_caption = false;
29 + let mut multi_cell_rows = 0u32;
30 +
31 + for descendant in table.descendants() {
32 + if let Some(el_ref) = ElementRef::wrap(descendant) {
33 + match el_ref.value().name() {
34 + "th" => has_th = true,
35 + "caption" => has_caption = true,
36 + "tr" => {
37 + let cell_count = el_ref
38 + .children()
39 + .filter_map(ElementRef::wrap)
40 + .filter(|c| {
41 + let name = c.value().name();
42 + (name == "td" || name == "th") && has_substantive_text(*c)
43 + })
44 + .count();
45 + if cell_count > 1 {
46 + multi_cell_rows += 1;
47 + }
48 + }
49 + _ => {}
50 + }
51 + }
52 + }
53 +
54 + if has_th || has_caption {
55 + return true;
56 + }
57 +
58 + // Multiple rows with multiple substantive cells = data table
59 + multi_cell_rows >= 2
60 + }
61 +
62 + /// Check if an element contains meaningful text (not just whitespace/nbsp).
63 + fn has_substantive_text(el: ElementRef) -> bool {
64 + let text = el.text().collect::<String>();
65 + let trimmed = text.trim().replace('\u{a0}', ""); // strip &nbsp;
66 + trimmed.len() > 1 // more than a single character
67 + }
68 +
69 + /// Extract rows and cells from a data table for markdown rendering.
70 + ///
71 + /// Returns (headers, rows) where each is a Vec of cell text strings.
72 + /// If no `<thead>`/`<th>` row exists, the first row is used as headers.
73 + pub fn extract_table_data(table: ElementRef) -> (Vec<String>, Vec<Vec<String>>) {
74 + let mut headers: Vec<String> = Vec::new();
75 + let mut rows: Vec<Vec<String>> = Vec::new();
76 +
77 + // Look for thead/th first
78 + for descendant in table.children().filter_map(ElementRef::wrap) {
79 + let name = descendant.value().name();
80 + if name == "thead" {
81 + for tr in descendant.children().filter_map(ElementRef::wrap) {
82 + if tr.value().name() == "tr" {
83 + headers = extract_cells(tr);
84 + break; // first row of thead
85 + }
86 + }
87 + } else if name == "tbody" || name == "tr" {
88 + let trs: Box<dyn Iterator<Item = ElementRef>> = if name == "tbody" {
89 + Box::new(
90 + descendant
91 + .children()
92 + .filter_map(ElementRef::wrap)
93 + .filter(|e| e.value().name() == "tr"),
94 + )
95 + } else {
96 + Box::new(std::iter::once(descendant))
97 + };
98 +
99 + for tr in trs {
100 + let cells = extract_cells(tr);
101 + if !cells.is_empty() {
102 + // If we haven't found headers yet and this row has <th> cells,
103 + // treat it as the header row
104 + if headers.is_empty() && has_th_cells(tr) {
105 + headers = cells;
106 + } else {
107 + rows.push(cells);
108 + }
109 + }
110 + }
111 + }
112 + }
113 +
114 + // If still no headers, promote first data row
115 + if headers.is_empty() && !rows.is_empty() {
116 + headers = rows.remove(0);
117 + }
118 +
119 + (headers, rows)
120 + }
121 +
122 + fn extract_cells(tr: ElementRef) -> Vec<String> {
123 + tr.children()
124 + .filter_map(ElementRef::wrap)
125 + .filter(|e| {
126 + let n = e.value().name();
127 + n == "td" || n == "th"
128 + })
129 + .map(|cell| {
130 + let text = cell.text().collect::<String>();
131 + text.split_whitespace().collect::<Vec<_>>().join(" ")
132 + })
133 + .collect()
134 + }
135 +
136 + fn has_th_cells(tr: ElementRef) -> bool {
137 + tr.children()
138 + .filter_map(ElementRef::wrap)
139 + .any(|e| e.value().name() == "th")
140 + }
141 +
/// Render a data table as a GFM markdown table.
///
/// Returns an empty string when `headers` is empty. Otherwise the output is
/// a header row, a `---` separator row and one line per entry in `rows`,
/// joined with `\n` and without a trailing newline (caller handles spacing).
///
/// - The column count is the widest of `headers` and every row, so cells
///   beyond the header width are kept instead of being dropped; the header
///   row is padded with empty cells to match.
/// - Rows shorter than the column count are padded with empty cells.
/// - A literal `|` in a cell is emitted as `\|` and line breaks become
///   spaces, so cell text can never split a row or add a column.
pub fn render_markdown_table(headers: &[String], rows: &[Vec<String>]) -> String {
    if headers.is_empty() {
        return String::new();
    }

    let col_count = rows.iter().map(Vec::len).fold(headers.len(), usize::max);

    let mut lines = Vec::with_capacity(rows.len() + 2);
    lines.push(render_table_row(headers, col_count));
    lines.push(format!("|{}", " --- |".repeat(col_count)));
    lines.extend(rows.iter().map(|row| render_table_row(row, col_count)));

    lines.join("\n")
}

/// Render one table line with exactly `col_count` cells, padding as needed.
fn render_table_row(cells: &[String], col_count: usize) -> String {
    let mut line = String::from("|");
    for index in 0..col_count {
        line.push(' ');
        if let Some(cell) = cells.get(index) {
            push_escaped_cell(&mut line, cell);
        }
        line.push_str(" |");
    }
    line
}

/// Append `cell` to `line`, neutralising characters that would break a GFM row.
fn push_escaped_cell(line: &mut String, cell: &str) {
    for ch in cell.chars() {
        match ch {
            '|' => line.push_str("\\|"),
            '\n' | '\r' => line.push(' '),
            other => line.push(other),
        }
    }
}
183 +
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parse `html` as a document and hand its first `<table>` to `check`.
    fn with_first_table<T>(html: &str, check: impl FnOnce(ElementRef<'_>) -> T) -> T {
        let document = Html::parse_document(html);
        let selector = Selector::parse("table").unwrap();
        let table = document.select(&selector).next().unwrap();
        check(table)
    }

    /// Classify the first table in `html`.
    fn classify(html: &str) -> bool {
        with_first_table(html, is_data_table)
    }

    /// Extract headers and rows from the first table in `html`.
    fn extract(html: &str) -> (Vec<String>, Vec<Vec<String>>) {
        with_first_table(html, extract_table_data)
    }

    #[test]
    fn single_cell_is_layout() {
        assert!(!classify("<table><tr><td>content</td></tr></table>"));
    }

    #[test]
    fn table_with_th_is_data() {
        let html =
            "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>";
        assert!(classify(html));
    }

    #[test]
    fn table_with_caption_is_data() {
        let html = "<table><caption>Users</caption><tr><td>Alice</td><td>30</td></tr></table>";
        assert!(classify(html));
    }

    #[test]
    fn role_presentation_is_layout() {
        let html = r#"<table role="presentation"><tr><td>layout</td><td>stuff</td></tr></table>"#;
        assert!(!classify(html));
    }

    #[test]
    fn role_grid_is_data() {
        let html = r#"<table role="grid"><tr><td>Alice</td><td>30</td></tr></table>"#;
        assert!(classify(html));
    }

    #[test]
    fn multi_row_multi_cell_is_data() {
        let html = concat!(
            "<table>",
            "<tr><td>Alice</td><td>Engineer</td></tr>",
            "<tr><td>Bob</td><td>Designer</td></tr>",
            "</table>",
        );
        assert!(classify(html));
    }

    #[test]
    fn spacer_cells_not_substantive() {
        // Each row has exactly one cell with real text; the other is padding.
        let html = concat!(
            "<table><tr><td>content</td><td>&nbsp;</td></tr>",
            "<tr><td>more</td><td> </td></tr></table>",
        );
        assert!(!classify(html));
    }

    #[test]
    fn render_simple_table() {
        let headers: Vec<String> = vec!["Name".into(), "Age".into()];
        let rows: Vec<Vec<String>> = vec![
            vec!["Alice".into(), "30".into()],
            vec!["Bob".into(), "25".into()],
        ];
        let expected = "| Name | Age |\n| --- | --- |\n| Alice | 30 |\n| Bob | 25 |";
        assert_eq!(render_markdown_table(&headers, &rows), expected);
    }

    #[test]
    fn render_empty_headers() {
        let rendered = render_markdown_table(&[], &[]);
        assert_eq!(rendered, "");
    }

    #[test]
    fn extract_with_thead() {
        let (headers, rows) = extract(concat!(
            "<table><thead><tr><th>A</th><th>B</th></tr></thead>",
            "<tbody><tr><td>1</td><td>2</td></tr></tbody></table>",
        ));
        assert_eq!(headers, vec!["A", "B"]);
        assert_eq!(rows, vec![vec!["1".to_string(), "2".to_string()]]);
    }

    #[test]
    fn extract_promotes_first_row() {
        let (headers, rows) = extract(concat!(
            "<table><tr><td>Name</td><td>Val</td></tr>",
            "<tr><td>X</td><td>Y</td></tr></table>",
        ));
        assert_eq!(headers, vec!["Name", "Val"]);
        assert_eq!(rows, vec![vec!["X".to_string(), "Y".to_string()]]);
    }
}
@@ -0,0 +1,56 @@
/// Normalize whitespace in the final markdown output.
///
/// - Remove trailing whitespace from each line
/// - Collapse runs of blank lines into a single blank line
/// - Trim leading/trailing whitespace of the whole text
///
/// Lines are trimmed *before* blank runs are collapsed, so lines that hold
/// only spaces or tabs count as blank. Input is split with `str::lines`,
/// which means `\r\n` line endings are accepted and emitted as `\n`.
pub fn normalize(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    let mut blank_run = 0u32;

    for line in input.lines().map(str::trim_end) {
        if line.is_empty() {
            blank_run += 1;
            if blank_run > 1 {
                // Already emitted one blank line for this run.
                continue;
            }
        } else {
            blank_run = 0;
        }
        result.push_str(line);
        result.push('\n');
    }

    result.trim().to_string()
}
27 +
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn collapse_excessive_newlines() {
        let normalized = normalize("a\n\n\n\nb");
        assert_eq!(normalized, "a\n\nb");
    }

    #[test]
    fn preserve_single_blank_line() {
        // Already-normal input passes through untouched.
        let input = "a\n\nb";
        assert_eq!(normalize(input), input);
    }

    #[test]
    fn trim_trailing_whitespace() {
        let normalized = normalize("hello \nworld ");
        assert_eq!(normalized, "hello\nworld");
    }

    #[test]
    fn trim_outer_whitespace() {
        let normalized = normalize("\n\nhello\n\n");
        assert_eq!(normalized, "hello");
    }

    #[test]
    fn empty_input() {
        let normalized = normalize("");
        assert_eq!(normalized, "");
    }
}
@@ -0,0 +1,202 @@
1 + use pter::convert;
2 +
#[test]
fn empty_string() {
    // No input, no output.
    let rendered = convert("");
    assert_eq!(rendered, "");
}
7 +
#[test]
fn whitespace_only() {
    // Input made of nothing but whitespace renders as the empty string.
    let rendered = convert(" \n\t ");
    assert_eq!(rendered, "");
}
12 +
#[test]
fn just_tags_no_content() {
    // Empty elements contribute no text.
    let rendered = convert("<div><p><span></span></p></div>");
    assert_eq!(rendered, "");
}
17 +
#[test]
fn deeply_nested_divs() {
    // 100 levels of <div> around a single text node.
    let html = format!(
        "{}deep content{}",
        "<div>".repeat(100),
        "</div>".repeat(100)
    );
    let rendered = convert(&html);
    assert!(rendered.contains("deep content"));
}
31 +
#[test]
fn deeply_nested_blockquotes() {
    // 20 levels of <blockquote> around a single text node.
    let html = format!(
        "{}very deep{}",
        "<blockquote>".repeat(20),
        "</blockquote>".repeat(20)
    );
    let rendered = convert(&html);
    // The text survives and carries a long chain of quote prefixes.
    for expected in ["very deep", "> > > > >"] {
        assert!(rendered.contains(expected));
    }
}
47 +
#[test]
fn deeply_nested_lists() {
    // 10 levels of single-item unordered lists.
    let html = format!(
        "{}deep item{}",
        "<ul><li>".repeat(10),
        "</li></ul>".repeat(10)
    );
    let rendered = convert(&html);
    assert!(rendered.contains("deep item"));
}
61 +
#[test]
fn malformed_unclosed_tags() {
    // html5ever auto-corrects these
    let rendered = convert("<p>unclosed paragraph<p>another one");
    for expected in ["unclosed paragraph", "another one"] {
        assert!(rendered.contains(expected));
    }
}
69 +
#[test]
fn malformed_mismatched_tags() {
    // Closing tags in the wrong order must not lose the text.
    let rendered = convert("<b><i>crossed</b></i>");
    assert!(rendered.contains("crossed"));
}
75 +
#[test]
fn only_script_content() {
    // Script bodies never reach the output.
    let rendered = convert("<script>alert('xss')</script>");
    assert_eq!(rendered, "");
}
80 +
#[test]
fn only_style_content() {
    // Stylesheet bodies never reach the output.
    let rendered = convert("<style>.x { color: red; }</style>");
    assert_eq!(rendered, "");
}
85 +
#[test]
fn only_tracking_pixels() {
    // A message made solely of 1x1 images renders as nothing.
    const HTML: &str = r#"
<img src="a.gif" width="1" height="1">
<img src="b.gif" width="1" height="1">
"#;
    let rendered = convert(HTML);
    assert_eq!(rendered, "");
}
94 +
#[test]
fn unicode_content() {
    // CJK, emoji and accented Latin text all pass through intact.
    let rendered = convert("<p>日本語テスト 🎉 émojis café</p>");
    for expected in ["日本語テスト", "🎉", "café"] {
        assert!(rendered.contains(expected));
    }
}
102 +
#[test]
fn html_entities_numeric() {
    // Decimal (&#169;, &#8212;) and hexadecimal (&#x2019;) character
    // references must all be decoded; every one of them is checked.
    let md = convert("<p>&#169; &#8212; &#x2019;</p>");
    assert!(md.contains("©"));
    assert!(md.contains("—"));
    assert!(md.contains("’"));
}
109 +
#[test]
fn large_input_doesnt_blow_up() {
    // 1000 copies of the same paragraph.
    let paragraph = "<p>Hello world. This is a test paragraph with some content.</p>";
    let html: String = std::iter::repeat(paragraph).take(1000).collect();
    let rendered = convert(&html);
    assert!(rendered.contains("Hello world"));
    // Should be proportional, not quadratic
    assert!(rendered.len() < html.len());
}
119 +
#[test]
fn link_with_nested_formatting() {
    // Inline formatting inside the anchor ends up inside the link text.
    let rendered = convert(r#"<a href="https://example.com"><strong>bold link</strong></a>"#);
    let expected = "[**bold link**](https://example.com)";
    assert!(rendered.contains(expected));
}
126 +
#[test]
fn image_with_no_alt() {
    // A missing alt attribute yields an empty alt in the markdown image.
    let html = r#"<img src="photo.jpg">"#;
    let rendered = convert(html);
    assert!(rendered.contains("![](photo.jpg)"));
}
132 +
#[test]
fn consecutive_inline_elements() {
    // Adjacent inline elements are emitted back to back with no separator.
    let html = "<b>bold</b><i>italic</i><code>code</code>";
    let rendered = convert(html);
    assert_eq!(rendered, "**bold***italic*`code`");
}
138 +
#[test]
fn table_with_empty_cells() {
    let html = "<table><tr><th>A</th><th>B</th></tr>\
<tr><td></td><td>val</td></tr></table>";
    let md = convert(html);
    assert!(md.contains("| A | B |"));
    // An empty cell is rendered as its two padding spaces with nothing in
    // between, so the row starts with `|  |` (two spaces), not `| |`.
    assert!(md.contains("|  | val |"));
}
147 +
#[test]
fn pre_with_html_inside() {
    // Escaped markup inside <pre> is decoded and shown verbatim in a fence.
    let rendered = convert("<pre>&lt;div&gt;not a tag&lt;/div&gt;</pre>");
    for expected in ["```", "<div>not a tag</div>"] {
        assert!(rendered.contains(expected));
    }
}
155 +
#[test]
fn multiple_spaces_in_source() {
    // Runs of spaces between words must collapse to a single space. The
    // input has to contain such runs for this test to exercise anything.
    let md = convert("<p>word1    word2     word3</p>");
    assert_eq!(md, "word1 word2 word3");
}
161 +
#[test]
fn newlines_in_source_collapsed() {
    // Line breaks inside a paragraph's source are ordinary whitespace.
    let html = "<p>line1\n\n\nline2</p>";
    let rendered = convert(html);
    assert_eq!(rendered, "line1 line2");
}
167 +
#[test]
fn full_html_document() {
    // Doctype, <head> metadata and styles contribute nothing; only the
    // body paragraph is rendered.
    const HTML: &str = r#"
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test Email</title>
<style>body { font-family: sans-serif; }</style>
</head>
<body>
<p>Hello!</p>
</body>
</html>
"#;
    let rendered = convert(HTML);
    assert_eq!(rendered, "Hello!");
}
186 +
#[test]
fn data_uri_image_not_tracking_pixel() {
    // A data URI image that's not 1x1 should render
    let rendered =
        convert(r#"<img src="data:image/png;base64,iVBOR..." alt="inline" width="100">"#);
    assert!(rendered.contains("![inline]"));
}
194 +
#[test]
fn blockquote_with_paragraphs() {
    // Both paragraphs stay inside the quote.
    let rendered = convert("<blockquote><p>First para</p><p>Second para</p></blockquote>");
    for expected in ["> First para", "> ", "> Second para"] {
        assert!(rendered.contains(expected));
    }
}
@@ -0,0 +1,436 @@
1 + use pter::convert;
2 +
#[test]
fn simple_email() {
    // Heading, paragraphs, inline bold and a <br> in one short message.
    const HTML: &str = r#"
<html>
<head><title>Email</title></head>
<body>
<h1>Meeting Tomorrow</h1>
<p>Hi Max,</p>
<p>Just confirming our meeting tomorrow at <strong>2pm</strong>.</p>
<p>Best,<br>Alice</p>
</body>
</html>
"#;

    let rendered = convert(HTML);
    for expected in ["# Meeting Tomorrow", "Hi Max,", "**2pm**", "Best,\nAlice"] {
        assert!(rendered.contains(expected));
    }
}
23 +
#[test]
fn email_with_links() {
    const HTML: &str = r#"
<body>
<p>Please review the <a href="https://example.com/doc">document</a>.</p>
<p>Direct link: <a href="https://example.com">https://example.com</a></p>
</body>
"#;

    let rendered = convert(HTML);
    // A labelled link becomes a markdown link.
    assert!(rendered.contains("[document](https://example.com/doc)"));
    // Link text matches URL — no markdown link syntax
    assert!(rendered.contains("Direct link: https://example.com"));
}
38 +
#[test]
fn email_with_tracking_pixels() {
    // Two tracking images (remote 1x1 and a tiny data URI) next to a real photo.
    const HTML: &str = r#"
<body>
<p>Content here</p>
<img src="https://tracker.example.com/open.gif" width="1" height="1" alt="">
<img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" alt="">
<img src="real-image.jpg" alt="A real photo" width="600">
</body>
"#;

    let rendered = convert(HTML);
    for kept in ["Content here", "![A real photo](real-image.jpg)"] {
        assert!(rendered.contains(kept));
    }
    for stripped in ["tracker", "data:image"] {
        assert!(!rendered.contains(stripped));
    }
}
56 +
#[test]
fn email_with_quoted_reply() {
    // A plain <blockquote> below the reply text becomes a `>` quote.
    const HTML: &str = r#"
<body>
<p>Thanks, that works for me.</p>
<blockquote>
<p>Can we meet at 3pm instead?</p>
</blockquote>
</body>
"#;

    let rendered = convert(HTML);
    for expected in ["Thanks, that works for me.", "> Can we meet at 3pm instead?"] {
        assert!(rendered.contains(expected));
    }
}
72 +
#[test]
fn email_with_signature_line() {
    // An <hr> between body and signature is rendered as a thematic break.
    const HTML: &str = r#"
<body>
<p>See you then.</p>
<hr>
<p>Alice Smith</p>
<p>Engineering Lead</p>
</body>
"#;

    let rendered = convert(HTML);
    for expected in ["See you then.", "---", "Alice Smith"] {
        assert!(rendered.contains(expected));
    }
}
89 +
#[test]
fn deeply_nested_blockquotes() {
    // Three levels of quoting, each with its own paragraph.
    const HTML: &str = r#"
<body>
<p>Got it.</p>
<blockquote>
<p>Sounds good.</p>
<blockquote>
<p>Can we reschedule?</p>
<blockquote>
<p>Original message here.</p>
</blockquote>
</blockquote>
</blockquote>
</body>
"#;

    let rendered = convert(HTML);
    let expected_lines = [
        "Got it.",
        "> Sounds good.",
        "> > Can we reschedule?",
        "> > > Original message here.",
    ];
    for expected in expected_lines {
        assert!(rendered.contains(expected));
    }
}
113 +
#[test]
fn complex_list_structure() {
    // An ordered list whose first item contains a nested unordered list.
    const HTML: &str = r#"
<body>
<p>Action items:</p>
<ol>
<li>Review the PR
<ul>
<li>Check tests</li>
<li>Check docs</li>
</ul>
</li>
<li>Deploy to staging</li>
</ol>
</body>
"#;

    let rendered = convert(HTML);
    let expected_fragments = [
        "Action items:",
        "1. Review the PR",
        " - Check tests",
        "2. Deploy to staging",
    ];
    for expected in expected_fragments {
        assert!(rendered.contains(expected));
    }
}
137 +
#[test]
fn pre_block_preserves_formatting() {
    // The body of `main` is indented by four spaces inside <pre>; that
    // indentation is exactly what this test checks survives conversion.
    let html = r#"
<body>
<p>Here's the code:</p>
<pre><code>fn main() {
    println!("hello");
}</code></pre>
</body>
"#;

    let md = convert(html);
    assert!(md.contains("Here's the code:"));
    assert!(md.contains("```\nfn main()"));
    assert!(md.contains("    println!"));
}
154 +
#[test]
fn hidden_content_stripped() {
    // `display: none` and `visibility: hidden` content is dropped.
    const HTML: &str = r#"
<body>
<p>Visible content</p>
<div style="display: none;">
<p>This should not appear</p>
</div>
<span style="visibility: hidden;">Also hidden</span>
<p>More visible</p>
</body>
"#;

    let rendered = convert(HTML);
    for kept in ["Visible content", "More visible"] {
        assert!(rendered.contains(kept));
    }
    for stripped in ["should not appear", "Also hidden"] {
        assert!(!rendered.contains(stripped));
    }
}
174 +
#[test]
fn script_and_style_fully_removed() {
    // Scripts and styles in both <head> and <body> leave no trace.
    const HTML: &str = r#"
<html>
<head>
<style>body { color: red; }</style>
<script>alert('xss');</script>
</head>
<body>
<p>Safe content</p>
<script>document.write('injected')</script>
</body>
</html>
"#;

    let rendered = convert(HTML);
    assert_eq!(rendered, "Safe content");
}
193 +
#[test]
fn newsletter_table_layout() {
    // Typical email newsletter wrapped in layout tables
    const HTML: &str = r#"
<html>
<body>
<table width="100%" cellpadding="0" cellspacing="0" role="presentation">
<tr>
<td align="center">
<table width="600" cellpadding="0" cellspacing="0">
<tr>
<td>
<h2>Weekly Digest</h2>
<p>Here are your updates for this week.</p>
<ul>
<li>New release v2.0</li>
<li>Bug fixes</li>
</ul>
<p>Thanks for reading!</p>
</td>
</tr>
</table>
</td>
</tr>
</table>
<img src="https://track.example.com/open.gif" width="1" height="1">
</body>
</html>
"#;

    let rendered = convert(HTML);
    let kept = [
        "## Weekly Digest",
        "Here are your updates for this week.",
        "- New release v2.0",
        "- Bug fixes",
        "Thanks for reading!",
    ];
    for expected in kept {
        assert!(rendered.contains(expected));
    }
    assert!(!rendered.contains("track.example.com"));
    // No table markup in output
    assert!(!rendered.contains("| "));
}
234 +
#[test]
fn data_table_preserved() {
    // A table with <thead>/<th> is real data and is rendered as a GFM table.
    const HTML: &str = r#"
<body>
<p>Order summary:</p>
<table>
<thead><tr><th>Item</th><th>Qty</th><th>Price</th></tr></thead>
<tbody>
<tr><td>Widget</td><td>3</td><td>$15.00</td></tr>
<tr><td>Gadget</td><td>1</td><td>$29.99</td></tr>
</tbody>
</table>
</body>
"#;

    let rendered = convert(HTML);
    let expected_lines = [
        "Order summary:",
        "| Item | Qty | Price |",
        "| --- | --- | --- |",
        "| Widget | 3 | $15.00 |",
        "| Gadget | 1 | $29.99 |",
    ];
    for expected in expected_lines {
        assert!(rendered.contains(expected));
    }
}
257 +
#[test]
fn spacer_and_tracking_stripped() {
    // Zero-size spacers, a hidden pixel and a zero-height div sit between
    // two real paragraphs.
    const HTML: &str = r#"
<body>
<p>Real content</p>
<div style="font-size: 0; line-height: 0;">&nbsp;</div>
<img src="pixel.gif" width="1" height="1" style="display:none">
<div style="height:0;overflow:hidden">invisible</div>
<p>More content</p>
</body>
"#;

    let rendered = convert(HTML);
    for kept in ["Real content", "More content"] {
        assert!(rendered.contains(kept));
    }
    for stripped in ["invisible", "pixel.gif"] {
        assert!(!rendered.contains(stripped));
    }
}
276 +
277 + // -- Reply chain tests --
278 +
#[test]
fn gmail_reply_chain() {
    const HTML: &str = r#"
<body>
<div dir="ltr">
<p>Thanks, that works for me.</p>
</div>
<div class="gmail_quote">
<div class="gmail_attr">On Mon, Jan 5, 2026 at 3:00 PM Alice &lt;alice@example.com&gt; wrote:</div>
<blockquote class="gmail_quote">
<div dir="ltr">
<p>Can we meet at 3pm instead of 2pm?</p>
</div>
</blockquote>
</div>
</body>
"#;

    let rendered = convert(HTML);
    // The reply text, a quote marker (the gmail_quote div should be
    // rendered as a quote block) and the quoted text must all be present.
    for expected in ["Thanks, that works for me.", "> ", "3pm instead of 2pm"] {
        assert!(rendered.contains(expected));
    }
}
303 +
#[test]
fn apple_mail_reply() {
    // The quoted message is marked with <blockquote type="cite">.
    const HTML: &str = r#"
<body>
<div>Sounds good, see you then.</div>
<div>
<br>
<blockquote type="cite">
<div>Hey, are we still on for lunch?</div>
</blockquote>
</div>
</body>
"#;

    let rendered = convert(HTML);
    for expected in ["Sounds good, see you then.", "> ", "still on for lunch"] {
        assert!(rendered.contains(expected));
    }
}
323 +
#[test]
fn outlook_reply_with_separator() {
    // Reply, <hr>, a From/Sent/To/Subject header block, then the original.
    const HTML: &str = r#"
<body>
<div>
<p>I'll handle it.</p>
</div>
<hr>
<div>
<p>From: Alice Smith<br>
Sent: Monday, January 5, 2026<br>
To: Bob Jones<br>
Subject: Action needed</p>
</div>
<div>
<p>Can you take a look at the report?</p>
</div>
</body>
"#;

    let rendered = convert(HTML);
    let expected_fragments = [
        "I'll handle it.",
        "---", // hr separator
        "From: Alice Smith",
        "take a look at the report",
    ];
    for expected in expected_fragments {
        assert!(rendered.contains(expected));
    }
}
350 +
#[test]
fn nested_gmail_reply_chain() {
    // A gmail_quote block that itself contains another gmail_quote block.
    const HTML: &str = r#"
<body>
<div dir="ltr"><p>Got it, thanks!</p></div>
<div class="gmail_quote">
On Tue, Jan 6, Bob wrote:
<blockquote class="gmail_quote">
<div dir="ltr"><p>Here's the update.</p></div>
<div class="gmail_quote">
On Mon, Jan 5, Alice wrote:
<blockquote class="gmail_quote">
<div dir="ltr"><p>What's the status?</p></div>
</blockquote>
</div>
</blockquote>
</div>
</body>
"#;

    let rendered = convert(HTML);
    // Should have nested quoting
    let expected_fragments = [
        "Got it, thanks!",
        "> ",
        "Here's the update.",
        "What's the status?",
    ];
    for expected in expected_fragments {
        assert!(rendered.contains(expected));
    }
}
378 +
#[test]
fn forwarded_message() {
    // The forward banner and the forwarded body both stay in the output.
    const HTML: &str = r#"
<body>
<div><p>FYI, see below.</p></div>
<div class="gmail_quote">
---------- Forwarded message ----------
<blockquote>
<p>From: Alice</p>
<p>The deadline has been moved to Friday.</p>
</blockquote>
</div>
</body>
"#;

    let rendered = convert(HTML);
    for expected in ["FYI, see below.", "Forwarded message", "deadline has been moved"] {
        assert!(rendered.contains(expected));
    }
}
399 +
#[test]
fn protonmail_reply() {
    // The quote carries both the protonmail_quote class and type="cite".
    const HTML: &str = r#"
<body>
<div>Will do, thanks.</div>
<blockquote class="protonmail_quote" type="cite">
<div>Please send me the files by EOD.</div>
</blockquote>
</body>
"#;

    let rendered = convert(HTML);
    for expected in ["Will do, thanks.", "> ", "send me the files"] {
        assert!(rendered.contains(expected));
    }
}
416 +
#[test]
fn attribution_preserved_above_quote() {
    const HTML: &str = r#"
<body>
<p>Agreed.</p>
<div class="gmail_quote">
On Wed, Jan 7, 2026 at 10:00 AM Carol wrote:
<blockquote>
<p>Let's go with option B.</p>
</blockquote>
</div>
</body>
"#;

    let rendered = convert(HTML);
    // The reply, the attribution line ("Carol wrote:") and the quoted text
    // must all appear.
    for expected in ["Agreed.", "Carol wrote:", "option B"] {
        assert!(rendered.contains(expected));
    }
}
@@ -0,0 +1,94 @@
1 + use proptest::prelude::*;
2 +
3 + // Strategy: generate arbitrary HTML-like strings
4 + fn html_fragment() -> impl Strategy<Value = String> {
5 + let tags = prop::sample::select(vec![
6 + "p", "div", "span", "strong", "em", "a", "h1", "h2", "h3",
7 + "ul", "ol", "li", "blockquote", "pre", "code", "br", "hr",
8 + "img", "table", "tr", "td", "th", "b", "i", "del", "sup", "sub",
9 + ]);
10 +
11 + let text = "[a-zA-Z0-9 .,!?]{0,100}";
12 +
13 + prop::collection::vec(
14 + prop_oneof![
15 + // Plain text
16 + text.prop_map(|s| s),
17 + // Opening + closing tag with text
18 + (tags.clone(), text).prop_map(|(tag, content)| {
19 + format!("<{tag}>{content}</{tag}>")
20 + }),
21 + // Self-closing tag
22 + tags.clone().prop_map(|tag| format!("<{tag}/>")),
23 + // Nested tags
24 + (tags.clone(), tags.clone(), text).prop_map(|(outer, inner, content)| {
25 + format!("<{outer}><{inner}>{content}</{inner}></{outer}>")
26 + }),
27 + ],
28 + 1..10,
29 + )
30 + .prop_map(|parts| parts.join(""))
31 + }
32 +
33 + proptest! {
34 + #[test]
35 + fn never_panics(html in html_fragment()) {
36 + let _ = pter::convert(&html);
37 + }
38 +
39 + #[test]
40 + fn never_panics_on_arbitrary_bytes(s in "\\PC{0,500}") {
41 + let _ = pter::convert(&s);
42 + }
43 +
44 + #[test]
45 + fn output_contains_no_html_tags(html in html_fragment()) {
46 + let md = pter::convert(&html);
47 + // Output should never contain raw HTML tags
48 + // (except inside code blocks, which we skip checking)
49 + let without_code_blocks: String = md
50 + .split("```")
51 + .enumerate()
52 + .filter(|(i, _)| i % 2 == 0) // only outside code blocks
53 + .map(|(_, s)| s)
54 + .collect();
55 +
56 + // No <script>, <style>, <div>, etc. should leak through
57 + assert!(!without_code_blocks.contains("<script"), "leaked <script> in: {md}");
58 + assert!(!without_code_blocks.contains("<style"), "leaked <style> in: {md}");
59 + assert!(!without_code_blocks.contains("<head"), "leaked <head> in: {md}");
60 + }
61 +
62 + #[test]
63 + fn output_is_valid_utf8(html in html_fragment()) {
64 + let md = pter::convert(&html);
65 + // String type guarantees UTF-8, but verify no replacement chars snuck in
66 + // from bad entity decoding
67 + assert!(!md.contains('\u{FFFD}'), "replacement char in: {md}");
68 + }
69 +
70 + #[test]
71 + fn no_excessive_blank_lines(html in html_fragment()) {
72 + let md = pter::convert(&html);
73 + assert!(!md.contains("\n\n\n"), "triple newline in output: {md}");
74 + }
75 +
76 + #[test]
77 + fn no_trailing_whitespace_on_lines(html in html_fragment()) {
78 + let md = pter::convert(&html);
79 + for (i, line) in md.lines().enumerate() {
80 + assert!(
81 + line == line.trim_end(),
82 + "trailing whitespace on line {i}: '{line}'"
83 + );
84 + }
85 + }
86 +
87 + #[test]
88 + fn empty_input_returns_empty(s in "\\s{0,20}") {
89 + let html = format!("<html><body>{s}</body></html>");
90 + let md = pter::convert(&html);
91 + // Whitespace-only input should produce empty or whitespace-only output
92 + assert!(md.trim().is_empty() || !s.trim().is_empty());
93 + }
94 + }