max / pter
17 files changed,
+2890 insertions,
-0 deletions
| @@ -0,0 +1 @@ | |||
| 1 | + | /target |
| @@ -0,0 +1,1390 @@ | |||
| 1 | + | # This file is automatically @generated by Cargo. | |
| 2 | + | # It is not intended for manual editing. | |
| 3 | + | version = 4 | |
| 4 | + | ||
| 5 | + | [[package]] | |
| 6 | + | name = "aho-corasick" | |
| 7 | + | version = "1.1.4" | |
| 8 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 9 | + | checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" | |
| 10 | + | dependencies = [ | |
| 11 | + | "memchr", | |
| 12 | + | ] | |
| 13 | + | ||
| 14 | + | [[package]] | |
| 15 | + | name = "anes" | |
| 16 | + | version = "0.1.6" | |
| 17 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 18 | + | checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" | |
| 19 | + | ||
| 20 | + | [[package]] | |
| 21 | + | name = "anstyle" | |
| 22 | + | version = "1.0.14" | |
| 23 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 24 | + | checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" | |
| 25 | + | ||
| 26 | + | [[package]] | |
| 27 | + | name = "anyhow" | |
| 28 | + | version = "1.0.102" | |
| 29 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 30 | + | checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" | |
| 31 | + | ||
| 32 | + | [[package]] | |
| 33 | + | name = "autocfg" | |
| 34 | + | version = "1.5.0" | |
| 35 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 36 | + | checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" | |
| 37 | + | ||
| 38 | + | [[package]] | |
| 39 | + | name = "bit-set" | |
| 40 | + | version = "0.8.0" | |
| 41 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 42 | + | checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" | |
| 43 | + | dependencies = [ | |
| 44 | + | "bit-vec", | |
| 45 | + | ] | |
| 46 | + | ||
| 47 | + | [[package]] | |
| 48 | + | name = "bit-vec" | |
| 49 | + | version = "0.8.0" | |
| 50 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 51 | + | checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" | |
| 52 | + | ||
| 53 | + | [[package]] | |
| 54 | + | name = "bitflags" | |
| 55 | + | version = "2.11.1" | |
| 56 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 57 | + | checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" | |
| 58 | + | ||
| 59 | + | [[package]] | |
| 60 | + | name = "bumpalo" | |
| 61 | + | version = "3.20.2" | |
| 62 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 63 | + | checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" | |
| 64 | + | ||
| 65 | + | [[package]] | |
| 66 | + | name = "cast" | |
| 67 | + | version = "0.3.0" | |
| 68 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 69 | + | checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" | |
| 70 | + | ||
| 71 | + | [[package]] | |
| 72 | + | name = "cfg-if" | |
| 73 | + | version = "1.0.4" | |
| 74 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 75 | + | checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" | |
| 76 | + | ||
| 77 | + | [[package]] | |
| 78 | + | name = "ciborium" | |
| 79 | + | version = "0.2.2" | |
| 80 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 81 | + | checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" | |
| 82 | + | dependencies = [ | |
| 83 | + | "ciborium-io", | |
| 84 | + | "ciborium-ll", | |
| 85 | + | "serde", | |
| 86 | + | ] | |
| 87 | + | ||
| 88 | + | [[package]] | |
| 89 | + | name = "ciborium-io" | |
| 90 | + | version = "0.2.2" | |
| 91 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 92 | + | checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" | |
| 93 | + | ||
| 94 | + | [[package]] | |
| 95 | + | name = "ciborium-ll" | |
| 96 | + | version = "0.2.2" | |
| 97 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 98 | + | checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" | |
| 99 | + | dependencies = [ | |
| 100 | + | "ciborium-io", | |
| 101 | + | "half", | |
| 102 | + | ] | |
| 103 | + | ||
| 104 | + | [[package]] | |
| 105 | + | name = "clap" | |
| 106 | + | version = "4.6.1" | |
| 107 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 108 | + | checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" | |
| 109 | + | dependencies = [ | |
| 110 | + | "clap_builder", | |
| 111 | + | ] | |
| 112 | + | ||
| 113 | + | [[package]] | |
| 114 | + | name = "clap_builder" | |
| 115 | + | version = "4.6.0" | |
| 116 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 117 | + | checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" | |
| 118 | + | dependencies = [ | |
| 119 | + | "anstyle", | |
| 120 | + | "clap_lex", | |
| 121 | + | ] | |
| 122 | + | ||
| 123 | + | [[package]] | |
| 124 | + | name = "clap_lex" | |
| 125 | + | version = "1.1.0" | |
| 126 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 127 | + | checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" | |
| 128 | + | ||
| 129 | + | [[package]] | |
| 130 | + | name = "criterion" | |
| 131 | + | version = "0.5.1" | |
| 132 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 133 | + | checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" | |
| 134 | + | dependencies = [ | |
| 135 | + | "anes", | |
| 136 | + | "cast", | |
| 137 | + | "ciborium", | |
| 138 | + | "clap", | |
| 139 | + | "criterion-plot", | |
| 140 | + | "is-terminal", | |
| 141 | + | "itertools", | |
| 142 | + | "num-traits", | |
| 143 | + | "once_cell", | |
| 144 | + | "oorandom", | |
| 145 | + | "plotters", | |
| 146 | + | "rayon", | |
| 147 | + | "regex", | |
| 148 | + | "serde", | |
| 149 | + | "serde_derive", | |
| 150 | + | "serde_json", | |
| 151 | + | "tinytemplate", | |
| 152 | + | "walkdir", | |
| 153 | + | ] | |
| 154 | + | ||
| 155 | + | [[package]] | |
| 156 | + | name = "criterion-plot" | |
| 157 | + | version = "0.5.0" | |
| 158 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 159 | + | checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" | |
| 160 | + | dependencies = [ | |
| 161 | + | "cast", | |
| 162 | + | "itertools", | |
| 163 | + | ] | |
| 164 | + | ||
| 165 | + | [[package]] | |
| 166 | + | name = "crossbeam-deque" | |
| 167 | + | version = "0.8.6" | |
| 168 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 169 | + | checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" | |
| 170 | + | dependencies = [ | |
| 171 | + | "crossbeam-epoch", | |
| 172 | + | "crossbeam-utils", | |
| 173 | + | ] | |
| 174 | + | ||
| 175 | + | [[package]] | |
| 176 | + | name = "crossbeam-epoch" | |
| 177 | + | version = "0.9.18" | |
| 178 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 179 | + | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" | |
| 180 | + | dependencies = [ | |
| 181 | + | "crossbeam-utils", | |
| 182 | + | ] | |
| 183 | + | ||
| 184 | + | [[package]] | |
| 185 | + | name = "crossbeam-utils" | |
| 186 | + | version = "0.8.21" | |
| 187 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 188 | + | checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" | |
| 189 | + | ||
| 190 | + | [[package]] | |
| 191 | + | name = "crunchy" | |
| 192 | + | version = "0.2.4" | |
| 193 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 194 | + | checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" | |
| 195 | + | ||
| 196 | + | [[package]] | |
| 197 | + | name = "cssparser" | |
| 198 | + | version = "0.36.0" | |
| 199 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 200 | + | checksum = "dae61cf9c0abb83bd659dab65b7e4e38d8236824c85f0f804f173567bda257d2" | |
| 201 | + | dependencies = [ | |
| 202 | + | "cssparser-macros", | |
| 203 | + | "dtoa-short", | |
| 204 | + | "itoa", | |
| 205 | + | "phf", | |
| 206 | + | "smallvec", | |
| 207 | + | ] | |
| 208 | + | ||
| 209 | + | [[package]] | |
| 210 | + | name = "cssparser-macros" | |
| 211 | + | version = "0.6.1" | |
| 212 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 213 | + | checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" | |
| 214 | + | dependencies = [ | |
| 215 | + | "quote", | |
| 216 | + | "syn", | |
| 217 | + | ] | |
| 218 | + | ||
| 219 | + | [[package]] | |
| 220 | + | name = "derive_more" | |
| 221 | + | version = "2.1.1" | |
| 222 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 223 | + | checksum = "d751e9e49156b02b44f9c1815bcb94b984cdcc4396ecc32521c739452808b134" | |
| 224 | + | dependencies = [ | |
| 225 | + | "derive_more-impl", | |
| 226 | + | ] | |
| 227 | + | ||
| 228 | + | [[package]] | |
| 229 | + | name = "derive_more-impl" | |
| 230 | + | version = "2.1.1" | |
| 231 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 232 | + | checksum = "799a97264921d8623a957f6c3b9011f3b5492f557bbb7a5a19b7fa6d06ba8dcb" | |
| 233 | + | dependencies = [ | |
| 234 | + | "proc-macro2", | |
| 235 | + | "quote", | |
| 236 | + | "rustc_version", | |
| 237 | + | "syn", | |
| 238 | + | ] | |
| 239 | + | ||
| 240 | + | [[package]] | |
| 241 | + | name = "dtoa" | |
| 242 | + | version = "1.0.11" | |
| 243 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 244 | + | checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" | |
| 245 | + | ||
| 246 | + | [[package]] | |
| 247 | + | name = "dtoa-short" | |
| 248 | + | version = "0.3.5" | |
| 249 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 250 | + | checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" | |
| 251 | + | dependencies = [ | |
| 252 | + | "dtoa", | |
| 253 | + | ] | |
| 254 | + | ||
| 255 | + | [[package]] | |
| 256 | + | name = "ego-tree" | |
| 257 | + | version = "0.11.0" | |
| 258 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 259 | + | checksum = "b04dc5a38e4f151a79d9f2451ae6037fb6eaf5cba34771f44781f80e508498e3" | |
| 260 | + | ||
| 261 | + | [[package]] | |
| 262 | + | name = "either" | |
| 263 | + | version = "1.15.0" | |
| 264 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 265 | + | checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" | |
| 266 | + | ||
| 267 | + | [[package]] | |
| 268 | + | name = "equivalent" | |
| 269 | + | version = "1.0.2" | |
| 270 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 271 | + | checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" | |
| 272 | + | ||
| 273 | + | [[package]] | |
| 274 | + | name = "errno" | |
| 275 | + | version = "0.3.14" | |
| 276 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 277 | + | checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" | |
| 278 | + | dependencies = [ | |
| 279 | + | "libc", | |
| 280 | + | "windows-sys", | |
| 281 | + | ] | |
| 282 | + | ||
| 283 | + | [[package]] | |
| 284 | + | name = "fastrand" | |
| 285 | + | version = "2.4.1" | |
| 286 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 287 | + | checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" | |
| 288 | + | ||
| 289 | + | [[package]] | |
| 290 | + | name = "fnv" | |
| 291 | + | version = "1.0.7" | |
| 292 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 293 | + | checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" | |
| 294 | + | ||
| 295 | + | [[package]] | |
| 296 | + | name = "foldhash" | |
| 297 | + | version = "0.1.5" | |
| 298 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 299 | + | checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" | |
| 300 | + | ||
| 301 | + | [[package]] | |
| 302 | + | name = "futures-core" | |
| 303 | + | version = "0.3.32" | |
| 304 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 305 | + | checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" | |
| 306 | + | ||
| 307 | + | [[package]] | |
| 308 | + | name = "futures-task" | |
| 309 | + | version = "0.3.32" | |
| 310 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 311 | + | checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" | |
| 312 | + | ||
| 313 | + | [[package]] | |
| 314 | + | name = "futures-util" | |
| 315 | + | version = "0.3.32" | |
| 316 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 317 | + | checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" | |
| 318 | + | dependencies = [ | |
| 319 | + | "futures-core", | |
| 320 | + | "futures-task", | |
| 321 | + | "pin-project-lite", | |
| 322 | + | "slab", | |
| 323 | + | ] | |
| 324 | + | ||
| 325 | + | [[package]] | |
| 326 | + | name = "getopts" | |
| 327 | + | version = "0.2.24" | |
| 328 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 329 | + | checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" | |
| 330 | + | dependencies = [ | |
| 331 | + | "unicode-width", | |
| 332 | + | ] | |
| 333 | + | ||
| 334 | + | [[package]] | |
| 335 | + | name = "getrandom" | |
| 336 | + | version = "0.3.4" | |
| 337 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 338 | + | checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" | |
| 339 | + | dependencies = [ | |
| 340 | + | "cfg-if", | |
| 341 | + | "libc", | |
| 342 | + | "r-efi 5.3.0", | |
| 343 | + | "wasip2", | |
| 344 | + | ] | |
| 345 | + | ||
| 346 | + | [[package]] | |
| 347 | + | name = "getrandom" | |
| 348 | + | version = "0.4.2" | |
| 349 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 350 | + | checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" | |
| 351 | + | dependencies = [ | |
| 352 | + | "cfg-if", | |
| 353 | + | "libc", | |
| 354 | + | "r-efi 6.0.0", | |
| 355 | + | "wasip2", | |
| 356 | + | "wasip3", | |
| 357 | + | ] | |
| 358 | + | ||
| 359 | + | [[package]] | |
| 360 | + | name = "half" | |
| 361 | + | version = "2.7.1" | |
| 362 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 363 | + | checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" | |
| 364 | + | dependencies = [ | |
| 365 | + | "cfg-if", | |
| 366 | + | "crunchy", | |
| 367 | + | "zerocopy", | |
| 368 | + | ] | |
| 369 | + | ||
| 370 | + | [[package]] | |
| 371 | + | name = "hashbrown" | |
| 372 | + | version = "0.15.5" | |
| 373 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 374 | + | checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" | |
| 375 | + | dependencies = [ | |
| 376 | + | "foldhash", | |
| 377 | + | ] | |
| 378 | + | ||
| 379 | + | [[package]] | |
| 380 | + | name = "hashbrown" | |
| 381 | + | version = "0.17.0" | |
| 382 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 383 | + | checksum = "4f467dd6dccf739c208452f8014c75c18bb8301b050ad1cfb27153803edb0f51" | |
| 384 | + | ||
| 385 | + | [[package]] | |
| 386 | + | name = "heck" | |
| 387 | + | version = "0.5.0" | |
| 388 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 389 | + | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" | |
| 390 | + | ||
| 391 | + | [[package]] | |
| 392 | + | name = "hermit-abi" | |
| 393 | + | version = "0.5.2" | |
| 394 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 395 | + | checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" | |
| 396 | + | ||
| 397 | + | [[package]] | |
| 398 | + | name = "html5ever" | |
| 399 | + | version = "0.39.0" | |
| 400 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 401 | + | checksum = "46a1761807faccc9a19e86944bbf40610014066306f96edcdedc2fb714bcb7b8" | |
| 402 | + | dependencies = [ | |
| 403 | + | "log", | |
| 404 | + | "markup5ever", | |
| 405 | + | ] | |
| 406 | + | ||
| 407 | + | [[package]] | |
| 408 | + | name = "id-arena" | |
| 409 | + | version = "2.3.0" | |
| 410 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 411 | + | checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" | |
| 412 | + | ||
| 413 | + | [[package]] | |
| 414 | + | name = "indexmap" | |
| 415 | + | version = "2.14.0" | |
| 416 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 417 | + | checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" | |
| 418 | + | dependencies = [ | |
| 419 | + | "equivalent", | |
| 420 | + | "hashbrown 0.17.0", | |
| 421 | + | "serde", | |
| 422 | + | "serde_core", | |
| 423 | + | ] | |
| 424 | + | ||
| 425 | + | [[package]] | |
| 426 | + | name = "is-terminal" | |
| 427 | + | version = "0.4.17" | |
| 428 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 429 | + | checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" | |
| 430 | + | dependencies = [ | |
| 431 | + | "hermit-abi", | |
| 432 | + | "libc", | |
| 433 | + | "windows-sys", | |
| 434 | + | ] | |
| 435 | + | ||
| 436 | + | [[package]] | |
| 437 | + | name = "itertools" | |
| 438 | + | version = "0.10.5" | |
| 439 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 440 | + | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" | |
| 441 | + | dependencies = [ | |
| 442 | + | "either", | |
| 443 | + | ] | |
| 444 | + | ||
| 445 | + | [[package]] | |
| 446 | + | name = "itoa" | |
| 447 | + | version = "1.0.18" | |
| 448 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 449 | + | checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" | |
| 450 | + | ||
| 451 | + | [[package]] | |
| 452 | + | name = "js-sys" | |
| 453 | + | version = "0.3.97" | |
| 454 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 455 | + | checksum = "a1840c94c045fbcf8ba2812c95db44499f7c64910a912551aaaa541decebcacf" | |
| 456 | + | dependencies = [ | |
| 457 | + | "cfg-if", | |
| 458 | + | "futures-util", | |
| 459 | + | "once_cell", | |
| 460 | + | "wasm-bindgen", | |
| 461 | + | ] | |
| 462 | + | ||
| 463 | + | [[package]] | |
| 464 | + | name = "leb128fmt" | |
| 465 | + | version = "0.1.0" | |
| 466 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 467 | + | checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" | |
| 468 | + | ||
| 469 | + | [[package]] | |
| 470 | + | name = "libc" | |
| 471 | + | version = "0.2.186" | |
| 472 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 473 | + | checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" | |
| 474 | + | ||
| 475 | + | [[package]] | |
| 476 | + | name = "linux-raw-sys" | |
| 477 | + | version = "0.12.1" | |
| 478 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 479 | + | checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" | |
| 480 | + | ||
| 481 | + | [[package]] | |
| 482 | + | name = "lock_api" | |
| 483 | + | version = "0.4.14" | |
| 484 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 485 | + | checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" | |
| 486 | + | dependencies = [ | |
| 487 | + | "scopeguard", | |
| 488 | + | ] | |
| 489 | + | ||
| 490 | + | [[package]] | |
| 491 | + | name = "log" | |
| 492 | + | version = "0.4.29" | |
| 493 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 494 | + | checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" | |
| 495 | + | ||
| 496 | + | [[package]] | |
| 497 | + | name = "markup5ever" | |
| 498 | + | version = "0.39.0" | |
| 499 | + | source = "registry+https://github.com/rust-lang/crates.io-index" | |
| 500 | + | checksum = "7122d987ec5f704ee56f6e5b41a7d93722e9aae27ae07cafa4036c4d3f9757de" |
Lines truncated
| @@ -0,0 +1,20 @@ | |||
| 1 | + | [package] | |
| 2 | + | name = "pter" | |
| 3 | + | version = "0.1.0" | |
| 4 | + | edition = "2024" | |
| 5 | + | description = "Plain Text Email Renderer — convert HTML email bodies into readable markdown" | |
| 6 | + | license = "MIT" | |
| 7 | + | repository = "https://github.com/maxjacobson/pter" | |
| 8 | + | keywords = ["email", "html", "markdown", "plaintext", "converter"] | |
| 9 | + | categories = ["email", "text-processing", "parser-implementations"] | |
| 10 | + | ||
| 11 | + | [dependencies] | |
| 12 | + | scraper = "0.26" | |
| 13 | + | ||
| 14 | + | [dev-dependencies] | |
| 15 | + | proptest = "1" | |
| 16 | + | criterion = { version = "0.5", features = ["html_reports"] } | |
| 17 | + | ||
| 18 | + | [[bench]] | |
| 19 | + | name = "convert_bench" | |
| 20 | + | harness = false |
| @@ -0,0 +1,21 @@ | |||
| 1 | + | MIT License | |
| 2 | + | ||
| 3 | + | Copyright (c) 2026 Max Jacobson | |
| 4 | + | ||
| 5 | + | Permission is hereby granted, free of charge, to any person obtaining a copy | |
| 6 | + | of this software and associated documentation files (the "Software"), to deal | |
| 7 | + | in the Software without restriction, including without limitation the rights | |
| 8 | + | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| 9 | + | copies of the Software, and to permit persons to whom the Software is | |
| 10 | + | furnished to do so, subject to the following conditions: | |
| 11 | + | ||
| 12 | + | The above copyright notice and this permission notice shall be included in all | |
| 13 | + | copies or substantial portions of the Software. | |
| 14 | + | ||
| 15 | + | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| 16 | + | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| 17 | + | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| 18 | + | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| 19 | + | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| 20 | + | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| 21 | + | SOFTWARE. |
| @@ -0,0 +1,33 @@ | |||
| 1 | + | # pter | |
| 2 | + | ||
| 3 | + | **Plain Text Email Renderer** — convert HTML email bodies into readable markdown. | |
| 4 | + | ||
| 5 | + | Email HTML is a hostile environment: table-based layouts from 2004, tracking pixels, | |
| 6 | + | Outlook conditional comments, deeply nested reply chains with inconsistent quoting. | |
| 7 | + | pter converts all of it into clean, readable markdown. | |
| 8 | + | ||
| 9 | + | ## Usage | |
| 10 | + | ||
| 11 | + | ```rust | |
| 12 | + | let html = r#"<h1>Hello</h1><p>This is an <strong>email</strong>.</p>"#; | |
| 13 | + | let markdown = pter::convert(html); | |
| 14 | + | assert_eq!(markdown, "# Hello\n\nThis is an **email**."); | |
| 15 | + | ``` | |
| 16 | + | ||
| 17 | + | ## What it does | |
| 18 | + | ||
| 19 | + | - Converts HTML elements to markdown equivalents (headings, links, lists, emphasis, code, images) | |
| 20 | + | - Unwraps table-based email layouts (single-cell tables become content, multi-column linearizes) | |
| 21 | + | - Detects and normalizes reply chains into `>` quoted markdown | |
| 22 | + | - Strips tracking pixels, invisible elements, and Outlook conditional comments | |
| 23 | + | - Produces output that is readable as plain text and renderable by any markdown toolchain | |
| 24 | + | ||
| 25 | + | ## What it does not do | |
| 26 | + | ||
| 27 | + | - Parse MIME email structure (use `mailparse` or `mail-parser` for that) | |
| 28 | + | - Extract article content from marketing templates (compose with a separate extractor) | |
| 29 | + | - Render markdown to a display format (use `pulldown-cmark`, `comrak`, etc.) | |
| 30 | + | ||
| 31 | + | ## License | |
| 32 | + | ||
| 33 | + | MIT |
| @@ -0,0 +1,97 @@ | |||
| 1 | + | use criterion::{Criterion, black_box, criterion_group, criterion_main}; | |
| 2 | + | ||
/// Benchmark fixture: a short, hand-written HTML email body consisting of an
/// `<h1>` heading and three paragraphs that exercise `<strong>`, an `<a href>`
/// link and a `<br>` line break. Returned as a `'static` raw string so no
/// allocation happens when the fixture is fetched.
| 3 | + | fn simple_email() -> &'static str { | |
| 4 | + | r#"<html><body> | |
| 5 | + | <h1>Meeting Tomorrow</h1> | |
| 6 | + | <p>Hi Max,</p> | |
| 7 | + | <p>Just confirming our meeting tomorrow at <strong>2pm</strong>. | |
| 8 | + | Please review the <a href="https://example.com/doc">document</a> beforehand.</p> | |
| 9 | + | <p>Best,<br>Alice</p> | |
| 10 | + | </body></html>"# | |
| 11 | + | } | |
| 12 | + | ||
/// Benchmark fixture: a newsletter-style HTML email body. The content (heading,
/// paragraph, bulleted list with bold text and a link, `<hr>`, and a small-print
/// unsubscribe link) is wrapped in two nested `<table>` elements, the outer one
/// carrying `role="presentation"`. A 1x1 `<img>` follows the tables, i.e. the
/// shape of a tracking pixel. Returned as a `'static` raw string.
| 13 | + | fn newsletter_email() -> &'static str { | |
| 14 | + | r#"<html><body> | |
| 15 | + | <table width="100%" cellpadding="0" cellspacing="0" role="presentation"> | |
| 16 | + | <tr><td align="center"> | |
| 17 | + | <table width="600" cellpadding="0" cellspacing="0"> | |
| 18 | + | <tr><td> | |
| 19 | + | <h2>Weekly Digest</h2> | |
| 20 | + | <p>Here are your updates:</p> | |
| 21 | + | <ul> | |
| 22 | + | <li>New feature: <strong>Dark mode</strong> is now available</li> | |
| 23 | + | <li>Bug fix: Resolved <a href="https://example.com/issue/123">issue #123</a></li> | |
| 24 | + | <li>Update: API v2 documentation published</li> | |
| 25 | + | </ul> | |
| 26 | + | <p>Thanks for reading!</p> | |
| 27 | + | <hr> | |
| 28 | + | <p><small>Unsubscribe: <a href="https://example.com/unsub">click here</a></small></p> | |
| 29 | + | </td></tr> | |
| 30 | + | </table> | |
| 31 | + | </td></tr> | |
| 32 | + | </table> | |
| 33 | + | <img src="https://track.example.com/open.gif" width="1" height="1"> | |
| 34 | + | </body></html>"# | |
| 35 | + | } | |
| 36 | + | ||
/// Benchmark fixture: an HTML reply chain. A top-level reply is followed by two
/// levels of nested quoting, each level being a `div.gmail_quote` that holds a
/// `div.gmail_attr` attribution line and a `blockquote.gmail_quote` with the
/// quoted message. Returned as a `'static` raw string.
| 37 | + | fn reply_chain() -> &'static str { | |
| 38 | + | r#"<html><body> | |
| 39 | + | <div dir="ltr"><p>Got it, thanks!</p></div> | |
| 40 | + | <div class="gmail_quote"> | |
| 41 | + | <div class="gmail_attr">On Tue, Jan 6, Bob wrote:</div> | |
| 42 | + | <blockquote class="gmail_quote"> | |
| 43 | + | <div dir="ltr"><p>Here's the update you requested.</p></div> | |
| 44 | + | <div class="gmail_quote"> | |
| 45 | + | <div class="gmail_attr">On Mon, Jan 5, Alice wrote:</div> | |
| 46 | + | <blockquote class="gmail_quote"> | |
| 47 | + | <div dir="ltr"><p>What's the status on the deployment?</p></div> | |
| 48 | + | </blockquote> | |
| 49 | + | </div> | |
| 50 | + | </blockquote> | |
| 51 | + | </div> | |
| 52 | + | </body></html>"# | |
| 53 | + | } | |
| 54 | + | ||
/// Builds the large benchmark fixture: an HTML document containing 100
/// sections, each an `<h3>Section N</h3>` heading (N = 0..=99) followed by the
/// same lorem-ipsum paragraph.
///
/// The output is byte-for-byte what the previous implementation produced; the
/// only changes are that the buffer is sized up front and headings are
/// formatted straight into it instead of through a temporary `String` per
/// iteration.
fn large_email() -> String {
    // Brings `write!` support for `String` into scope for this function only.
    use std::fmt::Write as _;

    const SECTION_COUNT: usize = 100;
    const OPEN: &str = "<html><body>";
    const CLOSE: &str = "</body></html>";
    // The longest heading emitted is the two-digit one.
    const MAX_HEADING_LEN: usize = "<h3>Section 99</h3>".len();

    // The trailing `\` joins the lines and drops the next line's leading
    // whitespace, so this is a single-line paragraph at runtime.
    let paragraph = "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. \
                     Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
                     Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris.</p>";

    // Upper bound on the final size, so the buffer never has to regrow.
    let mut html = String::with_capacity(
        OPEN.len() + CLOSE.len() + SECTION_COUNT * (MAX_HEADING_LEN + paragraph.len()),
    );

    html.push_str(OPEN);
    for i in 0..SECTION_COUNT {
        write!(html, "<h3>Section {i}</h3>").expect("writing to a String cannot fail");
        html.push_str(paragraph);
    }
    html.push_str(CLOSE);
    html
}
| 67 | + | ||
| 68 | + | fn bench_simple(c: &mut Criterion) { | |
| 69 | + | let html = simple_email(); | |
| 70 | + | c.bench_function("simple_email", |b| { | |
| 71 | + | b.iter(|| pter::convert(black_box(html))) | |
| 72 | + | }); | |
| 73 | + | } | |
| 74 | + | ||
| 75 | + | fn bench_newsletter(c: &mut Criterion) { | |
| 76 | + | let html = newsletter_email(); | |
| 77 | + | c.bench_function("newsletter_layout_tables", |b| { | |
| 78 | + | b.iter(|| pter::convert(black_box(html))) | |
| 79 | + | }); | |
| 80 | + | } | |
| 81 | + | ||
| 82 | + | fn bench_reply_chain(c: &mut Criterion) { | |
| 83 | + | let html = reply_chain(); | |
| 84 | + | c.bench_function("reply_chain_nested", |b| { | |
| 85 | + | b.iter(|| pter::convert(black_box(html))) | |
| 86 | + | }); | |
| 87 | + | } | |
| 88 | + | ||
| 89 | + | fn bench_large(c: &mut Criterion) { | |
| 90 | + | let html = large_email(); | |
| 91 | + | c.bench_function("large_100_sections", |b| { | |
| 92 | + | b.iter(|| pter::convert(black_box(&html))) | |
| 93 | + | }); | |
| 94 | + | } | |
| 95 | + | ||
// Collect the four benchmark functions into the `benches` group and let
// Criterion supply the binary's entry point; the bench target is declared with
// `harness = false` in Cargo.toml, so the default libtest harness is not used.
| 96 | + | criterion_group!(benches, bench_simple, bench_newsletter, bench_reply_chain, bench_large); | |
| 97 | + | criterion_main!(benches); |
| @@ -0,0 +1,47 @@ | |||
| 1 | + | # pter Architecture | |
| 2 | + | ||
| 3 | + | ## Overview | |
| 4 | + | ||
| 5 | + | pter converts HTML email bodies into readable markdown. It takes an HTML string and returns a markdown string. It does not handle MIME parsing, content extraction, or markdown rendering. | |
| 6 | + | ||
| 7 | + | ## Pipeline | |
| 8 | + | ||
| 9 | + | ``` | |
| 10 | + | html: &str | |
| 11 | + | → scraper::Html::parse_document() # html5ever DOM tree | |
| 12 | + | → walk_children(root) # depth-first traversal | |
| 13 | + | → handle_text() # whitespace collapsing, entity decoding | |
| 14 | + | → handle_element() # classify → skip / transparent / block / inline | |
| 15 | + | → handle_block() # paragraphs, headings, lists, blockquotes, pre, hr | |
| 16 | + | → handle_inline() # bold, italic, links, images, code, br | |
| 17 | + | → whitespace::normalize() # collapse blank lines, trim | |
| 18 | + | → String | |
| 19 | + | ``` | |
| 20 | + | ||
| 21 | + | ## Module Responsibilities | |
| 22 | + | ||
| 23 | + | | Module | Responsibility | | |
| 24 | + | |--------|---------------| | |
| 25 | + | | `lib.rs` | Public API (`convert`), re-exports | | |
| 26 | + | | `convert.rs` | DOM walker, `Context` state, element dispatch | | |
| 27 | + | | `elements.rs` | Element classification, tracking pixel / hidden detection | | |
| 28 | + | | `whitespace.rs` | Output normalization | | |
| 29 | + | | `tables.rs` | Table layout detection and unwrapping (Phase 2) | | |
| 30 | + | | `replies.rs` | Reply chain detection and quoting (Phase 3) | | |
| 31 | + | ||
| 32 | + | ## Design Decisions | |
| 33 | + | ||
| 34 | + | **scraper over html5ever directly**: We need tree traversal (parent/child/sibling access) for layout table unwrapping and reply chain detection. scraper provides this via ego-tree on top of html5ever's spec-compliant parsing. | |
| 35 | + | ||
| 36 | + | **Markdown output**: Markdown is readable as plain text and renderable by any toolchain. It preserves structural information (headings, links, lists) that plain text loses. | |
| 37 | + | ||
| 38 | + | **Faithful conversion**: pter converts what's there. Content extraction (stripping marketing wrappers) and post-processing (trimming signatures) are separate concerns, composable before or after pter. | |
| 39 | + | ||
| 40 | + | **Blockquote rendering**: Blockquotes render children into a temporary buffer, then prefix each line with `> `. This handles nested blockquotes naturally — inner quotes produce `> ` lines, outer quote prefixes them again to get `> > `. | |
| 41 | + | ||
| 42 | + | ## Dependencies | |
| 43 | + | ||
| 44 | + | | Crate | Purpose | | |
| 45 | + | |-------|---------| | |
| 46 | + | | `scraper` | HTML parsing + DOM tree + CSS selectors | | |
| 47 | + | | `proptest` (dev) | Property-based testing | |
| @@ -0,0 +1,95 @@ | |||
| 1 | + | # pter - Todo | |
| 2 | + | ||
| 3 | + | Done: Phases 1-5 (except publish). Active: None. Next: cargo publish when ready. | |
| 4 | + | ||
| 5 | + | v0.1.0. 116 tests. | |
| 6 | + | ||
| 7 | + | --- | |
| 8 | + | ||
| 9 | + | ## Phase 1: Core Conversion | |
| 10 | + | ||
| 11 | + | ### Done | |
| 12 | + | - [x] Crate scaffold (Cargo.toml, MIT license, README) | |
| 13 | + | - [x] HTML element to markdown conversion (p, h1-h6, strong, em, a, img, ul/ol/li, blockquote, pre/code, hr, br, del, sup, sub) | |
| 14 | + | - [x] Tracking pixel detection (1x1 img, empty src, data URI, inline style) | |
| 15 | + | - [x] Hidden element skipping (display:none, visibility:hidden) | |
| 16 | + | - [x] Whitespace normalization (collapse blank lines, trim) | |
| 17 | + | - [x] Script/style/head stripping | |
| 18 | + | - [x] Entity decoding (via html5ever) | |
| 19 | + | - [x] Link deduplication (text matches URL) | |
| 20 | + | - [x] Nested list indentation | |
| 21 | + | - [x] Nested blockquote rendering | |
| 22 | + | - [x] Pre/code block rendering (no double-wrap) | |
| 23 | + | ||
| 24 | + | --- | |
| 25 | + | ||
| 26 | + | ## Phase 2: Email Layout Unwrapping | |
| 27 | + | ||
| 28 | + | ### Done | |
| 29 | + | - [x] Layout table detection heuristic (layout vs data table) | |
| 30 | + | - [x] Single-cell table unwrapping | |
| 31 | + | - [x] Multi-column table linearization | |
| 32 | + | - [x] Data table rendering as markdown table | |
| 33 | + | - [x] Nested layout table recursion | |
| 34 | + | - [x] font-size:0 / line-height:0 / height:0+overflow:hidden spacer detection | |
| 35 | + | - [x] role="presentation" detection | |
| 36 | + | ||
| 37 | + | ### Deferred | |
| 38 | + | - [ ] Outlook conditional comment stripping (client-specific, low cross-platform value) | |
| 39 | + | ||
| 40 | + | --- | |
| 41 | + | ||
| 42 | + | ## Phase 3: Reply Chain Detection | |
| 43 | + | ||
| 44 | + | ### Done | |
| 45 | + | - [x] Reply boundary abstraction (`is_reply_boundary` predicate) | |
| 46 | + | - [x] Structural markers (type=cite) | |
| 47 | + | - [x] CSS class/ID markers (gmail_quote, divRplyFwdMsg, yahoo_quoted, protonmail_quote, tutanota_quote, moz-cite-prefix, zmail_extra) | |
| 48 | + | - [x] Attribution text detection (On ... wrote:, Forwarded message, Original Message, Begin forwarded message, French/German variants) | |
| 49 | + | - [x] Attribution line preservation above quote blocks | |
| 50 | + | - [x] Quote depth rendering via temporary buffer + `>` prefix | |
| 51 | + | - [x] Outlook separator detection (From/Sent/To/Subject blocks) | |
| 52 | + | - [x] Heuristic: div with attribution text followed by blockquote | |
| 53 | + | - [x] Previous sibling text scanning for attribution | |
| 54 | + | ||
| 55 | + | --- | |
| 56 | + | ||
| 57 | + | ## Phase 4: Integration | |
| 58 | + | ||
| 59 | + | ### Done | |
| 60 | + | - [x] GoingsOn: pter::convert() replaces strip_html in imap_client.rs extract_body_with_html() | |
| 61 | + | - [x] GoingsOn: removed ~230 lines of hand-rolled HTML stripping code + 30 tests (covered by pter) | |
| 62 | + | - [x] GoingsOn: path dep added to src-tauri/Cargo.toml | |
| 63 | + | - [x] Balanced Breakfast: pter::convert() replaces html2text in html_to_text + extract_article Rhai host functions | |
| 64 | + | - [x] Balanced Breakfast: html2text dependency removed from bb-core/Cargo.toml | |
| 65 | + | - [x] Both projects compile clean, BB tests pass (153 tests) | |
| 66 | + | ||
| 67 | + | --- | |
| 68 | + | ||
| 69 | + | ## Phase 5: Polish + Publish | |
| 70 | + | ||
| 71 | + | ### Done | |
| 72 | + | - [x] Property-based testing with proptest (7 fuzz strategies: never panics, no HTML leak, valid UTF-8, no triple newlines, no trailing whitespace, arbitrary bytes, whitespace-only) | |
| 73 | + | - [x] Edge case hardening (24 tests: empty, whitespace-only, deeply nested divs/blockquotes/lists, malformed HTML, unicode, large input, empty table cells, nested link formatting) | |
| 74 | + | - [x] Benchmarks with criterion (simple: 4µs, newsletter: 15µs, reply chain: 10µs, 100 sections: 101µs) | |
| 75 | + | ||
| 76 | + | ### Remaining | |
| 77 | + | - [ ] cargo publish to crates.io | |
| 78 | + | - [ ] Update GO and BB to crates.io version | |
| 79 | + | ||
| 80 | + | --- | |
| 81 | + | ||
| 82 | + | ## Key Paths | |
| 83 | + | ||
| 84 | + | | What | Where | | |
| 85 | + | |------|-------| | |
| 86 | + | | Public API | `src/lib.rs` | | |
| 87 | + | | Conversion pipeline | `src/convert.rs` | | |
| 88 | + | | Element classification | `src/elements.rs` | | |
| 89 | + | | Table handling | `src/tables.rs` | | |
| 90 | + | | Reply detection | `src/replies.rs` | | |
| 91 | + | | Whitespace normalization | `src/whitespace.rs` | | |
| 92 | + | | Integration tests | `tests/integration.rs` | | |
| 93 | + | | Edge case tests | `tests/edge_cases.rs` | | |
| 94 | + | | Property-based tests | `tests/proptest.rs` | | |
| 95 | + | | Benchmarks | `benches/convert_bench.rs` | |
| @@ -0,0 +1,689 @@ | |||
| 1 | + | use scraper::node::Node; | |
| 2 | + | use scraper::{ElementRef, Html}; | |
| 3 | + | ||
| 4 | + | use crate::elements::{self, BlockKind, ElementAction, InlineKind}; | |
| 5 | + | use crate::replies; | |
| 6 | + | use crate::tables; | |
| 7 | + | use crate::whitespace; | |
| 8 | + | ||
| 9 | + | /// Convert an HTML email body into readable markdown. | |
| 10 | + | /// | |
| 11 | + | /// This is the main entry point for pter. Pass in an HTML string | |
| 12 | + | /// (just the body, not MIME structure) and get back clean markdown. | |
| 13 | + | /// | |
| 14 | + | /// ``` | |
| 15 | + | /// let md = pter::convert("<p>Hello <strong>world</strong></p>"); | |
| 16 | + | /// assert_eq!(md, "Hello **world**"); | |
| 17 | + | /// ``` | |
| 18 | + | pub fn convert(html: &str) -> String { | |
| 19 | + | if html.is_empty() { | |
| 20 | + | return String::new(); | |
| 21 | + | } | |
| 22 | + | ||
| 23 | + | let document = Html::parse_document(html); | |
| 24 | + | let mut ctx = Context::new(); | |
| 25 | + | walk_children(document.root_element(), &mut ctx); | |
| 26 | + | whitespace::normalize(&ctx.output) | |
| 27 | + | } | |
| 28 | + | ||
/// Mutable conversion state threaded through the tree walk.
struct Context {
    /// Markdown accumulated so far (not yet normalized).
    output: String,
    /// Current list nesting depth (drives `list_indent`).
    list_depth: u32,
    /// Whether we're inside a `<pre>` block (text is copied verbatim).
    in_pre: bool,
    /// Whether we're inside an `<a>` tag (links must not nest).
    in_link: bool,
    /// One entry per open list, innermost last, so items know which
    /// marker to emit.
    list_stack: Vec<ListType>,
}

/// Kind of list currently being rendered.
#[derive(Clone, Copy)]
enum ListType {
    /// Bulleted list.
    Unordered,
    /// Numbered list; carries the running item counter.
    Ordered(u32),
}

impl Context {
    /// Create an empty context with a pre-sized output buffer.
    fn new() -> Self {
        Self {
            output: String::with_capacity(4096),
            list_depth: 0,
            in_pre: false,
            in_link: false,
            list_stack: Vec::new(),
        }
    }

    /// Append a string slice to the output.
    fn push(&mut self, s: &str) {
        self.output.push_str(s);
    }

    /// Append a single character to the output.
    fn push_char(&mut self, c: char) {
        self.output.push(c);
    }

    /// Make sure the output ends with a blank line (`"\n\n"`).
    ///
    /// Trailing spaces are dropped before the separator is added. Nothing
    /// happens when the output holds only spaces (or is empty), or when the
    /// text before any trailing spaces already ends with a blank line.
    fn ensure_blank_line(&mut self) {
        let kept = self.output.trim_end_matches(' ').len();
        if kept == 0 || self.output[..kept].ends_with("\n\n") {
            return;
        }
        self.output.truncate(kept);
        self.output.push_str("\n\n");
    }

    /// Make sure non-empty output ends with a newline.
    fn ensure_newline(&mut self) {
        if self.output.is_empty() || self.output.ends_with('\n') {
            return;
        }
        self.output.push('\n');
    }

    /// Indentation prefix for a list item at the current depth.
    ///
    /// Top-level items (depth 0 or 1) get no indent; each further level
    /// repeats the indent unit once.
    // NOTE(review): the indent unit is reproduced exactly as it appears in
    // the reviewed source; confirm its width against the repository, since
    // the diff rendering may have collapsed consecutive spaces.
    fn list_indent(&self) -> String {
        match self.list_depth {
            0 | 1 => String::new(),
            depth => " ".repeat((depth - 1) as usize),
        }
    }
}
| 92 | + | ||
| 93 | + | /// Walk all children of a node, converting each to markdown. | |
| 94 | + | fn walk_children(parent: ElementRef, ctx: &mut Context) { | |
| 95 | + | for child in parent.children() { | |
| 96 | + | match child.value() { | |
| 97 | + | Node::Text(text) => { | |
| 98 | + | handle_text(&text.text, ctx); | |
| 99 | + | } | |
| 100 | + | Node::Element(_) => { | |
| 101 | + | if let Some(el_ref) = ElementRef::wrap(child) { | |
| 102 | + | handle_element(el_ref, ctx); | |
| 103 | + | } | |
| 104 | + | } | |
| 105 | + | _ => {} | |
| 106 | + | } | |
| 107 | + | } | |
| 108 | + | } | |
| 109 | + | ||
| 110 | + | /// Handle a text node. | |
| 111 | + | fn handle_text(text: &str, ctx: &mut Context) { | |
| 112 | + | if ctx.in_pre { | |
| 113 | + | ctx.push(text); | |
| 114 | + | return; | |
| 115 | + | } | |
| 116 | + | ||
| 117 | + | // Collapse whitespace in normal flow | |
| 118 | + | let mut last_was_space = ctx.output.ends_with(' ') || ctx.output.ends_with('\n'); | |
| 119 | + | for ch in text.chars() { | |
| 120 | + | if ch.is_ascii_whitespace() { | |
| 121 | + | if !last_was_space { | |
| 122 | + | ctx.push_char(' '); | |
| 123 | + | last_was_space = true; | |
| 124 | + | } | |
| 125 | + | } else { | |
| 126 | + | ctx.push_char(ch); | |
| 127 | + | last_was_space = false; | |
| 128 | + | } | |
| 129 | + | } | |
| 130 | + | } | |
| 131 | + | ||
| 132 | + | /// Handle an element node — classify it and render accordingly. | |
| 133 | + | fn handle_element(el: ElementRef, ctx: &mut Context) { | |
| 134 | + | let element = el.value(); | |
| 135 | + | ||
| 136 | + | // Check hidden elements | |
| 137 | + | if elements::is_hidden(element) { | |
| 138 | + | return; | |
| 139 | + | } | |
| 140 | + | ||
| 141 | + | // Check for reply boundaries before normal classification. | |
| 142 | + | // Reply boundaries (gmail_quote, type=cite, etc.) get rendered | |
| 143 | + | // as blockquotes regardless of their actual element type. | |
| 144 | + | if replies::is_reply_boundary(el) { | |
| 145 | + | render_reply_block(el, ctx); | |
| 146 | + | return; | |
| 147 | + | } | |
| 148 | + | ||
| 149 | + | // Check for Outlook-style "From: ... Sent: ..." separator blocks. | |
| 150 | + | // These introduce quoted content that follows them. | |
| 151 | + | if replies::is_outlook_separator(el) { | |
| 152 | + | ctx.ensure_blank_line(); | |
| 153 | + | // Render the separator header as attribution | |
| 154 | + | let text: String = el.text().collect(); | |
| 155 | + | let trimmed = text.split_whitespace().collect::<Vec<_>>().join(" "); | |
| 156 | + | ctx.push(&trimmed); | |
| 157 | + | ctx.ensure_blank_line(); | |
| 158 | + | return; | |
| 159 | + | } | |
| 160 | + | ||
| 161 | + | match elements::classify(element) { | |
| 162 | + | ElementAction::Skip => {} | |
| 163 | + | ElementAction::Transparent => walk_children(el, ctx), | |
| 164 | + | ElementAction::Block(kind) => handle_block(el, ctx, kind), | |
| 165 | + | ElementAction::Inline(kind) => handle_inline(el, ctx, kind), | |
| 166 | + | } | |
| 167 | + | } | |
| 168 | + | ||
| 169 | + | fn handle_block(el: ElementRef, ctx: &mut Context, kind: BlockKind) { | |
| 170 | + | match kind { | |
| 171 | + | BlockKind::Paragraph => { | |
| 172 | + | ctx.ensure_blank_line(); | |
| 173 | + | walk_children(el, ctx); | |
| 174 | + | ctx.ensure_blank_line(); | |
| 175 | + | } | |
| 176 | + | ||
| 177 | + | BlockKind::Heading(level) => { | |
| 178 | + | ctx.ensure_blank_line(); | |
| 179 | + | let prefix = "#".repeat(level as usize); | |
| 180 | + | ctx.push(&prefix); | |
| 181 | + | ctx.push_char(' '); | |
| 182 | + | walk_children(el, ctx); | |
| 183 | + | ctx.ensure_blank_line(); | |
| 184 | + | } | |
| 185 | + | ||
| 186 | + | BlockKind::Blockquote => { | |
| 187 | + | ctx.ensure_blank_line(); | |
| 188 | + | // Render children into a temporary buffer, then prefix each line with > | |
| 189 | + | let mut inner_ctx = Context::new(); | |
| 190 | + | inner_ctx.in_pre = ctx.in_pre; | |
| 191 | + | inner_ctx.in_link = ctx.in_link; | |
| 192 | + | walk_children(el, &mut inner_ctx); | |
| 193 | + | let inner = whitespace::normalize(&inner_ctx.output); | |
| 194 | + | for line in inner.lines() { | |
| 195 | + | ctx.push("> "); | |
| 196 | + | ctx.push(line); | |
| 197 | + | ctx.push_char('\n'); | |
| 198 | + | } | |
| 199 | + | ctx.push_char('\n'); | |
| 200 | + | } | |
| 201 | + | ||
| 202 | + | BlockKind::UnorderedList => { | |
| 203 | + | ctx.ensure_blank_line(); | |
| 204 | + | ctx.list_depth += 1; | |
| 205 | + | ctx.list_stack.push(ListType::Unordered); | |
| 206 | + | walk_children(el, ctx); | |
| 207 | + | ctx.list_stack.pop(); | |
| 208 | + | ctx.list_depth -= 1; | |
| 209 | + | ctx.ensure_blank_line(); | |
| 210 | + | } | |
| 211 | + | ||
| 212 | + | BlockKind::OrderedList => { | |
| 213 | + | ctx.ensure_blank_line(); | |
| 214 | + | ctx.list_depth += 1; | |
| 215 | + | ctx.list_stack.push(ListType::Ordered(0)); | |
| 216 | + | walk_children(el, ctx); | |
| 217 | + | ctx.list_stack.pop(); | |
| 218 | + | ctx.list_depth -= 1; | |
| 219 | + | ctx.ensure_blank_line(); | |
| 220 | + | } | |
| 221 | + | ||
| 222 | + | BlockKind::ListItem => { | |
| 223 | + | ctx.ensure_newline(); | |
| 224 | + | let indent = ctx.list_indent(); | |
| 225 | + | ctx.push(&indent); | |
| 226 | + | ||
| 227 | + | // Determine bullet or number | |
| 228 | + | let marker = match ctx.list_stack.last_mut() { | |
| 229 | + | Some(ListType::Unordered) => "- ".to_string(), | |
| 230 | + | Some(ListType::Ordered(n)) => { | |
| 231 | + | *n += 1; | |
| 232 | + | format!("{}. ", *n) | |
| 233 | + | } | |
| 234 | + | None => "- ".to_string(), | |
| 235 | + | }; | |
| 236 | + | ctx.push(&marker); | |
| 237 | + | walk_children(el, ctx); | |
| 238 | + | ctx.ensure_newline(); | |
| 239 | + | } | |
| 240 | + | ||
| 241 | + | BlockKind::PreFormatted => { | |
| 242 | + | ctx.ensure_blank_line(); | |
| 243 | + | ctx.push("```\n"); | |
| 244 | + | ctx.in_pre = true; | |
| 245 | + | walk_children(el, ctx); | |
| 246 | + | ctx.in_pre = false; | |
| 247 | + | ctx.ensure_newline(); | |
| 248 | + | ctx.push("```"); | |
| 249 | + | ctx.ensure_blank_line(); | |
| 250 | + | } | |
| 251 | + | ||
| 252 | + | BlockKind::HorizontalRule => { | |
| 253 | + | ctx.ensure_blank_line(); | |
| 254 | + | ctx.push("---"); | |
| 255 | + | ctx.ensure_blank_line(); | |
| 256 | + | } | |
| 257 | + | ||
| 258 | + | BlockKind::Table => { | |
| 259 | + | ctx.ensure_blank_line(); | |
| 260 | + | if tables::is_data_table(el) { | |
| 261 | + | let (headers, rows) = tables::extract_table_data(el); | |
| 262 | + | let md = tables::render_markdown_table(&headers, &rows); | |
| 263 | + | if !md.is_empty() { | |
| 264 | + | ctx.push(&md); | |
| 265 | + | } | |
| 266 | + | } else { | |
| 267 | + | // Layout table — unwrap and render cell contents directly | |
| 268 | + | render_layout_table(el, ctx); | |
| 269 | + | } | |
| 270 | + | ctx.ensure_blank_line(); | |
| 271 | + | } | |
| 272 | + | ||
| 273 | + | BlockKind::Div => { | |
| 274 | + | // Divs act as block separators but don't add their own markup | |
| 275 | + | ctx.ensure_blank_line(); | |
| 276 | + | walk_children(el, ctx); | |
| 277 | + | ctx.ensure_blank_line(); | |
| 278 | + | } | |
| 279 | + | } | |
| 280 | + | } | |
| 281 | + | ||
| 282 | + | fn handle_inline(el: ElementRef, ctx: &mut Context, kind: InlineKind) { | |
| 283 | + | match kind { | |
| 284 | + | InlineKind::Bold => { | |
| 285 | + | ctx.push("**"); | |
| 286 | + | walk_children(el, ctx); | |
| 287 | + | ctx.push("**"); | |
| 288 | + | } | |
| 289 | + | ||
| 290 | + | InlineKind::Italic => { | |
| 291 | + | ctx.push("*"); | |
| 292 | + | walk_children(el, ctx); | |
| 293 | + | ctx.push("*"); | |
| 294 | + | } | |
| 295 | + | ||
| 296 | + | InlineKind::Strikethrough => { | |
| 297 | + | ctx.push("~~"); | |
| 298 | + | walk_children(el, ctx); | |
| 299 | + | ctx.push("~~"); | |
| 300 | + | } | |
| 301 | + | ||
| 302 | + | InlineKind::Code => { | |
| 303 | + | if ctx.in_pre { | |
| 304 | + | // Inside a <pre>, don't double-wrap | |
| 305 | + | walk_children(el, ctx); | |
| 306 | + | } else { | |
| 307 | + | ctx.push("`"); | |
| 308 | + | walk_children(el, ctx); | |
| 309 | + | ctx.push("`"); | |
| 310 | + | } | |
| 311 | + | } | |
| 312 | + | ||
| 313 | + | InlineKind::Link => { | |
| 314 | + | if ctx.in_link { | |
| 315 | + | // Don't nest links | |
| 316 | + | walk_children(el, ctx); | |
| 317 | + | return; | |
| 318 | + | } | |
| 319 | + | ||
| 320 | + | let href = el.value().attr("href").unwrap_or(""); | |
| 321 | + | ||
| 322 | + | if href.is_empty() || href == "#" { | |
| 323 | + | walk_children(el, ctx); | |
| 324 | + | return; | |
| 325 | + | } | |
| 326 | + | ||
| 327 | + | // Collect the link text | |
| 328 | + | let mut text_ctx = Context::new(); | |
| 329 | + | text_ctx.in_link = true; | |
| 330 | + | walk_children(el, &mut text_ctx); | |
| 331 | + | let text = text_ctx.output.trim().to_string(); | |
| 332 | + | ||
| 333 | + | if text.is_empty() { | |
| 334 | + | // Link with no text — just show the URL | |
| 335 | + | ctx.push(href); | |
| 336 | + | } else if text == href { | |
| 337 | + | // Link text matches URL — no need for markdown link syntax | |
| 338 | + | ctx.push(href); | |
| 339 | + | } else { | |
| 340 | + | ctx.push("["); | |
| 341 | + | ctx.push(&text); | |
| 342 | + | ctx.push("]("); | |
| 343 | + | ctx.push(href); | |
| 344 | + | ctx.push(")"); | |
| 345 | + | } | |
| 346 | + | } | |
| 347 | + | ||
| 348 | + | InlineKind::Image => { | |
| 349 | + | let element = el.value(); | |
| 350 | + | if elements::is_tracking_pixel(element) { | |
| 351 | + | return; | |
| 352 | + | } | |
| 353 | + | ||
| 354 | + | let alt = element.attr("alt").unwrap_or(""); | |
| 355 | + | let src = element.attr("src").unwrap_or(""); | |
| 356 | + | ||
| 357 | + | if src.is_empty() { | |
| 358 | + | return; | |
| 359 | + | } | |
| 360 | + | ||
| 361 | + | ctx.push("; | |
| 364 | + | ctx.push(src); | |
| 365 | + | ctx.push(")"); | |
| 366 | + | } | |
| 367 | + | ||
| 368 | + | InlineKind::LineBreak => { | |
| 369 | + | ctx.push_char('\n'); | |
| 370 | + | } | |
| 371 | + | ||
| 372 | + | InlineKind::Superscript => { | |
| 373 | + | ctx.push("^"); | |
| 374 | + | walk_children(el, ctx); | |
| 375 | + | } | |
| 376 | + | ||
| 377 | + | InlineKind::Subscript => { | |
| 378 | + | ctx.push("~"); | |
| 379 | + | walk_children(el, ctx); | |
| 380 | + | } | |
| 381 | + | } | |
| 382 | + | } | |
| 383 | + | ||
| 384 | + | /// Render a reply boundary as a quoted block. | |
| 385 | + | /// | |
| 386 | + | /// This is the same rendering logic as `<blockquote>` — children are | |
| 387 | + | /// rendered into a temporary buffer and each line gets `> ` prefixed. | |
| 388 | + | /// Attribution lines (e.g. "On ... wrote:") are rendered above the quote. | |
| 389 | + | fn render_reply_block(el: ElementRef, ctx: &mut Context) { | |
| 390 | + | ctx.ensure_blank_line(); | |
| 391 | + | ||
| 392 | + | // Look for attribution text | |
| 393 | + | if let Some(attribution) = replies::find_attribution(el) { | |
| 394 | + | ctx.push(&attribution); | |
| 395 | + | ctx.push_char('\n'); | |
| 396 | + | } | |
| 397 | + | ||
| 398 | + | // Render children into temp buffer, then prefix with > | |
| 399 | + | let mut inner_ctx = Context::new(); | |
| 400 | + | inner_ctx.in_pre = ctx.in_pre; | |
| 401 | + | inner_ctx.in_link = ctx.in_link; | |
| 402 | + | walk_children(el, &mut inner_ctx); | |
| 403 | + | let inner = whitespace::normalize(&inner_ctx.output); | |
| 404 | + | ||
| 405 | + | if !inner.is_empty() { | |
| 406 | + | for line in inner.lines() { | |
| 407 | + | ctx.push("> "); | |
| 408 | + | ctx.push(line); | |
| 409 | + | ctx.push_char('\n'); | |
| 410 | + | } | |
| 411 | + | ctx.push_char('\n'); | |
| 412 | + | } | |
| 413 | + | } | |
| 414 | + | ||
| 415 | + | /// Unwrap a layout table by rendering cell contents sequentially. | |
| 416 | + | /// | |
| 417 | + | /// Walks through rows and cells, rendering each cell's content as if | |
| 418 | + | /// the table wrapper didn't exist. This handles the common email pattern | |
| 419 | + | /// of wrapping everything in `<table><tr><td>...</td></tr></table>`. | |
| 420 | + | fn render_layout_table(table: ElementRef, ctx: &mut Context) { | |
| 421 | + | for descendant in table.descendants() { | |
| 422 | + | if let Some(el_ref) = ElementRef::wrap(descendant) { | |
| 423 | + | let name = el_ref.value().name(); | |
| 424 | + | if name == "td" || name == "th" { | |
| 425 | + | // Check if the cell itself is hidden | |
| 426 | + | if !elements::is_hidden(el_ref.value()) { | |
| 427 | + | walk_children(el_ref, ctx); | |
| 428 | + | ctx.ensure_blank_line(); | |
| 429 | + | } | |
| 430 | + | } | |
| 431 | + | } | |
| 432 | + | } | |
| 433 | + | } | |
| 434 | + | ||
| 435 | + | #[cfg(test)] | |
| 436 | + | mod tests { | |
| 437 | + | use super::*; | |
| 438 | + | ||
| 439 | + | // -- Basic elements -- | |
| 440 | + | ||
| 441 | + | #[test] | |
| 442 | + | fn empty_input() { | |
| 443 | + | assert_eq!(convert(""), ""); | |
| 444 | + | } | |
| 445 | + | ||
| 446 | + | #[test] | |
| 447 | + | fn plain_text() { | |
| 448 | + | assert_eq!(convert("hello world"), "hello world"); | |
| 449 | + | } | |
| 450 | + | ||
| 451 | + | #[test] | |
| 452 | + | fn paragraph() { | |
| 453 | + | assert_eq!(convert("<p>one</p><p>two</p>"), "one\n\ntwo"); | |
| 454 | + | } | |
| 455 | + | ||
| 456 | + | #[test] | |
| 457 | + | fn headings() { | |
| 458 | + | assert_eq!(convert("<h1>Title</h1>"), "# Title"); | |
| 459 | + | assert_eq!(convert("<h3>Sub</h3>"), "### Sub"); | |
| 460 | + | } | |
| 461 | + | ||
| 462 | + | #[test] | |
| 463 | + | fn bold_and_italic() { | |
| 464 | + | assert_eq!( | |
| 465 | + | convert("<p><strong>bold</strong> and <em>italic</em></p>"), | |
| 466 | + | "**bold** and *italic*" | |
| 467 | + | ); | |
| 468 | + | } | |
| 469 | + | ||
| 470 | + | #[test] | |
| 471 | + | fn link() { | |
| 472 | + | assert_eq!( | |
| 473 | + | convert(r#"<a href="https://example.com">click</a>"#), | |
| 474 | + | "[click](https://example.com)" | |
| 475 | + | ); | |
| 476 | + | } | |
| 477 | + | ||
| 478 | + | #[test] | |
| 479 | + | fn link_text_matches_url() { | |
| 480 | + | assert_eq!( | |
| 481 | + | convert(r#"<a href="https://example.com">https://example.com</a>"#), | |
| 482 | + | "https://example.com" | |
| 483 | + | ); | |
| 484 | + | } | |
| 485 | + | ||
| 486 | + | #[test] | |
| 487 | + | fn link_empty_href() { | |
| 488 | + | assert_eq!(convert(r#"<a href="">click</a>"#), "click"); | |
| 489 | + | } | |
| 490 | + | ||
| 491 | + | #[test] | |
| 492 | + | fn image() { | |
| 493 | + | assert_eq!( | |
| 494 | + | convert(r#"<img src="photo.jpg" alt="A photo">"#), | |
| 495 | + | "" | |
| 496 | + | ); | |
| 497 | + | } | |
| 498 | + | ||
| 499 | + | #[test] | |
| 500 | + | fn tracking_pixel_skipped() { |
Lines truncated
| @@ -0,0 +1,156 @@ | |||
| 1 | + | use scraper::node::Element; | |
| 2 | + | ||
/// What kind of markdown wrapper an element produces.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ElementAction {
    /// Skip this element and all its children entirely.
    Skip,
    /// Render children only, no wrapper (transparent element).
    Transparent,
    /// Block element with specific rendering.
    Block(BlockKind),
    /// Inline element with specific rendering.
    Inline(InlineKind),
}

/// Block-level constructs the converter knows how to render.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockKind {
    /// `<p>`
    Paragraph,
    /// `<h1>`..`<h6>`; the payload is the heading level (1-6).
    Heading(u8),
    /// `<blockquote>`
    Blockquote,
    /// `<ul>` / `<menu>`
    UnorderedList,
    /// `<ol>`
    OrderedList,
    /// `<li>`
    ListItem,
    /// `<pre>`
    PreFormatted,
    /// `<hr>`
    HorizontalRule,
    /// `<table>`
    Table,
    /// `<div>` and other generic sectioning containers.
    Div,
}

/// Inline constructs the converter knows how to render.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InlineKind {
    /// `<strong>` / `<b>`
    Bold,
    /// `<em>` / `<i>`
    Italic,
    /// `<del>` / `<s>` / `<strike>`
    Strikethrough,
    /// `<code>` / `<tt>`
    Code,
    /// `<a>`
    Link,
    /// `<img>`
    Image,
    /// `<br>`
    LineBreak,
    /// `<sup>`
    Superscript,
    /// `<sub>`
    Subscript,
}
| 39 | + | ||
| 40 | + | /// Classify an HTML element into the action pter should take. | |
| 41 | + | pub fn classify(el: &Element) -> ElementAction { | |
| 42 | + | match el.name() { | |
| 43 | + | // Skip entirely | |
| 44 | + | "script" | "style" | "head" | "meta" | "link" | "title" | "noscript" => { | |
| 45 | + | ElementAction::Skip | |
| 46 | + | } | |
| 47 | + | ||
| 48 | + | // Block elements | |
| 49 | + | "p" => ElementAction::Block(BlockKind::Paragraph), | |
| 50 | + | "h1" => ElementAction::Block(BlockKind::Heading(1)), | |
| 51 | + | "h2" => ElementAction::Block(BlockKind::Heading(2)), | |
| 52 | + | "h3" => ElementAction::Block(BlockKind::Heading(3)), | |
| 53 | + | "h4" => ElementAction::Block(BlockKind::Heading(4)), | |
| 54 | + | "h5" => ElementAction::Block(BlockKind::Heading(5)), | |
| 55 | + | "h6" => ElementAction::Block(BlockKind::Heading(6)), | |
| 56 | + | "blockquote" => ElementAction::Block(BlockKind::Blockquote), | |
| 57 | + | "ul" | "menu" => ElementAction::Block(BlockKind::UnorderedList), | |
| 58 | + | "ol" => ElementAction::Block(BlockKind::OrderedList), | |
| 59 | + | "li" => ElementAction::Block(BlockKind::ListItem), | |
| 60 | + | "pre" => ElementAction::Block(BlockKind::PreFormatted), | |
| 61 | + | "hr" => ElementAction::Block(BlockKind::HorizontalRule), | |
| 62 | + | "table" => ElementAction::Block(BlockKind::Table), | |
| 63 | + | // Table sub-elements are handled by the Table block handler, not individually | |
| 64 | + | "thead" | "tbody" | "tfoot" | "tr" | "td" | "th" | "caption" | "colgroup" | "col" => { | |
| 65 | + | ElementAction::Transparent | |
| 66 | + | } | |
| 67 | + | "div" | "section" | "article" | "main" | "header" | "footer" | "nav" | "aside" | |
| 68 | + | | "figure" | "figcaption" | "details" | "summary" => { | |
| 69 | + | ElementAction::Block(BlockKind::Div) | |
| 70 | + | } | |
| 71 | + | ||
| 72 | + | // Inline elements | |
| 73 | + | "strong" | "b" => ElementAction::Inline(InlineKind::Bold), | |
| 74 | + | "em" | "i" => ElementAction::Inline(InlineKind::Italic), | |
| 75 | + | "del" | "s" | "strike" => ElementAction::Inline(InlineKind::Strikethrough), | |
| 76 | + | "code" | "tt" => ElementAction::Inline(InlineKind::Code), | |
| 77 | + | "a" => ElementAction::Inline(InlineKind::Link), | |
| 78 | + | "img" => ElementAction::Inline(InlineKind::Image), | |
| 79 | + | "br" => ElementAction::Inline(InlineKind::LineBreak), | |
| 80 | + | "sup" => ElementAction::Inline(InlineKind::Superscript), | |
| 81 | + | "sub" => ElementAction::Inline(InlineKind::Subscript), | |
| 82 | + | ||
| 83 | + | // Everything else: transparent (render children) | |
| 84 | + | _ => ElementAction::Transparent, | |
| 85 | + | } | |
| 86 | + | } | |
| 87 | + | ||
| 88 | + | /// Check if an <img> element is a tracking pixel. | |
| 89 | + | /// Returns true if it should be skipped. | |
| 90 | + | pub fn is_tracking_pixel(el: &Element) -> bool { | |
| 91 | + | let width = el.attr("width"); | |
| 92 | + | let height = el.attr("height"); | |
| 93 | + | ||
| 94 | + | // 1x1 or 0x0 images | |
| 95 | + | if matches!(width, Some("1" | "0")) || matches!(height, Some("1" | "0")) { | |
| 96 | + | return true; | |
| 97 | + | } | |
| 98 | + | ||
| 99 | + | // No src attribute | |
| 100 | + | let Some(src) = el.attr("src") else { | |
| 101 | + | return true; | |
| 102 | + | }; | |
| 103 | + | ||
| 104 | + | // Empty or data:image/gif (common transparent pixel) | |
| 105 | + | if src.is_empty() { | |
| 106 | + | return true; | |
| 107 | + | } | |
| 108 | + | if src.starts_with("data:image/gif;base64,R0lGOD") { | |
| 109 | + | return true; | |
| 110 | + | } | |
| 111 | + | ||
| 112 | + | // Check inline style for tiny dimensions | |
| 113 | + | if let Some(style) = el.attr("style") { | |
| 114 | + | let style_lower = style.to_lowercase(); | |
| 115 | + | if style_lower.contains("width:1px") | |
| 116 | + | || style_lower.contains("width: 1px") | |
| 117 | + | || style_lower.contains("width:0") | |
| 118 | + | || style_lower.contains("height:1px") | |
| 119 | + | || style_lower.contains("height: 1px") | |
| 120 | + | || style_lower.contains("height:0") | |
| 121 | + | || style_lower.contains("display:none") | |
| 122 | + | || style_lower.contains("display: none") | |
| 123 | + | { | |
| 124 | + | return true; | |
| 125 | + | } | |
| 126 | + | } | |
| 127 | + | ||
| 128 | + | false | |
| 129 | + | } | |
| 130 | + | ||
| 131 | + | /// Check if an element is hidden via inline style. | |
| 132 | + | /// | |
| 133 | + | /// Catches display:none, visibility:hidden, and spacer tricks | |
| 134 | + | /// like font-size:0 or line-height:0 (commonly used in email templates). | |
| 135 | + | pub fn is_hidden(el: &Element) -> bool { | |
| 136 | + | if let Some(style) = el.attr("style") { | |
| 137 | + | let s = style.to_lowercase(); | |
| 138 | + | if s.contains("display:none") | |
| 139 | + | || s.contains("display: none") | |
| 140 | + | || s.contains("visibility:hidden") | |
| 141 | + | || s.contains("visibility: hidden") | |
| 142 | + | || s.contains("font-size:0") | |
| 143 | + | || s.contains("font-size: 0") | |
| 144 | + | || s.contains("line-height:0") | |
| 145 | + | || s.contains("line-height: 0") | |
| 146 | + | || (s.contains("height:0") && s.contains("overflow:hidden")) | |
| 147 | + | || (s.contains("height: 0") && s.contains("overflow: hidden")) | |
| 148 | + | || s.contains("max-height:0") | |
| 149 | + | || s.contains("max-height: 0") | |
| 150 | + | { | |
| 151 | + | return true; | |
| 152 | + | } | |
| 153 | + | } | |
| 154 | + | false | |
| 155 | + | } | |
| 156 | + |
| @@ -0,0 +1,11 @@ | |||
| 1 | + | //! pter — Plain Text Email Renderer | |
| 2 | + | //! | |
| 3 | + | //! Converts HTML email bodies into readable markdown. | |
| 4 | + | ||
| 5 | + | mod convert; | |
| 6 | + | mod elements; | |
| 7 | + | mod replies; | |
| 8 | + | mod tables; | |
| 9 | + | mod whitespace; | |
| 10 | + | ||
| 11 | + | pub use convert::convert; |
| @@ -0,0 +1,324 @@ | |||
| 1 | + | use scraper::node::Node; | |
| 2 | + | use scraper::ElementRef; | |
| 3 | + | ||
| 4 | + | /// Check if an element marks the beginning of a quoted reply. | |
| 5 | + | /// | |
| 6 | + | /// This is the central abstraction for reply detection. Rather than | |
| 7 | + | /// building per-client logic throughout the converter, all client-specific | |
| 8 | + | /// knowledge lives here behind a single predicate. | |
| 9 | + | /// | |
| 10 | + | /// An element is a reply boundary if it's a container that wraps quoted | |
| 11 | + | /// content from a previous message in the thread. The converter treats | |
| 12 | + | /// these identically to `<blockquote>` — children get `>` prefixed. | |
| 13 | + | pub fn is_reply_boundary(el: ElementRef) -> bool { | |
| 14 | + | let element = el.value(); | |
| 15 | + | let name = element.name(); | |
| 16 | + | ||
| 17 | + | // <blockquote> is already handled by the element classifier. | |
| 18 | + | // This function catches non-blockquote reply wrappers. | |
| 19 | + | ||
| 20 | + | // Structural: elements with type="cite" (Apple Mail, some webmail) | |
| 21 | + | if element.attr("type") == Some("cite") { | |
| 22 | + | return true; | |
| 23 | + | } | |
| 24 | + | ||
| 25 | + | // Class/ID-based detection — thin per-client checks | |
| 26 | + | if element.attr("class").is_some_and(is_reply_class) { | |
| 27 | + | return true; | |
| 28 | + | } | |
| 29 | + | ||
| 30 | + | if element.attr("id").is_some_and(is_reply_id) { | |
| 31 | + | return true; | |
| 32 | + | } | |
| 33 | + | ||
| 34 | + | // Heuristic: a <div> whose first meaningful text child matches | |
| 35 | + | // an attribution pattern ("On ... wrote:") followed by a blockquote | |
| 36 | + | if name == "div" && has_attribution_then_quote(el) { | |
| 37 | + | return true; | |
| 38 | + | } | |
| 39 | + | ||
| 40 | + | false | |
| 41 | + | } | |
| 42 | + | ||
| 43 | + | /// Extract an attribution line from just before or at the start of a reply boundary. | |
| 44 | + | /// | |
| 45 | + | /// Returns the attribution text (e.g. "On Mon, Jan 5, Alice wrote:") if found, | |
| 46 | + | /// so the converter can render it above the quoted block. | |
| 47 | + | pub fn find_attribution(el: ElementRef) -> Option<String> { | |
| 48 | + | // Check the element's own leading text for attribution patterns | |
| 49 | + | for child in el.children() { | |
| 50 | + | match child.value() { | |
| 51 | + | Node::Text(text) => { | |
| 52 | + | let trimmed = text.text.trim(); | |
| 53 | + | if is_attribution_text(trimmed) { | |
| 54 | + | return Some(trimmed.to_string()); | |
| 55 | + | } | |
| 56 | + | } | |
| 57 | + | Node::Element(_) => { | |
| 58 | + | // Stop at the first child element — attribution is leading text | |
| 59 | + | break; | |
| 60 | + | } | |
| 61 | + | _ => {} | |
| 62 | + | } | |
| 63 | + | } | |
| 64 | + | ||
| 65 | + | // Check for a preceding sibling text node or element with attribution | |
| 66 | + | if let Some(prev) = previous_sibling_text(el) { | |
| 67 | + | let trimmed = prev.trim().to_string(); | |
| 68 | + | if is_attribution_text(&trimmed) { | |
| 69 | + | return Some(trimmed); | |
| 70 | + | } | |
| 71 | + | } | |
| 72 | + | ||
| 73 | + | None | |
| 74 | + | } | |
| 75 | + | ||
/// Check if text matches common email attribution patterns.
///
/// Two families of lines are recognised:
///
/// * Forward separators, matched anywhere in the text:
///   `Forwarded message`, `Begin forwarded message`, `Original Message`.
/// * Reply attributions, which must end in a colon. Whitespace before the
///   colon (including the non-breaking space French typography puts there)
///   is ignored:
///   - English: `On ... wrote:`
///   - French: `Le ... a écrit :`
///   - Spanish: `El ... escribió:`
///   - German: `Am ... schrieb:` and `Am ... schrieb <name>:`
///
/// Surrounding whitespace of `text` is ignored. Matching is case-sensitive.
fn is_attribution_text(text: &str) -> bool {
    const FORWARD_MARKERS: [&str; 3] = [
        "Forwarded message",
        "Begin forwarded message",
        "Original Message",
    ];

    let t = text.trim();

    if FORWARD_MARKERS.iter().any(|&marker| t.contains(marker)) {
        return true;
    }

    // Every reply attribution ends with ':'; drop it together with any
    // (possibly non-breaking) space in front of it so "écrit :" and
    // "écrit:" are treated alike.
    let Some(stem) = t.strip_suffix(':') else {
        return false;
    };
    let stem = stem.trim_end();

    // "On ... wrote:" (Gmail, Apple Mail, Thunderbird, most clients)
    let english = t.starts_with("On ") && stem.ends_with("wrote");
    // "Le ... a écrit :" — only the accent-free tail of the verb is compared.
    let french = t.starts_with("Le ") && stem.ends_with("crit");
    // "El ... escribió:"; the "crit" tail is kept so every line accepted
    // before this change is still accepted.
    let spanish =
        t.starts_with("El ") && (stem.ends_with("escribió") || stem.ends_with("crit"));
    // German puts the sender either before the verb ("... schrieb:") or
    // after it ("... schrieb Alice <a@b.c>:").
    let german =
        t.starts_with("Am ") && (stem.ends_with("schrieb") || stem.contains(" schrieb "));

    english || french || spanish || german
}
| 107 | + | ||
/// Report whether a `class` attribute value names a known reply wrapper.
///
/// The attribute is split on whitespace and each class name is compared
/// exactly (case-sensitively) against the table below. Supporting another
/// client means adding one entry.
fn is_reply_class(class: &str) -> bool {
    const REPLY_CLASSES: [&str; 8] = [
        "gmail_quote",
        "gmail_extra",
        "yahoo_quoted",
        "protonmail_quote",
        "tutanota_quote",
        "moz-cite-prefix", // Thunderbird
        "zmail_extra",     // Zoho
        "WordSection1",    // Outlook (sometimes wraps replies)
    ];

    class
        .split_whitespace()
        .any(|token| REPLY_CLASSES.contains(&token))
}
| 125 | + | ||
/// Report whether an `id` attribute value names a known reply wrapper.
///
/// The whole value must match one of the ids exactly (case-sensitively).
fn is_reply_id(id: &str) -> bool {
    const REPLY_IDS: [&str; 3] = [
        "divRplyFwdMsg",        // Outlook
        "reply-message",        // Generic
        "OLK_SRC_BODY_SECTION", // Outlook Mac
    ];

    REPLY_IDS.contains(&id)
}
| 135 | + | ||
| 136 | + | /// Check if a div contains attribution text followed by a blockquote. | |
| 137 | + | /// | |
| 138 | + | /// This catches the common pattern where no class/id is present but | |
| 139 | + | /// the structure is: `<div>On ... wrote:<br><blockquote>...</blockquote></div>` | |
| 140 | + | fn has_attribution_then_quote(el: ElementRef) -> bool { | |
| 141 | + | let mut found_attribution = false; | |
| 142 | + | ||
| 143 | + | for child in el.children() { | |
| 144 | + | match child.value() { | |
| 145 | + | Node::Text(text) => { | |
| 146 | + | if is_attribution_text(text.text.trim()) { | |
| 147 | + | found_attribution = true; | |
| 148 | + | } | |
| 149 | + | } | |
| 150 | + | Node::Element(e) => { | |
| 151 | + | if found_attribution && e.name() == "blockquote" { | |
| 152 | + | return true; | |
| 153 | + | } | |
| 154 | + | // Skip <br> tags between attribution and blockquote | |
| 155 | + | if e.name() != "br" { | |
| 156 | + | // If we hit a non-br element before finding attribution, stop | |
| 157 | + | if !found_attribution { | |
| 158 | + | return false; | |
| 159 | + | } | |
| 160 | + | } | |
| 161 | + | } | |
| 162 | + | _ => {} | |
| 163 | + | } | |
| 164 | + | } | |
| 165 | + | ||
| 166 | + | false | |
| 167 | + | } | |
| 168 | + | ||
| 169 | + | /// Get text from the previous sibling, if it exists and is a text or inline element. | |
| 170 | + | fn previous_sibling_text(el: ElementRef) -> Option<String> { | |
| 171 | + | let prev = el.prev_sibling()?; | |
| 172 | + | ||
| 173 | + | match prev.value() { | |
| 174 | + | Node::Text(text) => Some(text.text.to_string()), | |
| 175 | + | Node::Element(e) => { | |
| 176 | + | // Check inline elements like <span>, <font> that might wrap attribution | |
| 177 | + | if matches!(e.name(), "span" | "font" | "b" | "i" | "div" | "p") { | |
| 178 | + | let el_ref = ElementRef::wrap(prev)?; | |
| 179 | + | let text: String = el_ref.text().collect(); | |
| 180 | + | if !text.trim().is_empty() { | |
| 181 | + | return Some(text); | |
| 182 | + | } | |
| 183 | + | } | |
| 184 | + | None | |
| 185 | + | } | |
| 186 | + | _ => None, | |
| 187 | + | } | |
| 188 | + | } | |
| 189 | + | ||
| 190 | + | /// Check if a separator element marks the boundary between original | |
| 191 | + | /// content and a forwarded/replied message. | |
| 192 | + | /// | |
| 193 | + | /// This catches `<hr>` or styled divs that act as visual separators | |
| 194 | + | /// before reply content (common in Outlook "From: ... Sent: ..." blocks). | |
| 195 | + | pub fn is_outlook_separator(el: ElementRef) -> bool { | |
| 196 | + | let element = el.value(); | |
| 197 | + | ||
| 198 | + | // Outlook uses a specific pattern: a div containing | |
| 199 | + | // "From: ... Sent: ... To: ... Subject: ..." as a reply header | |
| 200 | + | if element.name() == "div" || element.name() == "p" { | |
| 201 | + | let text: String = el.text().collect(); | |
| 202 | + | let t = text.trim(); | |
| 203 | + | ||
| 204 | + | // Must have at least From + Sent/Date or Subject | |
| 205 | + | let has_from = t.contains("From:"); | |
| 206 | + | let has_sent = t.contains("Sent:") || t.contains("Date:"); | |
| 207 | + | let has_subject = t.contains("Subject:"); | |
| 208 | + | ||
| 209 | + | if has_from && (has_sent || has_subject) { | |
| 210 | + | return true; | |
| 211 | + | } | |
| 212 | + | } | |
| 213 | + | ||
| 214 | + | false | |
| 215 | + | } | |
| 216 | + | ||
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parse `html`, take the first element matching `selector`, and return
    /// what `check` says about it.
    fn check_first(html: &str, selector: &str, check: impl FnOnce(ElementRef) -> bool) -> bool {
        let doc = Html::parse_document(html);
        let sel = Selector::parse(selector).unwrap();
        let el = doc.select(&sel).next().unwrap();
        check(el)
    }

    // -- Attribution detection --

    #[test]
    fn attribution_on_wrote() {
        let line = "On Mon, Jan 5, 2026 at 3:00 PM Alice <alice@example.com> wrote:";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_forwarded() {
        let line = "---------- Forwarded message ----------";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_original_message() {
        let line = "-----Original Message-----";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn attribution_begin_forwarded() {
        let line = "Begin forwarded message:";
        assert!(is_attribution_text(line));
    }

    #[test]
    fn not_attribution() {
        for line in ["Hello, how are you?", "On the other hand, this is fine."] {
            assert!(!is_attribution_text(line));
        }
    }

    // -- Reply class detection --

    #[test]
    fn gmail_quote_class() {
        assert!(is_reply_class("gmail_quote"));
    }

    #[test]
    fn multiple_classes_with_reply() {
        assert!(is_reply_class("some-class gmail_quote another"));
    }

    #[test]
    fn non_reply_class() {
        assert!(!is_reply_class("regular-div content-wrapper"));
    }

    // -- Reply boundary detection --

    #[test]
    fn type_cite_is_boundary() {
        let html = r#"<div type="cite"><p>quoted</p></div>"#;
        assert!(check_first(html, r#"div[type="cite"]"#, is_reply_boundary));
    }

    #[test]
    fn gmail_quote_is_boundary() {
        let html = r#"<div class="gmail_quote"><p>quoted</p></div>"#;
        assert!(check_first(html, "div.gmail_quote", is_reply_boundary));
    }

    #[test]
    fn outlook_id_is_boundary() {
        let html = r#"<div id="divRplyFwdMsg"><p>quoted</p></div>"#;
        assert!(check_first(html, "#divRplyFwdMsg", is_reply_boundary));
    }

    #[test]
    fn plain_div_not_boundary() {
        let html = r#"<div class="content"><p>not quoted</p></div>"#;
        assert!(!check_first(html, "div.content", is_reply_boundary));
    }

    // -- Outlook separator --

    #[test]
    fn outlook_from_sent_subject() {
        let html = "<div>From: Alice\nSent: Monday\nTo: Bob\nSubject: Hello</div>";
        assert!(check_first(html, "div", is_outlook_separator));
    }

    #[test]
    fn regular_div_not_separator() {
        let html = "<div>Just a normal paragraph.</div>";
        assert!(!check_first(html, "div", is_outlook_separator));
    }
}
| @@ -0,0 +1,297 @@ | |||
| 1 | + | use scraper::ElementRef; | |
| 2 | + | ||
| 3 | + | /// Determine whether a `<table>` element is a data table or a layout table. | |
| 4 | + | /// | |
| 5 | + | /// Email HTML overwhelmingly uses tables for layout. A table is considered | |
| 6 | + | /// a **data table** if it has structural indicators of tabular data: | |
| 7 | + | /// - Contains `<th>` elements | |
| 8 | + | /// - Has a `<caption>` child | |
| 9 | + | /// - Has `role="grid"` or `role="table"` | |
| 10 | + | /// - Has multiple rows where multiple cells contain substantive text | |
| 11 | + | /// | |
| 12 | + | /// Everything else is treated as a layout table and unwrapped. | |
| 13 | + | pub fn is_data_table(table: ElementRef) -> bool { | |
| 14 | + | let el = table.value(); | |
| 15 | + | ||
| 16 | + | // role attribute | |
| 17 | + | if let Some(role) = el.attr("role") { | |
| 18 | + | if role == "grid" || role == "table" { | |
| 19 | + | return true; | |
| 20 | + | } | |
| 21 | + | // role="presentation" is an explicit layout signal | |
| 22 | + | if role == "presentation" || role == "none" { | |
| 23 | + | return false; | |
| 24 | + | } | |
| 25 | + | } | |
| 26 | + | ||
| 27 | + | let mut has_th = false; | |
| 28 | + | let mut has_caption = false; | |
| 29 | + | let mut multi_cell_rows = 0u32; | |
| 30 | + | ||
| 31 | + | for descendant in table.descendants() { | |
| 32 | + | if let Some(el_ref) = ElementRef::wrap(descendant) { | |
| 33 | + | match el_ref.value().name() { | |
| 34 | + | "th" => has_th = true, | |
| 35 | + | "caption" => has_caption = true, | |
| 36 | + | "tr" => { | |
| 37 | + | let cell_count = el_ref | |
| 38 | + | .children() | |
| 39 | + | .filter_map(ElementRef::wrap) | |
| 40 | + | .filter(|c| { | |
| 41 | + | let name = c.value().name(); | |
| 42 | + | (name == "td" || name == "th") && has_substantive_text(*c) | |
| 43 | + | }) | |
| 44 | + | .count(); | |
| 45 | + | if cell_count > 1 { | |
| 46 | + | multi_cell_rows += 1; | |
| 47 | + | } | |
| 48 | + | } | |
| 49 | + | _ => {} | |
| 50 | + | } | |
| 51 | + | } | |
| 52 | + | } | |
| 53 | + | ||
| 54 | + | if has_th || has_caption { | |
| 55 | + | return true; | |
| 56 | + | } | |
| 57 | + | ||
| 58 | + | // Multiple rows with multiple substantive cells = data table | |
| 59 | + | multi_cell_rows >= 2 | |
| 60 | + | } | |
| 61 | + | ||
| 62 | + | /// Check if an element contains meaningful text (not just whitespace/nbsp). | |
| 63 | + | fn has_substantive_text(el: ElementRef) -> bool { | |
| 64 | + | let text = el.text().collect::<String>(); | |
| 65 | + | let trimmed = text.trim().replace('\u{a0}', ""); // strip | |
| 66 | + | trimmed.len() > 1 // more than a single character | |
| 67 | + | } | |
| 68 | + | ||
| 69 | + | /// Extract rows and cells from a data table for markdown rendering. | |
| 70 | + | /// | |
| 71 | + | /// Returns (headers, rows) where each is a Vec of cell text strings. | |
| 72 | + | /// If no `<thead>`/`<th>` row exists, the first row is used as headers. | |
| 73 | + | pub fn extract_table_data(table: ElementRef) -> (Vec<String>, Vec<Vec<String>>) { | |
| 74 | + | let mut headers: Vec<String> = Vec::new(); | |
| 75 | + | let mut rows: Vec<Vec<String>> = Vec::new(); | |
| 76 | + | ||
| 77 | + | // Look for thead/th first | |
| 78 | + | for descendant in table.children().filter_map(ElementRef::wrap) { | |
| 79 | + | let name = descendant.value().name(); | |
| 80 | + | if name == "thead" { | |
| 81 | + | for tr in descendant.children().filter_map(ElementRef::wrap) { | |
| 82 | + | if tr.value().name() == "tr" { | |
| 83 | + | headers = extract_cells(tr); | |
| 84 | + | break; // first row of thead | |
| 85 | + | } | |
| 86 | + | } | |
| 87 | + | } else if name == "tbody" || name == "tr" { | |
| 88 | + | let trs: Box<dyn Iterator<Item = ElementRef>> = if name == "tbody" { | |
| 89 | + | Box::new( | |
| 90 | + | descendant | |
| 91 | + | .children() | |
| 92 | + | .filter_map(ElementRef::wrap) | |
| 93 | + | .filter(|e| e.value().name() == "tr"), | |
| 94 | + | ) | |
| 95 | + | } else { | |
| 96 | + | Box::new(std::iter::once(descendant)) | |
| 97 | + | }; | |
| 98 | + | ||
| 99 | + | for tr in trs { | |
| 100 | + | let cells = extract_cells(tr); | |
| 101 | + | if !cells.is_empty() { | |
| 102 | + | // If we haven't found headers yet and this row has <th> cells, | |
| 103 | + | // treat it as the header row | |
| 104 | + | if headers.is_empty() && has_th_cells(tr) { | |
| 105 | + | headers = cells; | |
| 106 | + | } else { | |
| 107 | + | rows.push(cells); | |
| 108 | + | } | |
| 109 | + | } | |
| 110 | + | } | |
| 111 | + | } | |
| 112 | + | } | |
| 113 | + | ||
| 114 | + | // If still no headers, promote first data row | |
| 115 | + | if headers.is_empty() && !rows.is_empty() { | |
| 116 | + | headers = rows.remove(0); | |
| 117 | + | } | |
| 118 | + | ||
| 119 | + | (headers, rows) | |
| 120 | + | } | |
| 121 | + | ||
| 122 | + | fn extract_cells(tr: ElementRef) -> Vec<String> { | |
| 123 | + | tr.children() | |
| 124 | + | .filter_map(ElementRef::wrap) | |
| 125 | + | .filter(|e| { | |
| 126 | + | let n = e.value().name(); | |
| 127 | + | n == "td" || n == "th" | |
| 128 | + | }) | |
| 129 | + | .map(|cell| { | |
| 130 | + | let text = cell.text().collect::<String>(); | |
| 131 | + | text.split_whitespace().collect::<Vec<_>>().join(" ") | |
| 132 | + | }) | |
| 133 | + | .collect() | |
| 134 | + | } | |
| 135 | + | ||
| 136 | + | fn has_th_cells(tr: ElementRef) -> bool { | |
| 137 | + | tr.children() | |
| 138 | + | .filter_map(ElementRef::wrap) | |
| 139 | + | .any(|e| e.value().name() == "th") | |
| 140 | + | } | |
| 141 | + | ||
/// Render a data table as a GFM markdown table.
///
/// * `headers` fixes the column count. With no headers the result is an
///   empty string, whatever `rows` holds.
/// * Each row is padded with empty cells or truncated to that column count.
/// * Cell text is made table-safe: `|` is written as `\|`, and line breaks
///   become spaces so a cell cannot end its row early.
///
/// The output has no trailing newline; spacing around the table is up to
/// the caller.
pub fn render_markdown_table(headers: &[String], rows: &[Vec<String>]) -> String {
    if headers.is_empty() {
        return String::new();
    }

    let col_count = headers.len();
    let mut out = String::new();

    // Header row
    out.push('|');
    for header in headers {
        out.push(' ');
        push_table_cell(&mut out, header);
        out.push_str(" |");
    }
    out.push('\n');

    // Separator row
    out.push('|');
    out.push_str(&" --- |".repeat(col_count));

    // Data rows; each starts on a fresh line, so nothing needs trimming.
    for row in rows {
        out.push_str("\n|");
        for i in 0..col_count {
            out.push(' ');
            if let Some(cell) = row.get(i) {
                push_table_cell(&mut out, cell);
            }
            out.push_str(" |");
        }
    }

    out
}

/// Append `cell` to `out`, escaping what would break a GFM table row.
fn push_table_cell(out: &mut String, cell: &str) {
    for ch in cell.chars() {
        match ch {
            '|' => out.push_str("\\|"),
            '\n' | '\r' => out.push(' '),
            other => out.push(other),
        }
    }
}
| 183 | + | ||
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parse a snippet into a full document tree.
    fn parse_table(html: &str) -> Html {
        Html::parse_document(html)
    }

    /// Return the first `<table>` of `doc`; panics when there is none.
    fn select_table(doc: &Html) -> ElementRef<'_> {
        let sel = Selector::parse("table").unwrap();
        doc.select(&sel).next().unwrap()
    }

    /// Parse `html` and classify its first table.
    fn classify(html: &str) -> bool {
        let doc = parse_table(html);
        is_data_table(select_table(&doc))
    }

    #[test]
    fn single_cell_is_layout() {
        assert!(!classify("<table><tr><td>content</td></tr></table>"));
    }

    #[test]
    fn table_with_th_is_data() {
        assert!(classify(
            "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>",
        ));
    }

    #[test]
    fn table_with_caption_is_data() {
        assert!(classify(
            "<table><caption>Users</caption><tr><td>Alice</td><td>30</td></tr></table>",
        ));
    }

    #[test]
    fn role_presentation_is_layout() {
        assert!(!classify(
            r#"<table role="presentation"><tr><td>layout</td><td>stuff</td></tr></table>"#,
        ));
    }

    #[test]
    fn role_grid_is_data() {
        assert!(classify(
            r#"<table role="grid"><tr><td>Alice</td><td>30</td></tr></table>"#,
        ));
    }

    #[test]
    fn multi_row_multi_cell_is_data() {
        assert!(classify(
            "<table>\
             <tr><td>Alice</td><td>Engineer</td></tr>\
             <tr><td>Bob</td><td>Designer</td></tr>\
             </table>",
        ));
    }

    #[test]
    fn spacer_cells_not_substantive() {
        // Spacer cells in email layouts hold a non-breaking space, not a
        // plain one; use the entity so that case is what gets exercised.
        // Only one substantive cell per row.
        assert!(!classify(
            "<table><tr><td>content</td><td>&nbsp;</td></tr>\
             <tr><td>more</td><td>&nbsp;</td></tr></table>",
        ));
    }

    #[test]
    fn render_simple_table() {
        let headers = vec!["Name".into(), "Age".into()];
        let rows = vec![
            vec!["Alice".into(), "30".into()],
            vec!["Bob".into(), "25".into()],
        ];
        let md = render_markdown_table(&headers, &rows);
        assert_eq!(
            md,
            "| Name | Age |\n| --- | --- |\n| Alice | 30 |\n| Bob | 25 |"
        );
    }

    #[test]
    fn render_empty_headers() {
        let md = render_markdown_table(&[], &[]);
        assert_eq!(md, "");
    }

    #[test]
    fn extract_with_thead() {
        let doc = parse_table(
            "<table><thead><tr><th>A</th><th>B</th></tr></thead>\
             <tbody><tr><td>1</td><td>2</td></tr></tbody></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["A", "B"]);
        assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]);
    }

    #[test]
    fn extract_promotes_first_row() {
        let doc = parse_table(
            "<table><tr><td>Name</td><td>Val</td></tr>\
             <tr><td>X</td><td>Y</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
    }
}
| @@ -0,0 +1,56 @@ | |||
/// Normalize whitespace in the final markdown output.
///
/// - Remove trailing whitespace from each line
/// - Collapse any run of blank lines into a single blank line; a line that
///   holds only whitespace counts as blank
/// - Trim leading/trailing whitespace of the whole text
///
/// Indentation of every line after the first is preserved. Line breaks in
/// the result are always `\n`.
pub fn normalize(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    // Set when blank lines were seen since the last emitted line.
    let mut blank_pending = false;

    // Trimming first is what lets whitespace-only lines take part in the
    // blank-line collapsing below.
    for line in input.lines().map(str::trim_end) {
        if line.is_empty() {
            // Blank lines before any content are dropped outright.
            blank_pending = !out.is_empty();
            continue;
        }

        if out.is_empty() {
            // The overall trim also strips the first line's indentation.
            out.push_str(line.trim_start());
        } else {
            out.push_str(if blank_pending { "\n\n" } else { "\n" });
            out.push_str(line);
        }
        blank_pending = false;
    }

    // Trailing blank lines were never written, so `out` is already trimmed.
    out
}
| 27 | + | ||
#[cfg(test)]
mod tests {
    use super::*;

    /// Four newlines in a row shrink to a single blank line.
    #[test]
    fn collapse_excessive_newlines() {
        let got = normalize("a\n\n\n\nb");
        assert_eq!(got, "a\n\nb");
    }

    /// One blank line between paragraphs is left alone.
    #[test]
    fn preserve_single_blank_line() {
        let got = normalize("a\n\nb");
        assert_eq!(got, "a\n\nb");
    }

    /// Spaces at the end of a line are removed.
    #[test]
    fn trim_trailing_whitespace() {
        let got = normalize("hello \nworld ");
        assert_eq!(got, "hello\nworld");
    }

    /// Blank lines around the content are removed.
    #[test]
    fn trim_outer_whitespace() {
        let got = normalize("\n\nhello\n\n");
        assert_eq!(got, "hello");
    }

    /// Empty in, empty out.
    #[test]
    fn empty_input() {
        let got = normalize("");
        assert_eq!(got, "");
    }
}
| @@ -0,0 +1,202 @@ | |||
| 1 | + | use pter::convert; | |
| 2 | + | ||
/// An empty document converts to an empty string.
#[test]
fn empty_string() {
    let md = convert("");
    assert_eq!(md, "");
}
| 7 | + | ||
/// Input made only of whitespace produces no output.
#[test]
fn whitespace_only() {
    let md = convert(" \n\t ");
    assert_eq!(md, "");
}
| 12 | + | ||
/// Nested elements without any text produce no output.
#[test]
fn just_tags_no_content() {
    let md = convert("<div><p><span></span></p></div>");
    assert_eq!(md, "");
}
| 17 | + | ||
/// Text buried under 100 nested `<div>`s is still found.
#[test]
fn deeply_nested_divs() {
    let depth = 100;
    let html = [
        "<div>".repeat(depth),
        "deep content".to_string(),
        "</div>".repeat(depth),
    ]
    .concat();

    assert!(convert(&html).contains("deep content"));
}
| 31 | + | ||
/// Twenty nested blockquotes keep their text and stack up `>` prefixes.
#[test]
fn deeply_nested_blockquotes() {
    let depth = 20;
    let html = [
        "<blockquote>".repeat(depth),
        "very deep".to_string(),
        "</blockquote>".repeat(depth),
    ]
    .concat();

    let md = convert(&html);
    assert!(md.contains("very deep"));
    // Should have many > prefixes
    assert!(md.contains("> > > > >"));
}
| 47 | + | ||
/// An item ten list levels deep is still rendered.
#[test]
fn deeply_nested_lists() {
    let depth = 10;
    let html = [
        "<ul><li>".repeat(depth),
        "deep item".to_string(),
        "</li></ul>".repeat(depth),
    ]
    .concat();

    assert!(convert(&html).contains("deep item"));
}
| 61 | + | ||
/// Paragraphs that are never closed still yield both texts; the HTML
/// parser repairs the markup.
#[test]
fn malformed_unclosed_tags() {
    let md = convert("<p>unclosed paragraph<p>another one");
    for expected in ["unclosed paragraph", "another one"] {
        assert!(md.contains(expected));
    }
}
| 69 | + | ||
/// Inline tags closed in the wrong order do not lose their text.
#[test]
fn malformed_mismatched_tags() {
    let html = "<b><i>crossed</b></i>";
    assert!(convert(html).contains("crossed"));
}
| 75 | + | ||
/// Script bodies never reach the output.
#[test]
fn only_script_content() {
    let md = convert("<script>alert('xss')</script>");
    assert_eq!(md, "");
}
| 80 | + | ||
/// Stylesheet bodies never reach the output.
#[test]
fn only_style_content() {
    let md = convert("<style>.x { color: red; }</style>");
    assert_eq!(md, "");
}
| 85 | + | ||
/// A document holding nothing but 1x1 images converts to an empty string.
#[test]
fn only_tracking_pixels() {
    let md = convert(
        r#"
<img src="a.gif" width="1" height="1">
<img src="b.gif" width="1" height="1">
"#,
    );
    assert_eq!(md, "");
}
| 94 | + | ||
/// CJK text, emoji and accented Latin characters pass through unchanged.
#[test]
fn unicode_content() {
    let md = convert("<p>日本語テスト 🎉 émojis café</p>");
    for expected in ["日本語テスト", "🎉", "café"] {
        assert!(md.contains(expected));
    }
}
| 102 | + | ||
/// Numeric character references are decoded into the characters they name.
///
/// The input spells the characters as `&#NNN;` references. Feeding
/// already-decoded characters would not test entity handling at all.
#[test]
fn html_entities_numeric() {
    let md = convert("<p>&#169; &#8212; &#8217;</p>");
    assert!(md.contains("©"));
    assert!(md.contains("—"));
}
| 109 | + | ||
/// A thousand paragraphs convert, and the result is shorter than the input.
#[test]
fn large_input_doesnt_blow_up() {
    let html = "<p>Hello world. This is a test paragraph with some content.</p>".repeat(1000);

    let md = convert(&html);

    assert!(md.contains("Hello world"));
    // Should be proportional, not quadratic
    assert!(md.len() < html.len());
}
| 119 | + | ||
/// Bold text inside a link keeps its emphasis within the link label.
#[test]
fn link_with_nested_formatting() {
    let md = convert(r#"<a href="https://example.com"><strong>bold link</strong></a>"#);
    let expected = "[**bold link**](https://example.com)";
    assert!(md.contains(expected));
}
| 126 | + | ||
/// An image without alt text is still rendered.
///
/// The check is on the image source, because `contains("")` is true for
/// every string and therefore verifies nothing.
// NOTE(review): presumably the exact markup is `![](photo.jpg)`; tighten
// the assertion once that is confirmed against the converter.
#[test]
fn image_with_no_alt() {
    let md = convert(r#"<img src="photo.jpg">"#);
    assert!(md.contains("photo.jpg"));
}
| 132 | + | ||
/// Adjacent inline elements are rendered back to back with nothing inserted
/// between them.
#[test]
fn consecutive_inline_elements() {
    let html = "<b>bold</b><i>italic</i><code>code</code>";
    assert_eq!(convert(html), "**bold***italic*`code`");
}
| 138 | + | ||
/// An empty cell keeps its column in the rendered table.
///
/// The table renderer writes every cell as a space, the cell text, then
/// ` |`. An empty cell therefore leaves two spaces between the pipes.
#[test]
fn table_with_empty_cells() {
    let html = "<table><tr><th>A</th><th>B</th></tr>\
                <tr><td></td><td>val</td></tr></table>";
    let md = convert(html);
    assert!(md.contains("| A | B |"));
    assert!(md.contains("|  | val |"));
}
| 147 | + | ||
/// Escaped markup inside `<pre>` is shown literally in a fenced block.
///
/// The tag-looking text must be written with `&lt;`/`&gt;`. A literal
/// `<div>` would be parsed as an element and its angle brackets would never
/// appear in the text.
#[test]
fn pre_with_html_inside() {
    let html = "<pre>&lt;div&gt;not a tag&lt;/div&gt;</pre>";
    let md = convert(html);
    assert!(md.contains("```"));
    assert!(md.contains("<div>not a tag</div>"));
}
| 155 | + | ||
/// Runs of spaces in the source collapse to single spaces.
///
/// The input needs several consecutive spaces; single spaces would pass
/// without any collapsing taking place.
#[test]
fn multiple_spaces_in_source() {
    let md = convert("<p>word1   word2     word3</p>");
    assert_eq!(md, "word1 word2 word3");
}
| 161 | + | ||
/// Newlines inside a paragraph are folded into a single space.
#[test]
fn newlines_in_source_collapsed() {
    let html = "<p>line1\n\n\nline2</p>";
    assert_eq!(convert(html), "line1 line2");
}
| 167 | + | ||
/// A complete document with doctype, head, title and style renders only its
/// body text.
#[test]
fn full_html_document() {
    let md = convert(
        r#"
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test Email</title>
<style>body { font-family: sans-serif; }</style>
</head>
<body>
<p>Hello!</p>
</body>
</html>
"#,
    );
    assert_eq!(md, "Hello!");
}
| 186 | + | ||
| 187 | + | #[test] | |
| 188 | + | fn data_uri_image_not_tracking_pixel() { | |
| 189 | + | // A data URI image that's not 1x1 should render | |
| 190 | + | let html = r#"<img src="data:image/png;base64,iVBOR..." alt="inline" width="100">"#; | |
| 191 | + | let md = convert(html); | |
| 192 | + | assert!(md.contains("![inline]")); | |
| 193 | + | } | |
| 194 | + | ||
| 195 | + | #[test] | |
| 196 | + | fn blockquote_with_paragraphs() { | |
| 197 | + | let html = "<blockquote><p>First para</p><p>Second para</p></blockquote>"; | |
| 198 | + | let md = convert(html); | |
| 199 | + | assert!(md.contains("> First para")); | |
| 200 | + | assert!(md.contains("> ")); | |
| 201 | + | assert!(md.contains("> Second para")); | |
| 202 | + | } |
| @@ -0,0 +1,436 @@ | |||
| 1 | + | use pter::convert; | |
| 2 | + | ||
| 3 | + | #[test] | |
| 4 | + | fn simple_email() { | |
| 5 | + | let html = r#" | |
| 6 | + | <html> | |
| 7 | + | <head><title>Email</title></head> | |
| 8 | + | <body> | |
| 9 | + | <h1>Meeting Tomorrow</h1> | |
| 10 | + | <p>Hi Max,</p> | |
| 11 | + | <p>Just confirming our meeting tomorrow at <strong>2pm</strong>.</p> | |
| 12 | + | <p>Best,<br>Alice</p> | |
| 13 | + | </body> | |
| 14 | + | </html> | |
| 15 | + | "#; | |
| 16 | + | ||
| 17 | + | let md = convert(html); | |
| 18 | + | assert!(md.contains("# Meeting Tomorrow")); | |
| 19 | + | assert!(md.contains("Hi Max,")); | |
| 20 | + | assert!(md.contains("**2pm**")); | |
| 21 | + | assert!(md.contains("Best,\nAlice")); | |
| 22 | + | } | |
| 23 | + | ||
| 24 | + | #[test] | |
| 25 | + | fn email_with_links() { | |
| 26 | + | let html = r#" | |
| 27 | + | <body> | |
| 28 | + | <p>Please review the <a href="https://example.com/doc">document</a>.</p> | |
| 29 | + | <p>Direct link: <a href="https://example.com">https://example.com</a></p> | |
| 30 | + | </body> | |
| 31 | + | "#; | |
| 32 | + | ||
| 33 | + | let md = convert(html); | |
| 34 | + | assert!(md.contains("[document](https://example.com/doc)")); | |
| 35 | + | // Link text matches URL — no markdown link syntax | |
| 36 | + | assert!(md.contains("Direct link: https://example.com")); | |
| 37 | + | } | |
| 38 | + | ||
| 39 | + | #[test] | |
| 40 | + | fn email_with_tracking_pixels() { | |
| 41 | + | let html = r#" | |
| 42 | + | <body> | |
| 43 | + | <p>Content here</p> | |
| 44 | + | <img src="https://tracker.example.com/open.gif" width="1" height="1" alt=""> | |
| 45 | + | <img src="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7" alt=""> | |
| 46 | + | <img src="real-image.jpg" alt="A real photo" width="600"> | |
| 47 | + | </body> | |
| 48 | + | "#; | |
| 49 | + | ||
| 50 | + | let md = convert(html); | |
| 51 | + | assert!(md.contains("Content here")); | |
| 52 | + | assert!(!md.contains("tracker")); | |
| 53 | + | assert!(!md.contains("data:image")); | |
| 54 | + | assert!(md.contains("![A real photo](real-image.jpg)")); | |
| 55 | + | } | |
| 56 | + | ||
| 57 | + | #[test] | |
| 58 | + | fn email_with_quoted_reply() { | |
| 59 | + | let html = r#" | |
| 60 | + | <body> | |
| 61 | + | <p>Thanks, that works for me.</p> | |
| 62 | + | <blockquote> | |
| 63 | + | <p>Can we meet at 3pm instead?</p> | |
| 64 | + | </blockquote> | |
| 65 | + | </body> | |
| 66 | + | "#; | |
| 67 | + | ||
| 68 | + | let md = convert(html); | |
| 69 | + | assert!(md.contains("Thanks, that works for me.")); | |
| 70 | + | assert!(md.contains("> Can we meet at 3pm instead?")); | |
| 71 | + | } | |
| 72 | + | ||
| 73 | + | #[test] | |
| 74 | + | fn email_with_signature_line() { | |
| 75 | + | let html = r#" | |
| 76 | + | <body> | |
| 77 | + | <p>See you then.</p> | |
| 78 | + | <hr> | |
| 79 | + | <p>Alice Smith</p> | |
| 80 | + | <p>Engineering Lead</p> | |
| 81 | + | </body> | |
| 82 | + | "#; | |
| 83 | + | ||
| 84 | + | let md = convert(html); | |
| 85 | + | assert!(md.contains("See you then.")); | |
| 86 | + | assert!(md.contains("---")); | |
| 87 | + | assert!(md.contains("Alice Smith")); | |
| 88 | + | } | |
| 89 | + | ||
| 90 | + | #[test] | |
| 91 | + | fn deeply_nested_blockquotes() { | |
| 92 | + | let html = r#" | |
| 93 | + | <body> | |
| 94 | + | <p>Got it.</p> | |
| 95 | + | <blockquote> | |
| 96 | + | <p>Sounds good.</p> | |
| 97 | + | <blockquote> | |
| 98 | + | <p>Can we reschedule?</p> | |
| 99 | + | <blockquote> | |
| 100 | + | <p>Original message here.</p> | |
| 101 | + | </blockquote> | |
| 102 | + | </blockquote> | |
| 103 | + | </blockquote> | |
| 104 | + | </body> | |
| 105 | + | "#; | |
| 106 | + | ||
| 107 | + | let md = convert(html); | |
| 108 | + | assert!(md.contains("Got it.")); | |
| 109 | + | assert!(md.contains("> Sounds good.")); | |
| 110 | + | assert!(md.contains("> > Can we reschedule?")); | |
| 111 | + | assert!(md.contains("> > > Original message here.")); | |
| 112 | + | } | |
| 113 | + | ||
| 114 | + | #[test] | |
| 115 | + | fn complex_list_structure() { | |
| 116 | + | let html = r#" | |
| 117 | + | <body> | |
| 118 | + | <p>Action items:</p> | |
| 119 | + | <ol> | |
| 120 | + | <li>Review the PR | |
| 121 | + | <ul> | |
| 122 | + | <li>Check tests</li> | |
| 123 | + | <li>Check docs</li> | |
| 124 | + | </ul> | |
| 125 | + | </li> | |
| 126 | + | <li>Deploy to staging</li> | |
| 127 | + | </ol> | |
| 128 | + | </body> | |
| 129 | + | "#; | |
| 130 | + | ||
| 131 | + | let md = convert(html); | |
| 132 | + | assert!(md.contains("Action items:")); | |
| 133 | + | assert!(md.contains("1. Review the PR")); | |
| 134 | + | assert!(md.contains(" - Check tests")); | |
| 135 | + | assert!(md.contains("2. Deploy to staging")); | |
| 136 | + | } | |
| 137 | + | ||
| 138 | + | #[test] | |
| 139 | + | fn pre_block_preserves_formatting() { | |
| 140 | + | let html = r#" | |
| 141 | + | <body> | |
| 142 | + | <p>Here's the code:</p> | |
| 143 | + | <pre><code>fn main() { | |
| 144 | + |     println!("hello"); | |
| 145 | + | }</code></pre> | |
| 146 | + | </body> | |
| 147 | + | "#; | |
| 148 | + | ||
| 149 | + | let md = convert(html); | |
| 150 | + | assert!(md.contains("Here's the code:")); | |
| 151 | + | assert!(md.contains("```\nfn main()")); | |
| 152 | + | assert!(md.contains(" println!")); | |
| 153 | + | } | |
| 154 | + | ||
| 155 | + | #[test] | |
| 156 | + | fn hidden_content_stripped() { | |
| 157 | + | let html = r#" | |
| 158 | + | <body> | |
| 159 | + | <p>Visible content</p> | |
| 160 | + | <div style="display: none;"> | |
| 161 | + | <p>This should not appear</p> | |
| 162 | + | </div> | |
| 163 | + | <span style="visibility: hidden;">Also hidden</span> | |
| 164 | + | <p>More visible</p> | |
| 165 | + | </body> | |
| 166 | + | "#; | |
| 167 | + | ||
| 168 | + | let md = convert(html); | |
| 169 | + | assert!(md.contains("Visible content")); | |
| 170 | + | assert!(!md.contains("should not appear")); | |
| 171 | + | assert!(!md.contains("Also hidden")); | |
| 172 | + | assert!(md.contains("More visible")); | |
| 173 | + | } | |
| 174 | + | ||
| 175 | + | #[test] | |
| 176 | + | fn script_and_style_fully_removed() { | |
| 177 | + | let html = r#" | |
| 178 | + | <html> | |
| 179 | + | <head> | |
| 180 | + | <style>body { color: red; }</style> | |
| 181 | + | <script>alert('xss');</script> | |
| 182 | + | </head> | |
| 183 | + | <body> | |
| 184 | + | <p>Safe content</p> | |
| 185 | + | <script>document.write('injected')</script> | |
| 186 | + | </body> | |
| 187 | + | </html> | |
| 188 | + | "#; | |
| 189 | + | ||
| 190 | + | let md = convert(html); | |
| 191 | + | assert_eq!(md, "Safe content"); | |
| 192 | + | } | |
| 193 | + | ||
| 194 | + | #[test] | |
| 195 | + | fn newsletter_table_layout() { | |
| 196 | + | // Typical email newsletter wrapped in layout tables | |
| 197 | + | let html = r#" | |
| 198 | + | <html> | |
| 199 | + | <body> | |
| 200 | + | <table width="100%" cellpadding="0" cellspacing="0" role="presentation"> | |
| 201 | + | <tr> | |
| 202 | + | <td align="center"> | |
| 203 | + | <table width="600" cellpadding="0" cellspacing="0"> | |
| 204 | + | <tr> | |
| 205 | + | <td> | |
| 206 | + | <h2>Weekly Digest</h2> | |
| 207 | + | <p>Here are your updates for this week.</p> | |
| 208 | + | <ul> | |
| 209 | + | <li>New release v2.0</li> | |
| 210 | + | <li>Bug fixes</li> | |
| 211 | + | </ul> | |
| 212 | + | <p>Thanks for reading!</p> | |
| 213 | + | </td> | |
| 214 | + | </tr> | |
| 215 | + | </table> | |
| 216 | + | </td> | |
| 217 | + | </tr> | |
| 218 | + | </table> | |
| 219 | + | <img src="https://track.example.com/open.gif" width="1" height="1"> | |
| 220 | + | </body> | |
| 221 | + | </html> | |
| 222 | + | "#; | |
| 223 | + | ||
| 224 | + | let md = convert(html); | |
| 225 | + | assert!(md.contains("## Weekly Digest")); | |
| 226 | + | assert!(md.contains("Here are your updates for this week.")); | |
| 227 | + | assert!(md.contains("- New release v2.0")); | |
| 228 | + | assert!(md.contains("- Bug fixes")); | |
| 229 | + | assert!(md.contains("Thanks for reading!")); | |
| 230 | + | assert!(!md.contains("track.example.com")); | |
| 231 | + | // No table markup in output | |
| 232 | + | assert!(!md.contains("| ")); | |
| 233 | + | } | |
| 234 | + | ||
| 235 | + | #[test] | |
| 236 | + | fn data_table_preserved() { | |
| 237 | + | let html = r#" | |
| 238 | + | <body> | |
| 239 | + | <p>Order summary:</p> | |
| 240 | + | <table> | |
| 241 | + | <thead><tr><th>Item</th><th>Qty</th><th>Price</th></tr></thead> | |
| 242 | + | <tbody> | |
| 243 | + | <tr><td>Widget</td><td>3</td><td>$15.00</td></tr> | |
| 244 | + | <tr><td>Gadget</td><td>1</td><td>$29.99</td></tr> | |
| 245 | + | </tbody> | |
| 246 | + | </table> | |
| 247 | + | </body> | |
| 248 | + | "#; | |
| 249 | + | ||
| 250 | + | let md = convert(html); | |
| 251 | + | assert!(md.contains("Order summary:")); | |
| 252 | + | assert!(md.contains("| Item | Qty | Price |")); | |
| 253 | + | assert!(md.contains("| --- | --- | --- |")); | |
| 254 | + | assert!(md.contains("| Widget | 3 | $15.00 |")); | |
| 255 | + | assert!(md.contains("| Gadget | 1 | $29.99 |")); | |
| 256 | + | } | |
| 257 | + | ||
| 258 | + | #[test] | |
| 259 | + | fn spacer_and_tracking_stripped() { | |
| 260 | + | let html = r#" | |
| 261 | + | <body> | |
| 262 | + | <p>Real content</p> | |
| 263 | + | <div style="font-size: 0; line-height: 0;"> </div> | |
| 264 | + | <img src="pixel.gif" width="1" height="1" style="display:none"> | |
| 265 | + | <div style="height:0;overflow:hidden">invisible</div> | |
| 266 | + | <p>More content</p> | |
| 267 | + | </body> | |
| 268 | + | "#; | |
| 269 | + | ||
| 270 | + | let md = convert(html); | |
| 271 | + | assert!(md.contains("Real content")); | |
| 272 | + | assert!(md.contains("More content")); | |
| 273 | + | assert!(!md.contains("invisible")); | |
| 274 | + | assert!(!md.contains("pixel.gif")); | |
| 275 | + | } | |
| 276 | + | ||
| 277 | + | // -- Reply chain tests -- | |
| 278 | + | ||
| 279 | + | #[test] | |
| 280 | + | fn gmail_reply_chain() { | |
| 281 | + | let html = r#" | |
| 282 | + | <body> | |
| 283 | + | <div dir="ltr"> | |
| 284 | + | <p>Thanks, that works for me.</p> | |
| 285 | + | </div> | |
| 286 | + | <div class="gmail_quote"> | |
| 287 | + | <div class="gmail_attr">On Mon, Jan 5, 2026 at 3:00 PM Alice &lt;alice@example.com&gt; wrote:</div> | |
| 288 | + | <blockquote class="gmail_quote"> | |
| 289 | + | <div dir="ltr"> | |
| 290 | + | <p>Can we meet at 3pm instead of 2pm?</p> | |
| 291 | + | </div> | |
| 292 | + | </blockquote> | |
| 293 | + | </div> | |
| 294 | + | </body> | |
| 295 | + | "#; | |
| 296 | + | ||
| 297 | + | let md = convert(html); | |
| 298 | + | assert!(md.contains("Thanks, that works for me.")); | |
| 299 | + | // The gmail_quote div should be rendered as a quote block | |
| 300 | + | assert!(md.contains("> ")); | |
| 301 | + | assert!(md.contains("3pm instead of 2pm")); | |
| 302 | + | } | |
| 303 | + | ||
| 304 | + | #[test] | |
| 305 | + | fn apple_mail_reply() { | |
| 306 | + | let html = r#" | |
| 307 | + | <body> | |
| 308 | + | <div>Sounds good, see you then.</div> | |
| 309 | + | <div> | |
| 310 | + | <br> | |
| 311 | + | <blockquote type="cite"> | |
| 312 | + | <div>Hey, are we still on for lunch?</div> | |
| 313 | + | </blockquote> | |
| 314 | + | </div> | |
| 315 | + | </body> | |
| 316 | + | "#; | |
| 317 | + | ||
| 318 | + | let md = convert(html); | |
| 319 | + | assert!(md.contains("Sounds good, see you then.")); | |
| 320 | + | assert!(md.contains("> ")); | |
| 321 | + | assert!(md.contains("still on for lunch")); | |
| 322 | + | } | |
| 323 | + | ||
| 324 | + | #[test] | |
| 325 | + | fn outlook_reply_with_separator() { | |
| 326 | + | let html = r#" | |
| 327 | + | <body> | |
| 328 | + | <div> | |
| 329 | + | <p>I'll handle it.</p> | |
| 330 | + | </div> | |
| 331 | + | <hr> | |
| 332 | + | <div> | |
| 333 | + | <p>From: Alice Smith<br> | |
| 334 | + | Sent: Monday, January 5, 2026<br> | |
| 335 | + | To: Bob Jones<br> | |
| 336 | + | Subject: Action needed</p> | |
| 337 | + | </div> | |
| 338 | + | <div> | |
| 339 | + | <p>Can you take a look at the report?</p> | |
| 340 | + | </div> | |
| 341 | + | </body> | |
| 342 | + | "#; | |
| 343 | + | ||
| 344 | + | let md = convert(html); | |
| 345 | + | assert!(md.contains("I'll handle it.")); | |
| 346 | + | assert!(md.contains("---")); // hr separator | |
| 347 | + | assert!(md.contains("From: Alice Smith")); | |
| 348 | + | assert!(md.contains("take a look at the report")); | |
| 349 | + | } | |
| 350 | + | ||
| 351 | + | #[test] | |
| 352 | + | fn nested_gmail_reply_chain() { | |
| 353 | + | let html = r#" | |
| 354 | + | <body> | |
| 355 | + | <div dir="ltr"><p>Got it, thanks!</p></div> | |
| 356 | + | <div class="gmail_quote"> | |
| 357 | + | On Tue, Jan 6, Bob wrote: | |
| 358 | + | <blockquote class="gmail_quote"> | |
| 359 | + | <div dir="ltr"><p>Here's the update.</p></div> | |
| 360 | + | <div class="gmail_quote"> | |
| 361 | + | On Mon, Jan 5, Alice wrote: | |
| 362 | + | <blockquote class="gmail_quote"> | |
| 363 | + | <div dir="ltr"><p>What's the status?</p></div> | |
| 364 | + | </blockquote> | |
| 365 | + | </div> | |
| 366 | + | </blockquote> | |
| 367 | + | </div> | |
| 368 | + | </body> | |
| 369 | + | "#; | |
| 370 | + | ||
| 371 | + | let md = convert(html); | |
| 372 | + | assert!(md.contains("Got it, thanks!")); | |
| 373 | + | // Should have nested quoting | |
| 374 | + | assert!(md.contains("> ")); | |
| 375 | + | assert!(md.contains("Here's the update.")); | |
| 376 | + | assert!(md.contains("What's the status?")); | |
| 377 | + | } | |
| 378 | + | ||
| 379 | + | #[test] | |
| 380 | + | fn forwarded_message() { | |
| 381 | + | let html = r#" | |
| 382 | + | <body> | |
| 383 | + | <div><p>FYI, see below.</p></div> | |
| 384 | + | <div class="gmail_quote"> | |
| 385 | + | ---------- Forwarded message ---------- | |
| 386 | + | <blockquote> | |
| 387 | + | <p>From: Alice</p> | |
| 388 | + | <p>The deadline has been moved to Friday.</p> | |
| 389 | + | </blockquote> | |
| 390 | + | </div> | |
| 391 | + | </body> | |
| 392 | + | "#; | |
| 393 | + | ||
| 394 | + | let md = convert(html); | |
| 395 | + | assert!(md.contains("FYI, see below.")); | |
| 396 | + | assert!(md.contains("Forwarded message")); | |
| 397 | + | assert!(md.contains("deadline has been moved")); | |
| 398 | + | } | |
| 399 | + | ||
| 400 | + | #[test] | |
| 401 | + | fn protonmail_reply() { | |
| 402 | + | let html = r#" | |
| 403 | + | <body> | |
| 404 | + | <div>Will do, thanks.</div> | |
| 405 | + | <blockquote class="protonmail_quote" type="cite"> | |
| 406 | + | <div>Please send me the files by EOD.</div> | |
| 407 | + | </blockquote> | |
| 408 | + | </body> | |
| 409 | + | "#; | |
| 410 | + | ||
| 411 | + | let md = convert(html); | |
| 412 | + | assert!(md.contains("Will do, thanks.")); | |
| 413 | + | assert!(md.contains("> ")); | |
| 414 | + | assert!(md.contains("send me the files")); | |
| 415 | + | } | |
| 416 | + | ||
| 417 | + | #[test] | |
| 418 | + | fn attribution_preserved_above_quote() { | |
| 419 | + | let html = r#" | |
| 420 | + | <body> | |
| 421 | + | <p>Agreed.</p> | |
| 422 | + | <div class="gmail_quote"> | |
| 423 | + | On Wed, Jan 7, 2026 at 10:00 AM Carol wrote: | |
| 424 | + | <blockquote> | |
| 425 | + | <p>Let's go with option B.</p> | |
| 426 | + | </blockquote> | |
| 427 | + | </div> | |
| 428 | + | </body> | |
| 429 | + | "#; | |
| 430 | + | ||
| 431 | + | let md = convert(html); | |
| 432 | + | assert!(md.contains("Agreed.")); | |
| 433 | + | // Attribution should appear | |
| 434 | + | assert!(md.contains("Carol wrote:")); | |
| 435 | + | assert!(md.contains("option B")); | |
| 436 | + | } |
| @@ -0,0 +1,94 @@ | |||
| 1 | + | use proptest::prelude::*; | |
| 2 | + | ||
| 3 | + | // Strategy: generate arbitrary HTML-like strings | |
| 4 | + | fn html_fragment() -> impl Strategy<Value = String> { | |
| 5 | + | let tags = prop::sample::select(vec![ | |
| 6 | + | "p", "div", "span", "strong", "em", "a", "h1", "h2", "h3", | |
| 7 | + | "ul", "ol", "li", "blockquote", "pre", "code", "br", "hr", | |
| 8 | + | "img", "table", "tr", "td", "th", "b", "i", "del", "sup", "sub", | |
| 9 | + | ]); | |
| 10 | + | ||
| 11 | + | let text = "[a-zA-Z0-9 .,!?]{0,100}"; | |
| 12 | + | ||
| 13 | + | prop::collection::vec( | |
| 14 | + | prop_oneof![ | |
| 15 | + | // Plain text | |
| 16 | + | text.prop_map(|s| s), | |
| 17 | + | // Opening + closing tag with text | |
| 18 | + | (tags.clone(), text).prop_map(|(tag, content)| { | |
| 19 | + | format!("<{tag}>{content}</{tag}>") | |
| 20 | + | }), | |
| 21 | + | // Self-closing tag | |
| 22 | + | tags.clone().prop_map(|tag| format!("<{tag}/>")), | |
| 23 | + | // Nested tags | |
| 24 | + | (tags.clone(), tags.clone(), text).prop_map(|(outer, inner, content)| { | |
| 25 | + | format!("<{outer}><{inner}>{content}</{inner}></{outer}>") | |
| 26 | + | }), | |
| 27 | + | ], | |
| 28 | + | 1..10, | |
| 29 | + | ) | |
| 30 | + | .prop_map(|parts| parts.join("")) | |
| 31 | + | } | |
| 32 | + | ||
| 33 | + | proptest! { | |
| 34 | + | #[test] | |
| 35 | + | fn never_panics(html in html_fragment()) { | |
| 36 | + | let _ = pter::convert(&html); | |
| 37 | + | } | |
| 38 | + | ||
| 39 | + | #[test] | |
| 40 | + | fn never_panics_on_arbitrary_bytes(s in "\\PC{0,500}") { | |
| 41 | + | let _ = pter::convert(&s); | |
| 42 | + | } | |
| 43 | + | ||
| 44 | + | #[test] | |
| 45 | + | fn output_contains_no_html_tags(html in html_fragment()) { | |
| 46 | + | let md = pter::convert(&html); | |
| 47 | + | // Output should never contain raw HTML tags | |
| 48 | + | // (except inside code blocks, which we skip checking) | |
| 49 | + | let without_code_blocks: String = md | |
| 50 | + | .split("```") | |
| 51 | + | .enumerate() | |
| 52 | + | .filter(|(i, _)| i % 2 == 0) // only outside code blocks | |
| 53 | + | .map(|(_, s)| s) | |
| 54 | + | .collect(); | |
| 55 | + | ||
| 56 | + | // No <script>, <style>, <div>, etc. should leak through | |
| 57 | + | assert!(!without_code_blocks.contains("<script"), "leaked <script> in: {md}"); | |
| 58 | + | assert!(!without_code_blocks.contains("<style"), "leaked <style> in: {md}"); | |
| 59 | + | assert!(!without_code_blocks.contains("<head"), "leaked <head> in: {md}"); | |
| 60 | + | } | |
| 61 | + | ||
| 62 | + | #[test] | |
| 63 | + | fn output_is_valid_utf8(html in html_fragment()) { | |
| 64 | + | let md = pter::convert(&html); | |
| 65 | + | // String type guarantees UTF-8, but verify no replacement chars snuck in | |
| 66 | + | // from bad entity decoding | |
| 67 | + | assert!(!md.contains('\u{FFFD}'), "replacement char in: {md}"); | |
| 68 | + | } | |
| 69 | + | ||
| 70 | + | #[test] | |
| 71 | + | fn no_excessive_blank_lines(html in html_fragment()) { | |
| 72 | + | let md = pter::convert(&html); | |
| 73 | + | assert!(!md.contains("\n\n\n"), "triple newline in output: {md}"); | |
| 74 | + | } | |
| 75 | + | ||
| 76 | + | #[test] | |
| 77 | + | fn no_trailing_whitespace_on_lines(html in html_fragment()) { | |
| 78 | + | let md = pter::convert(&html); | |
| 79 | + | for (i, line) in md.lines().enumerate() { | |
| 80 | + | assert!( | |
| 81 | + | line == line.trim_end(), | |
| 82 | + | "trailing whitespace on line {i}: '{line}'" | |
| 83 | + | ); | |
| 84 | + | } | |
| 85 | + | } | |
| 86 | + | ||
| 87 | + | #[test] | |
| 88 | + | fn empty_input_returns_empty(s in "\\s{0,20}") { | |
| 89 | + | let html = format!("<html><body>{s}</body></html>"); | |
| 90 | + | let md = pter::convert(&html); | |
| 91 | + | // Whitespace-only input should produce empty or whitespace-only output | |
| 92 | + | assert!(md.trim().is_empty() || !s.trim().is_empty()); | |
| 93 | + | } | |
| 94 | + | } |