Skip to main content

max / pter

14.7 KB · 437 lines History Blame Raw
1 use scraper::ElementRef;
2
3 /// Determine whether a `<table>` element is a data table or a layout table.
4 ///
5 /// Email HTML overwhelmingly uses tables for layout. A table is considered
6 /// a **data table** if it has structural indicators of tabular data:
7 /// - Contains `<th>` elements
8 /// - Has a `<caption>` child
9 /// - Has `role="grid"` or `role="table"`
10 /// - Has multiple rows where multiple cells contain substantive text
11 ///
12 /// Everything else is treated as a layout table and unwrapped.
13 pub fn is_data_table(table: ElementRef) -> bool {
14 let el = table.value();
15
16 // role attribute
17 if let Some(role) = el.attr("role") {
18 if role == "grid" || role == "table" {
19 return true;
20 }
21 // role="presentation" is an explicit layout signal
22 if role == "presentation" || role == "none" {
23 return false;
24 }
25 }
26
27 let mut has_th = false;
28 let mut has_caption = false;
29 let mut multi_cell_rows = 0u32;
30
31 for descendant in table.descendants() {
32 if let Some(el_ref) = ElementRef::wrap(descendant) {
33 match el_ref.value().name() {
34 "th" => has_th = true,
35 "caption" => has_caption = true,
36 "tr" => {
37 let cell_count = el_ref
38 .children()
39 .filter_map(ElementRef::wrap)
40 .filter(|c| {
41 let name = c.value().name();
42 (name == "td" || name == "th") && has_substantive_text(*c)
43 })
44 .count();
45 if cell_count > 1 {
46 multi_cell_rows += 1;
47 }
48 }
49 _ => {}
50 }
51 }
52 }
53
54 if has_th || has_caption {
55 return true;
56 }
57
58 // Multiple rows with multiple substantive cells = data table
59 multi_cell_rows >= 2
60 }
61
62 /// Check if an element contains meaningful text (not just whitespace/nbsp).
63 fn has_substantive_text(el: ElementRef) -> bool {
64 let text = el.text().collect::<String>();
65 let trimmed = text.trim().replace('\u{a0}', ""); // strip &nbsp;
66 trimmed.len() > 1 // more than a single character
67 }
68
69 /// Extract rows and cells from a data table for markdown rendering.
70 ///
71 /// Returns (headers, rows) where each is a Vec of cell text strings.
72 /// If no `<thead>`/`<th>` row exists, the first row is used as headers.
73 pub fn extract_table_data(table: ElementRef) -> (Vec<String>, Vec<Vec<String>>) {
74 let mut headers: Vec<String> = Vec::new();
75 let mut rows: Vec<Vec<String>> = Vec::new();
76
77 // Look for thead/th first
78 for descendant in table.children().filter_map(ElementRef::wrap) {
79 let name = descendant.value().name();
80 if name == "thead" {
81 for tr in descendant.children().filter_map(ElementRef::wrap) {
82 if tr.value().name() == "tr" {
83 headers = extract_cells(tr);
84 break; // first row of thead
85 }
86 }
87 } else if name == "tbody" || name == "tr" {
88 let trs: Box<dyn Iterator<Item = ElementRef>> = if name == "tbody" {
89 Box::new(
90 descendant
91 .children()
92 .filter_map(ElementRef::wrap)
93 .filter(|e| e.value().name() == "tr"),
94 )
95 } else {
96 Box::new(std::iter::once(descendant))
97 };
98
99 for tr in trs {
100 let cells = extract_cells(tr);
101 if !cells.is_empty() {
102 // If we haven't found headers yet and this row has <th> cells,
103 // treat it as the header row
104 if headers.is_empty() && has_th_cells(tr) {
105 headers = cells;
106 } else {
107 rows.push(cells);
108 }
109 }
110 }
111 }
112 }
113
114 // If still no headers, promote first data row
115 if headers.is_empty() && !rows.is_empty() {
116 headers = rows.remove(0);
117 }
118
119 (headers, rows)
120 }
121
122 fn extract_cells(tr: ElementRef) -> Vec<String> {
123 tr.children()
124 .filter_map(ElementRef::wrap)
125 .filter(|e| {
126 let n = e.value().name();
127 n == "td" || n == "th"
128 })
129 .map(|cell| {
130 let text = cell.text().collect::<String>();
131 text.split_whitespace().collect::<Vec<_>>().join(" ")
132 })
133 .collect()
134 }
135
136 fn has_th_cells(tr: ElementRef) -> bool {
137 tr.children()
138 .filter_map(ElementRef::wrap)
139 .any(|e| e.value().name() == "th")
140 }
141
/// Render a data table as a GFM markdown table.
///
/// Produces a header row, a `---` separator row and one line per data row,
/// joined by `\n` with no trailing newline (the caller handles spacing).
/// Returns an empty string when `headers` is empty.
///
/// The table is as wide as the widest of `headers` and `rows`; shorter rows
/// (and a shorter header) are padded with empty cells so no cell is dropped.
/// Cell text is made table-safe: `|` is emitted as `\|` and line breaks are
/// replaced by spaces, since either would otherwise split the row.
pub fn render_markdown_table(headers: &[String], rows: &[Vec<String>]) -> String {
    if headers.is_empty() {
        return String::new();
    }

    // Size the table to the widest row so ragged rows are padded, not truncated.
    let col_count = rows.iter().map(Vec::len).fold(headers.len(), usize::max);

    let mut out = String::new();

    // Header row
    push_table_row(&mut out, headers, col_count);
    out.push('\n');

    // Separator row
    out.push('|');
    for _ in 0..col_count {
        out.push_str(" --- |");
    }

    // Data rows
    for row in rows {
        out.push('\n');
        push_table_row(&mut out, row, col_count);
    }

    out
}

/// Append one `| a | b |` line (without newline) padded to `col_count` cells.
fn push_table_row(out: &mut String, cells: &[String], col_count: usize) {
    out.push('|');
    let padded = cells
        .iter()
        .map(Some)
        .chain(std::iter::repeat(None))
        .take(col_count);
    for cell in padded {
        out.push(' ');
        if let Some(text) = cell {
            push_escaped_cell(out, text);
        }
        out.push_str(" |");
    }
}

/// Append cell text, neutralising characters that would break a GFM table row.
fn push_escaped_cell(out: &mut String, text: &str) {
    for ch in text.chars() {
        match ch {
            '|' => out.push_str("\\|"),
            '\n' | '\r' => out.push(' '),
            other => out.push(other),
        }
    }
}
183
#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    /// Parse an HTML snippet into a document that owns the table under test.
    fn parse_table(html: &str) -> Html {
        Html::parse_document(html)
    }

    /// Borrow the first `<table>` element of `doc`; panics if there is none.
    fn select_table(doc: &Html) -> ElementRef<'_> {
        let sel = Selector::parse("table").unwrap();
        doc.select(&sel).next().unwrap()
    }

    // -- is_data_table: structural signals --

    #[test]
    fn single_cell_is_layout() {
        let doc = parse_table("<table><tr><td>content</td></tr></table>");
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn table_with_th_is_data() {
        let doc = parse_table(
            "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn table_with_caption_is_data() {
        let doc = parse_table(
            "<table><caption>Users</caption><tr><td>Alice</td><td>30</td></tr></table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_presentation_is_layout() {
        let doc = parse_table(
            r#"<table role="presentation"><tr><td>layout</td><td>stuff</td></tr></table>"#,
        );
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_grid_is_data() {
        let doc =
            parse_table(r#"<table role="grid"><tr><td>Alice</td><td>30</td></tr></table>"#);
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn multi_row_multi_cell_is_data() {
        let doc = parse_table(
            "<table>\
             <tr><td>Alice</td><td>Engineer</td></tr>\
             <tr><td>Bob</td><td>Designer</td></tr>\
             </table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn spacer_cells_not_substantive() {
        let doc = parse_table(
            "<table><tr><td>content</td><td>&nbsp;</td></tr>\
             <tr><td>more</td><td> </td></tr></table>",
        );
        // Only one substantive cell per row
        assert!(!is_data_table(select_table(&doc)));
    }

    // -- render_markdown_table --

    #[test]
    fn render_simple_table() {
        let headers = vec!["Name".into(), "Age".into()];
        let rows = vec![
            vec!["Alice".into(), "30".into()],
            vec!["Bob".into(), "25".into()],
        ];
        let md = render_markdown_table(&headers, &rows);
        assert_eq!(
            md,
            "| Name | Age |\n| --- | --- |\n| Alice | 30 |\n| Bob | 25 |"
        );
    }

    #[test]
    fn render_empty_headers() {
        let md = render_markdown_table(&[], &[]);
        assert_eq!(md, "");
    }

    // -- extract_table_data: basic shapes --

    #[test]
    fn extract_with_thead() {
        let doc = parse_table(
            "<table><thead><tr><th>A</th><th>B</th></tr></thead>\
             <tbody><tr><td>1</td><td>2</td></tr></tbody></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["A", "B"]);
        assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]);
    }

    #[test]
    fn extract_promotes_first_row() {
        let doc = parse_table(
            "<table><tr><td>Name</td><td>Val</td></tr>\
             <tr><td>X</td><td>Y</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
    }

    // -- Boundary tests for is_data_table role handling --

    #[test]
    fn role_none_is_layout() {
        // role="none" is an explicit layout signal. Guards the "none" half of the
        // presentation-or-none check: without it, "none" would not short-circuit.
        let doc = parse_table(
            r#"<table role="none"><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>"#,
        );
        // Even with <th>, the explicit role="none" should win.
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_table_is_data() {
        // role="table" is a data signal on its own, even for a single-cell table.
        // Guards the "table" half of the grid-or-table check.
        let doc = parse_table(r#"<table role="table"><tr><td>a</td></tr></table>"#);
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_unknown_falls_through_to_structural() {
        // Unknown role: no early decision; structural rules apply.
        // Single-cell single-row layout table is not data.
        let doc =
            parse_table(r#"<table role="banner"><tr><td>only one cell</td></tr></table>"#);
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_presentation_overrides_structure() {
        // role="presentation" means layout, even with multiple substantive rows.
        // Guards against the "presentation" comparison being inverted.
        // `concat!` is used because `\` line continuations are not processed
        // inside raw string literals.
        let doc = parse_table(concat!(
            r#"<table role="presentation"><tr><td>Alice</td><td>Engineer</td></tr>"#,
            "<tr><td>Bob</td><td>Designer</td></tr></table>",
        ));
        assert!(!is_data_table(select_table(&doc)));
    }

    // -- Boundary tests for the has_substantive_text length threshold --

    #[test]
    fn single_char_cells_not_substantive() {
        // Two rows of single-char cells are not substantive, so this is not a
        // data table. Guards against the threshold loosening to "one or more".
        let doc = parse_table(
            "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>",
        );
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn two_char_cells_are_substantive() {
        let doc = parse_table(
            "<table><tr><td>ab</td><td>cd</td></tr><tr><td>ef</td><td>gh</td></tr></table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    // -- Boundary tests for extract_table_data tbody handling --

    #[test]
    fn extract_with_tbody_no_thead() {
        // Guards against <tbody> sections being skipped.
        let doc = parse_table(
            "<table><tbody><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr></tbody></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        // First tbody row promoted to headers; second row is data.
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
    }

    // -- Boundary tests for the headers-vs-th-row decision --

    #[test]
    fn thead_present_blocks_later_th_row_promotion() {
        // Headers already set by thead. A later th-row should NOT overwrite them.
        // Guards the "no headers yet AND row has <th>" condition: if it became
        // an OR, a th-row alone would clobber the thead headers.
        let doc = parse_table(
            "<table><thead><tr><th>A</th><th>B</th></tr></thead>\
             <tbody><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></tbody></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["A", "B"], "thead headers must not be overwritten");
        // Both the th-row and the td-row become data rows.
        assert_eq!(r.len(), 2);
    }

    #[test]
    fn no_thead_th_row_promotes_to_headers() {
        // No thead, but a tr full of th cells: that tr's cells become headers.
        // Guards against has_th_cells always returning false (which would make
        // this row a data row instead).
        let doc = parse_table(
            "<table><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["X", "Y"]);
        assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]);
    }

    #[test]
    fn all_td_rows_promote_first_to_headers() {
        // No th anywhere: has_th_cells is false for every row, so the first row
        // is promoted by the final "no headers but some rows" fallback.
        // The remaining two rows must stay data rows.
        let doc = parse_table(
            "<table><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr><tr><td>P</td><td>Q</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r.len(), 2);
    }

    // -- Boundary test for the has_th_cells tag comparison --

    #[test]
    fn td_only_row_is_not_a_header_row() {
        // A tr with only <td> cells should NOT be treated as a th-row. Guards
        // against the `"th"` comparison being inverted (which would match td
        // cells and treat every td row as a header row).
        let doc = parse_table(
            "<table><tr><td>data-1</td><td>data-2</td></tr>\
             <tr><td>data-3</td><td>data-4</td></tr>\
             <tr><td>data-5</td><td>data-6</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        // First row is promoted (via the fallback at the end), leaving exactly two data rows.
        assert_eq!(h, vec!["data-1", "data-2"]);
        assert_eq!(r.len(), 2, "remaining rows should be data, not headers");
    }
}
437