use scraper::ElementRef; /// Determine whether a `` element is a data table or a layout table. /// /// Email HTML overwhelmingly uses tables for layout. A table is considered /// a **data table** if it has structural indicators of tabular data: /// - Contains ``/`
` elements /// - Has a `
` child /// - Has `role="grid"` or `role="table"` /// - Has multiple rows where multiple cells contain substantive text /// /// Everything else is treated as a layout table and unwrapped. pub fn is_data_table(table: ElementRef) -> bool { let el = table.value(); // role attribute if let Some(role) = el.attr("role") { if role == "grid" || role == "table" { return true; } // role="presentation" is an explicit layout signal if role == "presentation" || role == "none" { return false; } } let mut has_th = false; let mut has_caption = false; let mut multi_cell_rows = 0u32; for descendant in table.descendants() { if let Some(el_ref) = ElementRef::wrap(descendant) { match el_ref.value().name() { "th" => has_th = true, "caption" => has_caption = true, "tr" => { let cell_count = el_ref .children() .filter_map(ElementRef::wrap) .filter(|c| { let name = c.value().name(); (name == "td" || name == "th") && has_substantive_text(*c) }) .count(); if cell_count > 1 { multi_cell_rows += 1; } } _ => {} } } } if has_th || has_caption { return true; } // Multiple rows with multiple substantive cells = data table multi_cell_rows >= 2 } /// Check if an element contains meaningful text (not just whitespace/nbsp). fn has_substantive_text(el: ElementRef) -> bool { let text = el.text().collect::(); let trimmed = text.trim().replace('\u{a0}', ""); // strip   trimmed.len() > 1 // more than a single character } /// Extract rows and cells from a data table for markdown rendering. /// /// Returns (headers, rows) where each is a Vec of cell text strings. /// If no `
` row exists, the first row is used as headers. pub fn extract_table_data(table: ElementRef) -> (Vec, Vec>) { let mut headers: Vec = Vec::new(); let mut rows: Vec> = Vec::new(); // Look for thead/th first for descendant in table.children().filter_map(ElementRef::wrap) { let name = descendant.value().name(); if name == "thead" { for tr in descendant.children().filter_map(ElementRef::wrap) { if tr.value().name() == "tr" { headers = extract_cells(tr); break; // first row of thead } } } else if name == "tbody" || name == "tr" { let trs: Box> = if name == "tbody" { Box::new( descendant .children() .filter_map(ElementRef::wrap) .filter(|e| e.value().name() == "tr"), ) } else { Box::new(std::iter::once(descendant)) }; for tr in trs { let cells = extract_cells(tr); if !cells.is_empty() { // If we haven't found headers yet and this row has cells, // treat it as the header row if headers.is_empty() && has_th_cells(tr) { headers = cells; } else { rows.push(cells); } } } } } // If still no headers, promote first data row if headers.is_empty() && !rows.is_empty() { headers = rows.remove(0); } (headers, rows) } fn extract_cells(tr: ElementRef) -> Vec { tr.children() .filter_map(ElementRef::wrap) .filter(|e| { let n = e.value().name(); n == "td" || n == "th" }) .map(|cell| { let text = cell.text().collect::(); text.split_whitespace().collect::>().join(" ") }) .collect() } fn has_th_cells(tr: ElementRef) -> bool { tr.children() .filter_map(ElementRef::wrap) .any(|e| e.value().name() == "th") } /// Render a data table as a GFM markdown table. 
///
/// Returns an empty string when `headers` is empty. Otherwise the output is a
/// header row, a `---` separator row and one line per entry of `rows`, with no
/// trailing newline (the caller handles spacing).
///
/// The table is as wide as the widest of `headers` and every row; shorter
/// rows (and a shorter header) are padded with empty cells so that no cell is
/// silently dropped. A `|` inside a cell is escaped as `\|` and line breaks
/// are flattened to spaces, because either would otherwise break the row.
pub fn render_markdown_table(headers: &[String], rows: &[Vec<String>]) -> String {
    if headers.is_empty() {
        return String::new();
    }

    let col_count = rows.iter().map(Vec::len).fold(headers.len(), usize::max);
    let mut out = String::new();

    // Header row
    push_row(&mut out, headers, col_count);

    // Separator row
    out.push('|');
    for _ in 0..col_count {
        out.push_str(" --- |");
    }
    out.push('\n');

    // Data rows
    for row in rows {
        push_row(&mut out, row, col_count);
    }

    // Remove trailing newline (caller handles spacing)
    out.truncate(out.trim_end().len());
    out
}

/// Append one table line with exactly `col_count` cells, padding with empty
/// cells when `cells` is shorter.
fn push_row(out: &mut String, cells: &[String], col_count: usize) {
    out.push('|');
    for i in 0..col_count {
        push_cell(out, cells.get(i).map_or("", String::as_str));
    }
    out.push('\n');
}

/// Append a single ` text |` cell, neutralising characters that would end the
/// cell or the row early.
fn push_cell(out: &mut String, cell: &str) {
    out.push(' ');
    for ch in cell.chars() {
        match ch {
            '|' => out.push_str("\\|"),
            '\n' | '\r' => out.push(' '),
            _ => out.push(ch),
        }
    }
    out.push_str(" |");
}

#[cfg(test)]
mod tests {
    use super::*;
    use scraper::{Html, Selector};

    fn parse_table(html: &str) -> Html {
        Html::parse_document(html)
    }

    fn select_table(doc: &Html) -> ElementRef<'_> {
        let sel = Selector::parse("table").unwrap();
        doc.select(&sel).next().unwrap()
    }

    #[test]
    fn single_cell_is_layout() {
        let doc = parse_table("<table><tr><td>content</td></tr></table>");
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn table_with_th_is_data() {
        let doc = parse_table(
            "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>30</td></tr></table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn table_with_caption_is_data() {
        let doc = parse_table(
            "<table><caption>Users</caption><tr><td>Alice</td><td>30</td></tr></table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_presentation_is_layout() {
        let doc = parse_table(
            r#"<table role="presentation"><tr><td>layout</td><td>stuff</td></tr></table>"#,
        );
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_grid_is_data() {
        let doc = parse_table(r#"<table role="grid"><tr><td>Alice</td><td>30</td></tr></table>"#);
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn multi_row_multi_cell_is_data() {
        let doc = parse_table(
            "<table>\
             <tr><td>Alice</td><td>Engineer</td></tr>\
             <tr><td>Bob</td><td>Designer</td></tr>\
             </table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn spacer_cells_not_substantive() {
        let doc = parse_table(
            "<table>\
             <tr><td>content</td><td>&nbsp;</td></tr>\
             <tr><td></td><td>more</td></tr>\
             </table>",
        );
        // Only one substantive cell per row
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn render_simple_table() {
        let headers = vec!["Name".into(), "Age".into()];
        let rows = vec![
            vec!["Alice".into(), "30".into()],
            vec!["Bob".into(), "25".into()],
        ];
        let md = render_markdown_table(&headers, &rows);
        assert_eq!(
            md,
            "| Name | Age |\n| --- | --- |\n| Alice | 30 |\n| Bob | 25 |"
        );
    }

    #[test]
    fn render_empty_headers() {
        let md = render_markdown_table(&[], &[]);
        assert_eq!(md, "");
    }

    #[test]
    fn render_escapes_pipes_and_flattens_line_breaks() {
        // A raw `|` would split the cell and a raw newline would end the row.
        let headers = vec!["a|b".to_string(), "c".to_string()];
        let rows = vec![vec!["x\ny".to_string(), "z".to_string()]];
        let md = render_markdown_table(&headers, &rows);
        assert_eq!(md, "| a\\|b | c |\n| --- | --- |\n| x y | z |");
    }

    #[test]
    fn render_pads_short_rows() {
        let headers = vec!["A".to_string(), "B".to_string()];
        let rows = vec![vec!["1".to_string()]];
        let md = render_markdown_table(&headers, &rows);
        assert_eq!(md, "| A | B |\n| --- | --- |\n| 1 |  |");
    }

    #[test]
    fn render_widens_to_longest_row() {
        // A row wider than the header must not lose its extra cells.
        let headers = vec!["h".to_string()];
        let rows = vec![
            vec!["1".to_string(), "2".to_string()],
            vec!["3".to_string()],
        ];
        let md = render_markdown_table(&headers, &rows);
        assert_eq!(md, "| h |  |\n| --- | --- |\n| 1 | 2 |\n| 3 |  |");
    }

    #[test]
    fn extract_with_thead() {
        let doc = parse_table(
            "<table>\
             <thead><tr><th>A</th><th>B</th></tr></thead>\
             <tbody><tr><td>1</td><td>2</td></tr></tbody>\
             </table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["A", "B"]);
        assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]);
    }

    #[test]
    fn extract_promotes_first_row() {
        let doc = parse_table(
            "<table>\
             <tr><td>Name</td><td>Val</td></tr>\
             <tr><td>X</td><td>Y</td></tr>\
             </table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
    }

    // -- Boundary tests for is_data_table role handling --

    #[test]
    fn role_none_is_layout() {
        // role="none" is an explicit layout signal. Guards the "none" half of
        // the presentation-or-none check: without it, "none" would not
        // short-circuit and the <th> cells would decide.
        let doc = parse_table(
            r#"<table role="none"><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></table>"#,
        );
        // Even with <th>, the explicit role="none" should win.
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_table_is_data() {
        // role="table" means data. Guards the "table" half of the
        // grid-or-table check; the single cell offers no structural signal.
        let doc = parse_table(r#"<table role="table"><tr><td>a</td></tr></table>"#);
        assert!(is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_unknown_falls_through_to_structural() {
        // Unknown role: no early decision; structural rules apply.
        // Single-cell single-row layout table is not data.
        let doc = parse_table(r#"<table role="banner"><tr><td>only one cell</td></tr></table>"#);
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn role_presentation_overrides_structure() {
        // role="presentation" means layout, even with multiple substantive rows.
        // Guards against the "presentation" comparison being inverted, which
        // would let the structural rules classify this table as data.
        let doc = parse_table(
            r#"<table role="presentation"><tr><td>Alice</td><td>Engineer</td></tr><tr><td>Bob</td><td>Designer</td></tr></table>"#,
        );
        assert!(!is_data_table(select_table(&doc)));
    }

    // -- Boundary tests for the has_substantive_text "more than one character" rule --

    #[test]
    fn single_char_cells_not_substantive() {
        // Two rows of single-char cells are not substantive, so this is not a
        // data table. Guards the strictness of the threshold: if exactly one
        // character counted, these two rows would qualify as a data table.
        let doc = parse_table(
            "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td><td>d</td></tr></table>",
        );
        assert!(!is_data_table(select_table(&doc)));
    }

    #[test]
    fn two_char_cells_are_substantive() {
        let doc = parse_table(
            "<table><tr><td>ab</td><td>cd</td></tr><tr><td>ef</td><td>gh</td></tr></table>",
        );
        assert!(is_data_table(select_table(&doc)));
    }

    // -- Boundary tests for extract_table_data tbody handling --

    #[test]
    fn extract_with_tbody_no_thead() {
        // Guards the tbody branch: if tbody were skipped, nothing would be extracted.
        let doc = parse_table(
            "<table><tbody><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr></tbody></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        // First tbody row promoted to headers; second row is data.
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r, vec![vec!["X".to_string(), "Y".to_string()]]);
    }

    // -- Boundary tests for the headers-vs-th-row decision --

    #[test]
    fn thead_present_blocks_later_th_row_promotion() {
        // Headers already set by thead. A later th-row should NOT overwrite them.
        // Guards the "headers empty AND row has th" condition: with OR,
        // has_th_cells alone would re-promote, clobbering the thead headers.
        let doc = parse_table(
            "<table>\
             <thead><tr><th>A</th><th>B</th></tr></thead>\
             <tbody><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></tbody>\
             </table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["A", "B"], "thead headers must not be overwritten");
        // Both the th-row and the td-row become data rows.
        assert_eq!(r.len(), 2);
    }

    #[test]
    fn no_thead_th_row_promotes_to_headers() {
        // No thead, but a tr full of th cells: that tr's cells become headers.
        // Guards against has_th_cells always returning false (which would
        // make this row become a data row instead).
        let doc = parse_table(
            "<table><tbody><tr><th>X</th><th>Y</th></tr><tr><td>1</td><td>2</td></tr></tbody></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["X", "Y"]);
        assert_eq!(r, vec![vec!["1".to_string(), "2".to_string()]]);
    }

    #[test]
    fn all_td_rows_promote_first_to_headers() {
        // No th anywhere, so has_th_cells is false for every row and the first
        // row is promoted by the final "no headers but some rows" fallback.
        // Guards against has_th_cells always returning true (which would
        // promote a row in the loop rather than through the fallback).
        let doc = parse_table(
            "<table><tr><td>Name</td><td>Val</td></tr><tr><td>X</td><td>Y</td></tr><tr><td>P</td><td>Q</td></tr></table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        assert_eq!(h, vec!["Name", "Val"]);
        assert_eq!(r.len(), 2);
    }

    // -- Boundary test for has_th_cells --

    #[test]
    fn td_only_row_is_not_a_header_row() {
        // A tr with only <td> cells should NOT promote to headers when other
        // rows exist. Guards the `== "th"` comparison in has_th_cells: if it
        // matched td cells, every td row would be treated as a header row.
        let doc = parse_table(
            "<table>\
             <tr><td>data-1</td><td>data-2</td></tr>\
             <tr><td>data-3</td><td>data-4</td></tr>\
             <tr><td>data-5</td><td>data-6</td></tr>\
             </table>",
        );
        let (h, r) = extract_table_data(select_table(&doc));
        // First row is promoted (via the fallback at the end), leaving exactly two data rows.
        assert_eq!(h, vec!["data-1", "data-2"]);
        assert_eq!(r.len(), 2, "remaining rows should be data, not headers");
    }
}