use std::collections::HashMap; use std::path::Path; use std::sync::LazyLock; use regex::Regex; static LINK_RE: LazyLock = LazyLock::new(|| { Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").expect("valid regex") }); /// Configuration for the doc loader. pub struct DocLoaderConfig { /// Sections as `(directory_name, display_name)` pairs in display order. pub sections: Vec<(String, String)>, /// URL prefix for rewritten links (e.g., "/docs"). pub link_prefix: String, /// Pattern that identifies unpublished links to strip (e.g., "unpublished/"). pub unpublished_pattern: Option, /// Path to directory containing UI example `.html` fragments. /// If set, `[!UI] name` directives are resolved by loading `{examples_path}/{name}.html`. pub examples_path: Option, /// Optional pre-processor applied to raw markdown before link rewriting. /// On `Err`, the page is skipped with a warning. Use to wire /// [`crate::Assumptions::substitute`] or a similar transform. pub pre_process: Option Result + Send + Sync>>, } /// A rendered documentation page. #[derive(Clone, Debug)] pub struct DocPage { pub title: String, pub slug: String, pub section: String, pub html_content: String, } /// Ordered entry for the docs index page. #[derive(Clone, Debug)] pub struct DocIndexEntry { pub title: String, pub slug: String, pub section: String, } /// Entry in the full-text search index, serialised to JSON for client-side search. #[derive(Clone, Debug, serde::Serialize)] pub struct DocSearchEntry { pub slug: String, pub title: String, pub section: String, pub body_text: String, } /// In-memory store of rendered documentation pages, built once at startup. #[derive(Clone, Debug)] pub struct DocLoader { pages: HashMap, index: Vec, } impl DocLoader { /// Load all `.md` files from `base_path`, rendering them into HTML. /// /// Expects subdirectories matching the configured sections. 
pub fn load(base_path: &Path, config: &DocLoaderConfig) -> Self { let mut pages = HashMap::new(); let mut index = Vec::new(); for (dir_name, section_display) in &config.sections { let section_path = base_path.join(dir_name); if !section_path.is_dir() { continue; } let read_dir = match std::fs::read_dir(§ion_path) { Ok(rd) => rd, Err(e) => { tracing::warn!(path = %section_path.display(), error = %e, "Failed to read docs section directory"); continue; } }; let mut entries: Vec<_> = read_dir .filter_map(|e| e.ok()) .filter(|e| { e.path() .extension() .map(|ext| ext == "md") .unwrap_or(false) }) .collect(); entries.sort_by_key(|e| e.file_name()); for entry in entries { let path = entry.path(); let slug = path .file_stem() .and_then(|s| s.to_str()) .unwrap_or_default() .to_string(); let raw_md = match std::fs::read_to_string(&path) { Ok(content) => content, Err(_) => continue, }; let raw_md = match &config.pre_process { Some(pp) => match pp(&raw_md) { Ok(md) => md, Err(e) => { tracing::warn!( path = %path.display(), error = %e, "pre_process failed; skipping page" ); continue; } }, None => raw_md, }; let title = crate::text::extract_title(&raw_md).unwrap_or_else(|| slug.clone()); let rewritten_md = rewrite_links( &raw_md, &config.link_prefix, config.unpublished_pattern.as_deref(), ); let md_without_title = crate::text::strip_first_heading(&rewritten_md); let html_content = crate::render_permissive(&md_without_title); #[cfg(feature = "directives")] let html_content = crate::directives::post_process_directives(&html_content); let html_content = resolve_ui_examples(&html_content, config.examples_path.as_deref()); let page = DocPage { title, slug, section: section_display.clone(), html_content, }; index.push(DocIndexEntry { title: page.title.clone(), slug: page.slug.clone(), section: page.section.clone(), }); let slug_key = page.slug.clone(); pages.insert(slug_key, page); } } DocLoader { pages, index } } /// Look up a rendered page by slug. 
pub fn get(&self, slug: &str) -> Option<&DocPage> { self.pages.get(slug) } /// Get the full ordered index. pub fn index(&self) -> &[DocIndexEntry] { &self.index } /// Build a search index with HTML stripped to plain text. pub fn search_index(&self) -> Vec { self.index .iter() .filter_map(|entry| { let page = self.pages.get(&entry.slug)?; Some(DocSearchEntry { slug: entry.slug.clone(), title: entry.title.clone(), section: entry.section.clone(), body_text: strip_html_tags(&page.html_content), }) }) .collect() } } /// Replace `
` placeholders with /// the contents of `{examples_path}/{name}.html`. /// /// If no examples path is configured or a file is missing, the placeholder is /// replaced with a fallback message. fn resolve_ui_examples(html: &str, examples_path: Option<&Path>) -> String { static UI_PLACEHOLDER: LazyLock = LazyLock::new(|| { Regex::new(r#"
"#) .expect("valid UI placeholder regex") }); if !html.contains("doc-ui-frame") { return html.to_string(); } UI_PLACEHOLDER.replace_all(html, |caps: ®ex::Captures| { let name = &caps[1]; match examples_path { Some(dir) => { let file = dir.join(format!("{name}.html")); match std::fs::read_to_string(&file) { Ok(content) => format!( "
{content}
" ), Err(_) => { tracing::warn!(example = name, "UI example file not found"); format!( "
[UI example: {name}]
" ) } } } None => format!( "
[UI example: {name}]
" ), } }).into_owned() } /// Strip HTML tags from a string, returning plain text. /// Decodes common HTML entities so search indexes match plain-text queries. fn strip_html_tags(html: &str) -> String { let mut out = String::with_capacity(html.len()); let mut in_tag = false; for ch in html.chars() { match ch { '<' => in_tag = true, '>' => { in_tag = false; // Add a space after closing tags to separate words. if !out.ends_with(' ') { out.push(' '); } } _ if !in_tag => out.push(ch), _ => {} } } // Collapse runs of whitespace. let collapsed: String = out.split_whitespace().collect::>().join(" "); // Decode common HTML entities for search index accuracy. collapsed .replace("&", "&") .replace("<", "<") .replace(">", ">") .replace(""", "\"") .replace("'", "'") .replace("'", "'") } /// Rewrite relative `.md` links to the configured prefix. fn rewrite_links(markdown: &str, link_prefix: &str, unpublished_pattern: Option<&str>) -> String { LINK_RE .replace_all(markdown, |caps: ®ex::Captures| { let text = &caps[1]; let url = &caps[2]; // Preserve absolute URLs, mailto, and internal routes. if url.starts_with("http://") || url.starts_with("https://") || url.starts_with("mailto:") || url.starts_with('/') { return caps[0].to_string(); } // Unpublished docs: strip link, keep text. if let Some(pattern) = unpublished_pattern { if url.contains(pattern) { return text.to_string(); } } // Only rewrite links containing .md if !url.contains(".md") { return caps[0].to_string(); } // Split off any #anchor. 
let (path_part, anchor): (&str, Option<&str>) = match url.split_once('#') { Some((p, a)) => (p, Some(a)), None => (url, None), }; // Extract slug from filename: ../support/faq.md -> faq let filename = path_part .rsplit('/') .next() .unwrap_or(path_part) .trim_end_matches(".md"); let mut new_url = format!("{link_prefix}/{filename}"); if let Some(anchor) = anchor { new_url.push('#'); new_url.push_str(anchor); } format!("[{text}]({new_url})") }) .to_string() } #[cfg(test)] mod tests { use super::*; #[test] fn rewrite_same_section_link() { let md = "See [SLA](./guarantees.md) for details."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, "See [SLA](/docs/guarantees) for details."); } #[test] fn rewrite_cross_section_link() { let md = "Check [FAQ](../support/faq.md) for more."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, "Check [FAQ](/docs/faq) for more."); } #[test] fn rewrite_unpublished_link_becomes_plain_text() { let md = "See [Content Moderation](../../unpublished/legal/moderation.md) for details."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, "See Content Moderation for details."); } #[test] fn rewrite_preserves_absolute_urls() { let md = "Visit [our site](https://example.com) today."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, md); } #[test] fn rewrite_preserves_plain_http_urls() { // Distinct from https — catches the `url.starts_with("http://")` arm // mutation (L244 `||` → `&&`). Without this case, the only protocol // tested is https, leaving the http arm uncovered. let md = "Visit [legacy](http://example.com) today."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, md); } #[test] fn rewrite_preserves_external_md_links() { // Absolute URLs that happen to end in .md must NOT be rewritten. 
// This catches the L244 `||` → `&&` mutation: under the mutant, the
        // early-return short-circuit fails (since one URL can't both start
        // with "http://" AND "https://"), so the URL falls through to the
        // .md-rewrite path and gets incorrectly mangled.
        let md_http = "See [external](http://example.com/foo.md).";
        assert_eq!(
            rewrite_links(md_http, "/docs", Some("unpublished/")),
            md_http,
            "http:// + .md must be preserved"
        );
        let md_https = "See [external](https://example.com/foo.md).";
        assert_eq!(
            rewrite_links(md_https, "/docs", Some("unpublished/")),
            md_https,
            "https:// + .md must be preserved"
        );
        let md_mailto = "Email [us](mailto:a@b.md).";
        assert_eq!(
            rewrite_links(md_mailto, "/docs", Some("unpublished/")),
            md_mailto,
            "mailto: + .md must be preserved"
        );
    }

    #[test]
    fn rewrite_preserves_mailto() {
        let md = "Email [us](mailto:test@example.com)";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, md);
    }

    #[test]
    fn rewrite_preserves_internal_routes() {
        let md = "Go to [pricing](/pricing) page.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, md);
    }

    #[test]
    fn rewrite_link_with_anchor() {
        let md = "See [section](./faq.md#billing).";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See [section](/docs/faq#billing).");
    }

    #[test]
    fn rewrite_public_cross_ref() {
        let md = "See [Acceptable Use](../../public/legal/acceptable-use.md).";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See [Acceptable Use](/docs/acceptable-use).");
    }

    #[test]
    fn rewrite_custom_prefix() {
        let md = "See [FAQ](./faq.md) here.";
        let result = rewrite_links(md, "/help", None);
        assert_eq!(result, "See [FAQ](/help/faq) here.");
    }

    #[test]
    fn rewrite_no_unpublished_pattern() {
        let md = "See [doc](../../unpublished/foo.md).";
        let result = rewrite_links(md, "/docs", None);
        // Without the pattern, it just rewrites normally
        assert_eq!(result, "See [doc](/docs/foo).");
    }

    #[test]
    fn rewrite_non_md_link_preserved() {
        let md = "See [image](./photo.png) here.";
        let result = rewrite_links(md, "/docs", None);
        assert_eq!(result, md);
    }

    #[test]
    fn strip_html_tags_removes_tags() {
        let html = "<p>Hello <b>world</b></p>";
        assert_eq!(strip_html_tags(html), "Hello world");
    }

    #[test]
    fn strip_html_tags_empty_input() {
        assert_eq!(strip_html_tags(""), "");
    }

    #[test]
    fn strip_html_tags_decodes_entities() {
        let html = "<p>Price: $10 &amp; free</p>";
        assert_eq!(strip_html_tags(html), "Price: $10 & free");

        let html2 = "<p>a &lt; b &gt; c</p>";
        assert_eq!(strip_html_tags(html2), "a < b > c");

        let html3 = "<p>&quot;hello&quot; &amp; &#39;world&#39;</p>";
        assert_eq!(strip_html_tags(html3), "\"hello\" & 'world'");
    }

    #[test]
    fn strip_html_tags_nested_tags() {
        let html = "<div><p>A <strong>nested <em>deep</em></strong> tag</p></div>";
        assert_eq!(strip_html_tags(html), "A nested deep tag");
    }

    // ── DocLoader::load / get / index / search_index (tempdir fixtures) ──

    fn config_for(base: &Path) -> DocLoaderConfig {
        // Sections listed in display order: "guide" first, then "support".
        let _ = base; // base path lives at the call site; config doesn't need it.
        DocLoaderConfig {
            sections: vec![
                ("guide".to_string(), "Guide".to_string()),
                ("support".to_string(), "Support".to_string()),
            ],
            link_prefix: "/docs".to_string(),
            unpublished_pattern: Some("unpublished/".to_string()),
            examples_path: None,
            pre_process: None,
        }
    }

    #[test]
    fn pre_process_hook_transforms_markdown_before_render() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();
        let p = base.join("guide");
        std::fs::create_dir_all(&p).unwrap();
        std::fs::write(p.join("a.md"), "# Hi\n\nValue: TOKEN").unwrap();

        let mut config = config_for(base);
        config.pre_process = Some(Box::new(|md: &str| Ok(md.replace("TOKEN", "42"))));

        let loader = DocLoader::load(base, &config);
        let page = loader.get("a").expect("page loaded");
        assert!(page.html_content.contains("42"), "got: {}", page.html_content);
        assert!(!page.html_content.contains("TOKEN"), "got: {}", page.html_content);
    }

    #[test]
    fn pre_process_hook_error_skips_page() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();
        let p = base.join("guide");
        std::fs::create_dir_all(&p).unwrap();
        std::fs::write(p.join("good.md"), "# Good").unwrap();
        std::fs::write(p.join("bad.md"), "# Bad\n\n{{ missing }}").unwrap();

        let mut config = config_for(base);
        config.pre_process = Some(Box::new(|md: &str| {
            if md.contains("{{") {
                Err("unresolved placeholder".into())
            } else {
                Ok(md.to_string())
            }
        }));

        let loader = DocLoader::load(base, &config);
        assert!(loader.get("good").is_some(), "good page should load");
        assert!(loader.get("bad").is_none(), "bad page should be skipped");
    }

    /// Write `content` to `base/rel`, creating parent directories as needed.
    fn write(base: &Path, rel: &str, content: &str) {
        let p = base.join(rel);
        std::fs::create_dir_all(p.parent().unwrap()).unwrap();
        std::fs::write(p, content).unwrap();
    }

    #[test]
    fn load_indexes_pages_across_sections_in_display_order() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();

        // Guide section: two pages, with the file order intentionally reversed
        // from desired sort order to confirm `entries.sort_by_key(file_name)`.
        write(base, "guide/zzz-last.md", "# Z Page\n\nzzz body");
        write(base, "guide/aaa-first.md", "# A Page\n\naaa body");
        // Support section: one page.
        write(base, "support/faq.md", "# FAQ\n\nfaq body");

        let loader = DocLoader::load(base, &config_for(base));
        let idx = loader.index();
        assert_eq!(idx.len(), 3, "expected 3 indexed pages, got: {idx:?}");

        // Sections appear in config order; entries within a section in
        // sort_by_key(file_name) order.
        assert_eq!(idx[0].slug, "aaa-first");
        assert_eq!(idx[0].section, "Guide");
        assert_eq!(idx[1].slug, "zzz-last");
        assert_eq!(idx[1].section, "Guide");
        assert_eq!(idx[2].slug, "faq");
        assert_eq!(idx[2].section, "Support");
    }

    #[test]
    fn load_extracts_title_from_first_heading_and_strips_it_from_body() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();
        write(base, "guide/welcome.md", "# Welcome Title\n\nBody paragraph here.");

        let loader = DocLoader::load(base, &config_for(base));
        let page = loader.get("welcome").expect("page should be indexed");
        assert_eq!(page.title, "Welcome Title");
        // The H1 itself must be stripped from html_content — only body remains.
        assert!(
            !page.html_content.contains("Welcome Title"),
            "title leaked into body: {}",
            page.html_content
        );
        assert!(page.html_content.contains("Body paragraph here"));
    }

    #[test]
    fn load_falls_back_to_slug_when_no_title_heading() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();
        write(base, "guide/no-title.md", "Body without any heading.");

        let loader = DocLoader::load(base, &config_for(base));
        let page = loader.get("no-title").unwrap();
        assert_eq!(page.title, "no-title");
    }

    #[test]
    fn load_skips_non_markdown_files() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();
        write(base, "guide/keep.md", "# Keep\n\nbody");
        write(base, "guide/ignore.txt", "should not be indexed");
        write(base, "guide/also-ignore.json", "{}");

        let loader = DocLoader::load(base, &config_for(base));
        assert_eq!(loader.index().len(), 1);
        assert_eq!(loader.index()[0].slug, "keep");
    }

    #[test]
    fn load_skips_missing_section_directories() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();
        // Only `guide` exists; `support` is missing entirely.
        write(base, "guide/page.md", "# Page\n\nbody");

        let loader = DocLoader::load(base, &config_for(base));
        // Should index the one page that exists, not panic on the missing dir.
        assert_eq!(loader.index().len(), 1);
        assert_eq!(loader.index()[0].slug, "page");
    }

    #[test]
    fn load_rewrites_relative_md_links() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();
        write(
            base,
            "guide/main.md",
            "# Main\n\nSee [FAQ](../support/faq.md) for help.",
        );
        write(base, "support/faq.md", "# FAQ\n\nFAQ body.");

        let loader = DocLoader::load(base, &config_for(base));
        let main = loader.get("main").unwrap();
        // The .md link must be rewritten to /docs/faq; the original
        // `../support/faq.md` path must not appear.
        assert!(
            main.html_content.contains("/docs/faq"),
            "link not rewritten: {}",
            main.html_content
        );
        assert!(
            !main.html_content.contains("faq.md"),
            "raw .md path leaked: {}",
            main.html_content
        );
    }

    #[test]
    fn get_returns_none_for_unknown_slug() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();
        write(base, "guide/exists.md", "# Exists\n\nbody");

        let loader = DocLoader::load(base, &config_for(base));
        assert!(loader.get("nope").is_none());
        assert!(loader.get("exists").is_some());
    }

    #[test]
    fn search_index_strips_html_and_preserves_metadata() {
        let tmp = tempfile::tempdir().unwrap();
        let base = tmp.path();
        write(
            base,
            "guide/with-html.md",
            "# Title\n\nA **bold** word and an `inline code` token.",
        );

        let loader = DocLoader::load(base, &config_for(base));
        let entries = loader.search_index();
        assert_eq!(entries.len(), 1);
        let e = &entries[0];
        assert_eq!(e.slug, "with-html");
        assert_eq!(e.title, "Title");
        assert_eq!(e.section, "Guide");
        // body_text must be plain text — no surviving tags.
        assert!(!e.body_text.contains('<'), "tag leaked into search: {}", e.body_text);
        assert!(e.body_text.contains("bold"));
        assert!(e.body_text.contains("inline code"));
    }

    // ── resolve_ui_examples ──
    //
    // NOTE(review): the placeholder and fragment markup in these fixtures was
    // reconstructed; it must match the pattern `resolve_ui_examples` looks for.

    #[test]
    fn resolve_ui_examples_inlines_file_contents_when_present() {
        let tmp = tempfile::tempdir().unwrap();
        let dir = tmp.path();
        std::fs::write(dir.join("cart.html"), "<button>Add to cart</button>").unwrap();

        let html = r#"<div class="doc-ui-frame" data-ui="cart"></div>"#;
        let result = resolve_ui_examples(html, Some(dir));
        assert!(result.contains("<button>Add to cart</button>"));
        // The data-ui attribute is consumed during inlining.
        assert!(!result.contains(r#"data-ui="cart""#));
    }

    #[test]
    fn resolve_ui_examples_falls_back_when_file_missing() {
        let tmp = tempfile::tempdir().unwrap();
        let dir = tmp.path();
        // examples_path exists but file does not.
        let html = r#"<div class="doc-ui-frame" data-ui="ghost"></div>"#;
        let result = resolve_ui_examples(html, Some(dir));
        assert!(result.contains("doc-ui-missing"));
        assert!(result.contains("[UI example: ghost]"));
    }

    #[test]
    fn resolve_ui_examples_falls_back_when_no_examples_path() {
        // Pins the `None` arm of the `match examples_path`.
        let html = r#"<div class="doc-ui-frame" data-ui="anything"></div>"#;
        let result = resolve_ui_examples(html, None);
        assert!(result.contains("doc-ui-missing"));
        assert!(result.contains("[UI example: anything]"));
    }

    #[test]
    fn resolve_ui_examples_short_circuits_when_no_placeholder() {
        // Pins the `if !html.contains("doc-ui-frame") { return html.to_string(); }` early return.
        let html = "<p>Just regular HTML, no placeholders.</p>";
        let result = resolve_ui_examples(html, None);
        assert_eq!(result, html);
    }
}