use std::collections::HashMap; use std::path::Path; use std::sync::LazyLock; use regex::Regex; static LINK_RE: LazyLock = LazyLock::new(|| { Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").expect("valid regex") }); /// Configuration for the doc loader. pub struct DocLoaderConfig { /// Sections as `(directory_name, display_name)` pairs in display order. pub sections: Vec<(String, String)>, /// URL prefix for rewritten links (e.g., "/docs"). pub link_prefix: String, /// Pattern that identifies unpublished links to strip (e.g., "unpublished/"). pub unpublished_pattern: Option, } /// A rendered documentation page. #[derive(Clone, Debug)] pub struct DocPage { pub title: String, pub slug: String, pub section: String, pub html_content: String, } /// Ordered entry for the docs index page. #[derive(Clone, Debug)] pub struct DocIndexEntry { pub title: String, pub slug: String, pub section: String, } /// Entry in the full-text search index, serialised to JSON for client-side search. #[derive(Clone, Debug, serde::Serialize)] pub struct DocSearchEntry { pub slug: String, pub title: String, pub section: String, pub body_text: String, } /// In-memory store of rendered documentation pages, built once at startup. #[derive(Clone, Debug)] pub struct DocLoader { pages: HashMap, index: Vec, } impl DocLoader { /// Load all `.md` files from `base_path`, rendering them into HTML. /// /// Expects subdirectories matching the configured sections. pub fn load(base_path: &Path, config: &DocLoaderConfig) -> Self { let mut pages = HashMap::new(); let mut index = Vec::new(); for (dir_name, section_display) in &config.sections { let section_path = base_path.join(dir_name); if !section_path.is_dir() { continue; } let read_dir = match std::fs::read_dir(§ion_path) { Ok(rd) => rd, Err(e) => { tracing::warn!(path = %section_path.display(), error = %e, "Failed to read docs section directory"); continue; } }; let mut entries: Vec<_> = read_dir .filter_map(|e| e.ok()) .filter(|e| { e.path() .extension() .map(|ext| ext == "md") .unwrap_or(false) }) .collect(); entries.sort_by_key(|e| e.file_name()); for entry in entries { let path = entry.path(); let slug = path .file_stem() .and_then(|s| s.to_str()) .unwrap_or_default() .to_string(); let raw_md = match std::fs::read_to_string(&path) { Ok(content) => content, Err(_) => continue, }; let title = crate::text::extract_title(&raw_md).unwrap_or_else(|| slug.clone()); let rewritten_md = rewrite_links( &raw_md, &config.link_prefix, config.unpublished_pattern.as_deref(), ); let md_without_title = crate::text::strip_first_heading(&rewritten_md); let html_content = crate::render_permissive(&md_without_title); #[cfg(feature = "directives")] let html_content = crate::directives::post_process_directives(&html_content); let page = DocPage { title, slug, section: section_display.clone(), html_content, }; index.push(DocIndexEntry { title: page.title.clone(), slug: page.slug.clone(), section: page.section.clone(), }); let slug_key = page.slug.clone(); pages.insert(slug_key, page); } } DocLoader { pages, index } } /// Look up a rendered page by slug. pub fn get(&self, slug: &str) -> Option<&DocPage> { self.pages.get(slug) } /// Get the full ordered index. pub fn index(&self) -> &[DocIndexEntry] { &self.index } /// Build a search index with HTML stripped to plain text. pub fn search_index(&self) -> Vec { self.index .iter() .filter_map(|entry| { let page = self.pages.get(&entry.slug)?; Some(DocSearchEntry { slug: entry.slug.clone(), title: entry.title.clone(), section: entry.section.clone(), body_text: strip_html_tags(&page.html_content), }) }) .collect() } } /// Strip HTML tags from a string, returning plain text. /// Decodes common HTML entities so search indexes match plain-text queries. fn strip_html_tags(html: &str) -> String { let mut out = String::with_capacity(html.len()); let mut in_tag = false; for ch in html.chars() { match ch { '<' => in_tag = true, '>' => { in_tag = false; // Add a space after closing tags to separate words. if !out.ends_with(' ') { out.push(' '); } } _ if !in_tag => out.push(ch), _ => {} } } // Collapse runs of whitespace. let collapsed: String = out.split_whitespace().collect::>().join(" "); // Decode common HTML entities for search index accuracy. collapsed .replace("&", "&") .replace("<", "<") .replace(">", ">") .replace(""", "\"") .replace("'", "'") .replace("'", "'") } /// Rewrite relative `.md` links to the configured prefix. fn rewrite_links(markdown: &str, link_prefix: &str, unpublished_pattern: Option<&str>) -> String { LINK_RE .replace_all(markdown, |caps: ®ex::Captures| { let text = &caps[1]; let url = &caps[2]; // Preserve absolute URLs, mailto, and internal routes. if url.starts_with("http://") || url.starts_with("https://") || url.starts_with("mailto:") || url.starts_with('/') { return caps[0].to_string(); } // Unpublished docs: strip link, keep text. if let Some(pattern) = unpublished_pattern { if url.contains(pattern) { return text.to_string(); } } // Only rewrite links containing .md if !url.contains(".md") { return caps[0].to_string(); } // Split off any #anchor. let (path_part, anchor): (&str, Option<&str>) = match url.split_once('#') { Some((p, a)) => (p, Some(a)), None => (url, None), }; // Extract slug from filename: ../support/faq.md -> faq let filename = path_part .rsplit('/') .next() .unwrap_or(path_part) .trim_end_matches(".md"); let mut new_url = format!("{link_prefix}/{filename}"); if let Some(anchor) = anchor { new_url.push('#'); new_url.push_str(anchor); } format!("[{text}]({new_url})") }) .to_string() } #[cfg(test)] mod tests { use super::*; #[test] fn rewrite_same_section_link() { let md = "See [SLA](./guarantees.md) for details."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, "See [SLA](/docs/guarantees) for details."); } #[test] fn rewrite_cross_section_link() { let md = "Check [FAQ](../support/faq.md) for more."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, "Check [FAQ](/docs/faq) for more."); } #[test] fn rewrite_unpublished_link_becomes_plain_text() { let md = "See [Content Moderation](../../unpublished/legal/moderation.md) for details."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, "See Content Moderation for details."); } #[test] fn rewrite_preserves_absolute_urls() { let md = "Visit [our site](https://example.com) today."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, md); } #[test] fn rewrite_preserves_mailto() { let md = "Email [us](mailto:test@example.com)"; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, md); } #[test] fn rewrite_preserves_internal_routes() { let md = "Go to [pricing](/pricing) page."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, md); } #[test] fn rewrite_link_with_anchor() { let md = "See [section](./faq.md#billing)."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, "See [section](/docs/faq#billing)."); } #[test] fn rewrite_public_cross_ref() { let md = "See [Acceptable Use](../../public/legal/acceptable-use.md)."; let result = rewrite_links(md, "/docs", Some("unpublished/")); assert_eq!(result, "See [Acceptable Use](/docs/acceptable-use)."); } #[test] fn rewrite_custom_prefix() { let md = "See [FAQ](./faq.md) here."; let result = rewrite_links(md, "/help", None); assert_eq!(result, "See [FAQ](/help/faq) here."); } #[test] fn rewrite_no_unpublished_pattern() { let md = "See [doc](../../unpublished/foo.md)."; let result = rewrite_links(md, "/docs", None); // Without the pattern, it just rewrites normally assert_eq!(result, "See [doc](/docs/foo)."); } #[test] fn rewrite_non_md_link_preserved() { let md = "See [image](./photo.png) here."; let result = rewrite_links(md, "/docs", None); assert_eq!(result, md); } #[test] fn strip_html_tags_removes_tags() { let html = "

Hello world

"; assert_eq!(strip_html_tags(html), "Hello world"); } #[test] fn strip_html_tags_empty_input() { assert_eq!(strip_html_tags(""), ""); } #[test] fn strip_html_tags_decodes_entities() { let html = "

Price: $10 & free

"; assert_eq!(strip_html_tags(html), "Price: $10 & free"); let html2 = "

a < b > c

"; assert_eq!(strip_html_tags(html2), "a < b > c"); let html3 = "

"hello" & 'world'

"; assert_eq!(strip_html_tags(html3), "\"hello\" & 'world'"); } #[test] fn strip_html_tags_nested_tags() { let html = "

A nested deep tag

"; assert_eq!(strip_html_tags(html), "A nested deep tag"); } }