Skip to main content

max / docengine

11.2 KB · 355 lines History Blame Raw
1 use std::collections::HashMap;
2 use std::path::Path;
3 use std::sync::LazyLock;
4
5 use regex::Regex;
6
7 static LINK_RE: LazyLock<Regex> = LazyLock::new(|| {
8 Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").expect("valid regex")
9 });
10
/// Configuration for the doc loader.
///
/// Derives `Clone` and `Debug` like the other public types in this module so
/// the configuration can be duplicated and logged.
#[derive(Clone, Debug)]
pub struct DocLoaderConfig {
    /// Sections as `(directory_name, display_name)` pairs in display order.
    pub sections: Vec<(String, String)>,
    /// URL prefix for rewritten links (e.g., "/docs").
    pub link_prefix: String,
    /// Pattern that identifies unpublished links to strip (e.g., "unpublished/").
    ///
    /// `None` disables stripping.
    pub unpublished_pattern: Option<String>,
}
20
/// A single documentation page after rendering.
#[derive(Debug, Clone)]
pub struct DocPage {
    /// Human-readable page title.
    pub title: String,
    /// Identifier the page is stored and looked up under.
    pub slug: String,
    /// Display name of the section the page belongs to.
    pub section: String,
    /// Rendered HTML body of the page.
    pub html_content: String,
}
29
/// One row of the docs index page; rows are kept in display order.
#[derive(Debug, Clone)]
pub struct DocIndexEntry {
    /// Title of the page this row refers to.
    pub title: String,
    /// Slug of the page this row refers to.
    pub slug: String,
    /// Display name of the section the page is listed under.
    pub section: String,
}
37
38 /// Entry in the full-text search index, serialised to JSON for client-side search.
39 #[derive(Clone, Debug, serde::Serialize)]
40 pub struct DocSearchEntry {
41 pub slug: String,
42 pub title: String,
43 pub section: String,
44 pub body_text: String,
45 }
46
47 /// In-memory store of rendered documentation pages, built once at startup.
48 #[derive(Clone, Debug)]
49 pub struct DocLoader {
50 pages: HashMap<String, DocPage>,
51 index: Vec<DocIndexEntry>,
52 }
53
54 impl DocLoader {
55 /// Load all `.md` files from `base_path`, rendering them into HTML.
56 ///
57 /// Expects subdirectories matching the configured sections.
58 pub fn load(base_path: &Path, config: &DocLoaderConfig) -> Self {
59 let mut pages = HashMap::new();
60 let mut index = Vec::new();
61
62 for (dir_name, section_display) in &config.sections {
63 let section_path = base_path.join(dir_name);
64 if !section_path.is_dir() {
65 continue;
66 }
67
68 let read_dir = match std::fs::read_dir(&section_path) {
69 Ok(rd) => rd,
70 Err(e) => {
71 tracing::warn!(path = %section_path.display(), error = %e, "Failed to read docs section directory");
72 continue;
73 }
74 };
75
76 let mut entries: Vec<_> = read_dir
77 .filter_map(|e| e.ok())
78 .filter(|e| {
79 e.path()
80 .extension()
81 .map(|ext| ext == "md")
82 .unwrap_or(false)
83 })
84 .collect();
85
86 entries.sort_by_key(|e| e.file_name());
87
88 for entry in entries {
89 let path = entry.path();
90 let slug = path
91 .file_stem()
92 .and_then(|s| s.to_str())
93 .unwrap_or_default()
94 .to_string();
95
96 let raw_md = match std::fs::read_to_string(&path) {
97 Ok(content) => content,
98 Err(_) => continue,
99 };
100
101 let title =
102 crate::text::extract_title(&raw_md).unwrap_or_else(|| slug.clone());
103 let rewritten_md = rewrite_links(
104 &raw_md,
105 &config.link_prefix,
106 config.unpublished_pattern.as_deref(),
107 );
108 let md_without_title = crate::text::strip_first_heading(&rewritten_md);
109 let html_content = crate::render_permissive(&md_without_title);
110 #[cfg(feature = "directives")]
111 let html_content = crate::directives::post_process_directives(&html_content);
112
113 let page = DocPage {
114 title,
115 slug,
116 section: section_display.clone(),
117 html_content,
118 };
119
120 index.push(DocIndexEntry {
121 title: page.title.clone(),
122 slug: page.slug.clone(),
123 section: page.section.clone(),
124 });
125
126 let slug_key = page.slug.clone();
127 pages.insert(slug_key, page);
128 }
129 }
130
131 DocLoader { pages, index }
132 }
133
134 /// Look up a rendered page by slug.
135 pub fn get(&self, slug: &str) -> Option<&DocPage> {
136 self.pages.get(slug)
137 }
138
139 /// Get the full ordered index.
140 pub fn index(&self) -> &[DocIndexEntry] {
141 &self.index
142 }
143
144 /// Build a search index with HTML stripped to plain text.
145 pub fn search_index(&self) -> Vec<DocSearchEntry> {
146 self.index
147 .iter()
148 .filter_map(|entry| {
149 let page = self.pages.get(&entry.slug)?;
150 Some(DocSearchEntry {
151 slug: entry.slug.clone(),
152 title: entry.title.clone(),
153 section: entry.section.clone(),
154 body_text: strip_html_tags(&page.html_content),
155 })
156 })
157 .collect()
158 }
159 }
160
/// Strip HTML tags from a string, returning plain text.
///
/// Everything from a `<` up to the next `>` is dropped and replaced by a
/// single space, so words separated only by markup stay separate. A `>` that
/// appears outside any tag is ordinary text and is kept. Runs of whitespace
/// are then collapsed to single spaces and leading/trailing whitespace is
/// removed.
///
/// Finally, common HTML entities (`&lt;`, `&gt;`, `&quot;`, `&#x27;`, `&#39;`,
/// `&amp;`) are decoded so search indexes match plain-text queries.
fn strip_html_tags(html: &str) -> String {
    let mut out = String::with_capacity(html.len());
    let mut in_tag = false;
    for ch in html.chars() {
        match ch {
            '<' => in_tag = true,
            // Only a `>` that closes an open tag is markup.
            '>' if in_tag => {
                in_tag = false;
                // Add a space after closing tags to separate words.
                if !out.ends_with(' ') {
                    out.push(' ');
                }
            }
            _ if !in_tag => out.push(ch),
            _ => {}
        }
    }

    // Collapse runs of whitespace.
    let collapsed = out.split_whitespace().collect::<Vec<_>>().join(" ");

    // `&amp;` must be decoded last: decoding it first would turn an escaped
    // entity such as `&amp;lt;` into `&lt;` and then wrongly into `<`.
    collapsed
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#x27;", "'")
        .replace("&#39;", "'")
        .replace("&amp;", "&")
}
191
192 /// Rewrite relative `.md` links to the configured prefix.
193 fn rewrite_links(markdown: &str, link_prefix: &str, unpublished_pattern: Option<&str>) -> String {
194 LINK_RE
195 .replace_all(markdown, |caps: &regex::Captures| {
196 let text = &caps[1];
197 let url = &caps[2];
198
199 // Preserve absolute URLs, mailto, and internal routes.
200 if url.starts_with("http://")
201 || url.starts_with("https://")
202 || url.starts_with("mailto:")
203 || url.starts_with('/')
204 {
205 return caps[0].to_string();
206 }
207
208 // Unpublished docs: strip link, keep text.
209 if let Some(pattern) = unpublished_pattern {
210 if url.contains(pattern) {
211 return text.to_string();
212 }
213 }
214
215 // Only rewrite links containing .md
216 if !url.contains(".md") {
217 return caps[0].to_string();
218 }
219
220 // Split off any #anchor.
221 let (path_part, anchor): (&str, Option<&str>) = match url.split_once('#') {
222 Some((p, a)) => (p, Some(a)),
223 None => (url, None),
224 };
225
226 // Extract slug from filename: ../support/faq.md -> faq
227 let filename = path_part
228 .rsplit('/')
229 .next()
230 .unwrap_or(path_part)
231 .trim_end_matches(".md");
232
233 let mut new_url = format!("{link_prefix}/{filename}");
234 if let Some(anchor) = anchor {
235 new_url.push('#');
236 new_url.push_str(anchor);
237 }
238
239 format!("[{text}]({new_url})")
240 })
241 .to_string()
242 }
243
#[cfg(test)]
mod tests {
    use super::*;

    /// Unpublished-link marker shared by most `rewrite_links` tests.
    const UNPUBLISHED: Option<&str> = Some("unpublished/");

    /// Runs `rewrite_links` with the `/docs` prefix and the unpublished marker.
    fn rewrite_docs(md: &str) -> String {
        rewrite_links(md, "/docs", UNPUBLISHED)
    }

    #[test]
    fn rewrite_same_section_link() {
        assert_eq!(
            rewrite_docs("See [SLA](./guarantees.md) for details."),
            "See [SLA](/docs/guarantees) for details."
        );
    }

    #[test]
    fn rewrite_cross_section_link() {
        assert_eq!(
            rewrite_docs("Check [FAQ](../support/faq.md) for more."),
            "Check [FAQ](/docs/faq) for more."
        );
    }

    #[test]
    fn rewrite_unpublished_link_becomes_plain_text() {
        assert_eq!(
            rewrite_docs(
                "See [Content Moderation](../../unpublished/legal/moderation.md) for details."
            ),
            "See Content Moderation for details."
        );
    }

    #[test]
    fn rewrite_preserves_absolute_urls() {
        let input = "Visit [our site](https://example.com) today.";
        assert_eq!(rewrite_docs(input), input);
    }

    #[test]
    fn rewrite_preserves_mailto() {
        let input = "Email [us](mailto:test@example.com)";
        assert_eq!(rewrite_docs(input), input);
    }

    #[test]
    fn rewrite_preserves_internal_routes() {
        let input = "Go to [pricing](/pricing) page.";
        assert_eq!(rewrite_docs(input), input);
    }

    #[test]
    fn rewrite_link_with_anchor() {
        assert_eq!(
            rewrite_docs("See [section](./faq.md#billing)."),
            "See [section](/docs/faq#billing)."
        );
    }

    #[test]
    fn rewrite_public_cross_ref() {
        assert_eq!(
            rewrite_docs("See [Acceptable Use](../../public/legal/acceptable-use.md)."),
            "See [Acceptable Use](/docs/acceptable-use)."
        );
    }

    #[test]
    fn rewrite_custom_prefix() {
        assert_eq!(
            rewrite_links("See [FAQ](./faq.md) here.", "/help", None),
            "See [FAQ](/help/faq) here."
        );
    }

    #[test]
    fn rewrite_no_unpublished_pattern() {
        // With no pattern configured the link is rewritten like any other.
        assert_eq!(
            rewrite_links("See [doc](../../unpublished/foo.md).", "/docs", None),
            "See [doc](/docs/foo)."
        );
    }

    #[test]
    fn rewrite_non_md_link_preserved() {
        let input = "See [image](./photo.png) here.";
        assert_eq!(rewrite_links(input, "/docs", None), input);
    }

    #[test]
    fn strip_html_tags_removes_tags() {
        assert_eq!(
            strip_html_tags("<p>Hello <strong>world</strong></p>"),
            "Hello world"
        );
    }

    #[test]
    fn strip_html_tags_empty_input() {
        assert_eq!(strip_html_tags(""), "");
    }

    #[test]
    fn strip_html_tags_decodes_entities() {
        let cases = [
            ("<p>Price: $10 &amp; free</p>", "Price: $10 & free"),
            ("<p>a &lt; b &gt; c</p>", "a < b > c"),
            (
                "<p>&quot;hello&quot; &amp; &#x27;world&#39;</p>",
                "\"hello\" & 'world'",
            ),
        ];
        for (html, expected) in cases {
            assert_eq!(strip_html_tags(html), expected);
        }
    }

    #[test]
    fn strip_html_tags_nested_tags() {
        assert_eq!(
            strip_html_tags("<div><p>A <em>nested <strong>deep</strong></em> tag</p></div>"),
            "A nested deep tag"
        );
    }
}
355