max / docengine

11.2 KB · 355 lines History Blame Raw

1	use std::collections::HashMap;
2	use std::path::Path;
3	use std::sync::LazyLock;
4
5	use regex::Regex;
6
7	static LINK_RE: LazyLock<Regex> = LazyLock::new(\|\| {
8	Regex::new(r"\[([^\]]+)\]$([^)]+)$").expect("valid regex")
9	});
10
/// Configuration for the doc loader.
///
/// Derives `Clone` and `Debug` so it can be duplicated and logged like the
/// other public types in this module.
#[derive(Clone, Debug)]
pub struct DocLoaderConfig {
    /// Sections as `(directory_name, display_name)` pairs in display order.
    ///
    /// The directory name is joined onto the loader's base path; the display
    /// name is copied onto every page found in that directory.
    pub sections: Vec<(String, String)>,
    /// URL prefix for rewritten links (e.g., "/docs").
    pub link_prefix: String,
    /// Pattern that identifies unpublished links to strip (e.g., "unpublished/").
    ///
    /// `None` disables stripping.
    pub unpublished_pattern: Option<String>,
}
20
/// A rendered documentation page.
///
/// One value is produced per `.md` file by `DocLoader::load` and stored in
/// the loader's page map under its `slug`.
#[derive(Clone, Debug)]
pub struct DocPage {
    /// Title returned by `crate::text::extract_title` for the raw Markdown,
    /// or the slug when no title is found.
    pub title: String,
    /// File stem of the source `.md` file; also the lookup key for the page.
    pub slug: String,
    /// Display name of the section the page was loaded from.
    pub section: String,
    /// Rendered HTML for the page body (links rewritten, first heading
    /// stripped before rendering).
    pub html_content: String,
}
29
/// Ordered entry for the docs index page.
///
/// Entries are appended in the order pages are loaded: sections in their
/// configured order, files within a section sorted by file name.
#[derive(Clone, Debug)]
pub struct DocIndexEntry {
    /// Page title, copied from the corresponding `DocPage`.
    pub title: String,
    /// Page slug, copied from the corresponding `DocPage`.
    pub slug: String,
    /// Section display name, copied from the corresponding `DocPage`.
    pub section: String,
}
37
/// Entry in the full-text search index, serialised to JSON for client-side search.
///
/// Built by `DocLoader::search_index`, one per index entry that has a
/// matching page.
#[derive(Clone, Debug, serde::Serialize)]
pub struct DocSearchEntry {
    /// Slug of the page this entry describes.
    pub slug: String,
    /// Title of the page.
    pub title: String,
    /// Display name of the page's section.
    pub section: String,
    /// Page HTML reduced to plain text by `strip_html_tags`.
    pub body_text: String,
}
46
/// In-memory store of rendered documentation pages, built once at startup.
#[derive(Clone, Debug)]
pub struct DocLoader {
    /// Rendered pages keyed by slug.
    pages: HashMap<String, DocPage>,
    /// Index entries in load order (section order, then file name order).
    index: Vec<DocIndexEntry>,
}
53
54	impl DocLoader {
55	/// Load all `.md` files from `base_path`, rendering them into HTML.
56	///
57	/// Expects subdirectories matching the configured sections.
58	pub fn load(base_path: &Path, config: &DocLoaderConfig) -> Self {
59	let mut pages = HashMap::new();
60	let mut index = Vec::new();
61
62	for (dir_name, section_display) in &config.sections {
63	let section_path = base_path.join(dir_name);
64	if !section_path.is_dir() {
65	continue;
66	}
67
68	let read_dir = match std::fs::read_dir(&section_path) {
69	Ok(rd) => rd,
70	Err(e) => {
71	tracing::warn!(path = %section_path.display(), error = %e, "Failed to read docs section directory");
72	continue;
73	}
74	};
75
76	let mut entries: Vec<_> = read_dir
77	.filter_map(\|e\| e.ok())
78	.filter(\|e\| {
79	e.path()
80	.extension()
81	.map(\|ext\| ext == "md")
82	.unwrap_or(false)
83	})
84	.collect();
85
86	entries.sort_by_key(\|e\| e.file_name());
87
88	for entry in entries {
89	let path = entry.path();
90	let slug = path
91	.file_stem()
92	.and_then(\|s\| s.to_str())
93	.unwrap_or_default()
94	.to_string();
95
96	let raw_md = match std::fs::read_to_string(&path) {
97	Ok(content) => content,
98	Err(_) => continue,
99	};
100
101	let title =
102	crate::text::extract_title(&raw_md).unwrap_or_else(\|\| slug.clone());
103	let rewritten_md = rewrite_links(
104	&raw_md,
105	&config.link_prefix,
106	config.unpublished_pattern.as_deref(),
107	);
108	let md_without_title = crate::text::strip_first_heading(&rewritten_md);
109	let html_content = crate::render_permissive(&md_without_title);
110	#[cfg(feature = "directives")]
111	let html_content = crate::directives::post_process_directives(&html_content);
112
113	let page = DocPage {
114	title,
115	slug,
116	section: section_display.clone(),
117	html_content,
118	};
119
120	index.push(DocIndexEntry {
121	title: page.title.clone(),
122	slug: page.slug.clone(),
123	section: page.section.clone(),
124	});
125
126	let slug_key = page.slug.clone();
127	pages.insert(slug_key, page);
128	}
129	}
130
131	DocLoader { pages, index }
132	}
133
134	/// Look up a rendered page by slug.
135	pub fn get(&self, slug: &str) -> Option<&DocPage> {
136	self.pages.get(slug)
137	}
138
139	/// Get the full ordered index.
140	pub fn index(&self) -> &[DocIndexEntry] {
141	&self.index
142	}
143
144	/// Build a search index with HTML stripped to plain text.
145	pub fn search_index(&self) -> Vec<DocSearchEntry> {
146	self.index
147	.iter()
148	.filter_map(\|entry\| {
149	let page = self.pages.get(&entry.slug)?;
150	Some(DocSearchEntry {
151	slug: entry.slug.clone(),
152	title: entry.title.clone(),
153	section: entry.section.clone(),
154	body_text: strip_html_tags(&page.html_content),
155	})
156	})
157	.collect()
158	}
159	}
160
/// Strip HTML tags from a string, returning plain text.
///
/// Everything between `<` and `>` is dropped and replaced by a single space
/// so words from adjacent elements do not run together. Runs of whitespace
/// are then collapsed to one space and the ends are trimmed. Finally the
/// common HTML entities (`&lt;`, `&gt;`, `&quot;`, `&#x27;`, `&#39;`,
/// `&amp;`) are decoded so search indexes match plain-text queries.
///
/// This is a character scanner, not an HTML parser: a stray `<` or `>` in
/// the input is treated as a tag delimiter.
fn strip_html_tags(html: &str) -> String {
    let mut out = String::with_capacity(html.len());
    let mut in_tag = false;
    for ch in html.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => {
                in_tag = false;
                // Add a space after closing tags to separate words.
                if !out.ends_with(' ') {
                    out.push(' ');
                }
            }
            _ if !in_tag => out.push(ch),
            _ => {}
        }
    }

    // Collapse runs of whitespace (this also trims both ends).
    let collapsed = out.split_whitespace().collect::<Vec<_>>().join(" ");

    // Entities are decoded only after tags are gone, so a decoded `<` is
    // never mistaken for markup. `&amp;` must be decoded last: decoding it
    // first would turn the escaped text `&amp;lt;` into `&lt;` and then
    // into `<`, decoding it twice.
    collapsed
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#x27;", "'")
        .replace("&#39;", "'")
        .replace("&amp;", "&")
}
191
192	/// Rewrite relative `.md` links to the configured prefix.
193	fn rewrite_links(markdown: &str, link_prefix: &str, unpublished_pattern: Option<&str>) -> String {
194	LINK_RE
195	.replace_all(markdown, \|caps: &regex::Captures\| {
196	let text = &caps[1];
197	let url = &caps[2];
198
199	// Preserve absolute URLs, mailto, and internal routes.
200	if url.starts_with("http://")
201	\|\| url.starts_with("https://")
202	\|\| url.starts_with("mailto:")
203	\|\| url.starts_with('/')
204	{
205	return caps[0].to_string();
206	}
207
208	// Unpublished docs: strip link, keep text.
209	if let Some(pattern) = unpublished_pattern {
210	if url.contains(pattern) {
211	return text.to_string();
212	}
213	}
214
215	// Only rewrite links containing .md
216	if !url.contains(".md") {
217	return caps[0].to_string();
218	}
219
220	// Split off any #anchor.
221	let (path_part, anchor): (&str, Option<&str>) = match url.split_once('#') {
222	Some((p, a)) => (p, Some(a)),
223	None => (url, None),
224	};
225
226	// Extract slug from filename: ../support/faq.md -> faq
227	let filename = path_part
228	.rsplit('/')
229	.next()
230	.unwrap_or(path_part)
231	.trim_end_matches(".md");
232
233	let mut new_url = format!("{link_prefix}/{filename}");
234	if let Some(anchor) = anchor {
235	new_url.push('#');
236	new_url.push_str(anchor);
237	}
238
239	format!("[{text}]({new_url})")
240	})
241	.to_string()
242	}
243
// Unit tests for link rewriting and HTML-to-text stripping.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn rewrite_same_section_link() {
        let md = "See [SLA](./guarantees.md) for details.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See [SLA](/docs/guarantees) for details.");
    }

    #[test]
    fn rewrite_cross_section_link() {
        let md = "Check [FAQ](../support/faq.md) for more.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "Check [FAQ](/docs/faq) for more.");
    }

    #[test]
    fn rewrite_unpublished_link_becomes_plain_text() {
        let md = "See [Content Moderation](../../unpublished/legal/moderation.md) for details.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See Content Moderation for details.");
    }

    #[test]
    fn rewrite_preserves_absolute_urls() {
        let md = "Visit [our site](https://example.com) today.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, md);
    }

    #[test]
    fn rewrite_preserves_mailto() {
        let md = "Email [us](mailto:test@example.com)";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, md);
    }

    #[test]
    fn rewrite_preserves_internal_routes() {
        let md = "Go to [pricing](/pricing) page.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, md);
    }

    #[test]
    fn rewrite_link_with_anchor() {
        let md = "See [section](./faq.md#billing).";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See [section](/docs/faq#billing).");
    }

    #[test]
    fn rewrite_public_cross_ref() {
        let md = "See [Acceptable Use](../../public/legal/acceptable-use.md).";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See [Acceptable Use](/docs/acceptable-use).");
    }

    #[test]
    fn rewrite_custom_prefix() {
        let md = "See [FAQ](./faq.md) here.";
        let result = rewrite_links(md, "/help", None);
        assert_eq!(result, "See [FAQ](/help/faq) here.");
    }

    #[test]
    fn rewrite_no_unpublished_pattern() {
        let md = "See [doc](../../unpublished/foo.md).";
        let result = rewrite_links(md, "/docs", None);
        // Without the pattern, it just rewrites normally
        assert_eq!(result, "See [doc](/docs/foo).");
    }

    #[test]
    fn rewrite_non_md_link_preserved() {
        let md = "See [image](./photo.png) here.";
        let result = rewrite_links(md, "/docs", None);
        assert_eq!(result, md);
    }

    #[test]
    fn strip_html_tags_removes_tags() {
        let html = "<p>Hello <strong>world</strong></p>";
        assert_eq!(strip_html_tags(html), "Hello world");
    }

    #[test]
    fn strip_html_tags_empty_input() {
        assert_eq!(strip_html_tags(""), "");
    }

    #[test]
    fn strip_html_tags_decodes_entities() {
        // Inputs must contain the *encoded* entities; the expected values are
        // their decoded plain-text forms.
        let html = "<p>Price: $10 &amp; free</p>";
        assert_eq!(strip_html_tags(html), "Price: $10 & free");

        let html2 = "<p>a &lt; b &gt; c</p>";
        assert_eq!(strip_html_tags(html2), "a < b > c");

        let html3 = "<p>&quot;hello&quot; &amp; &#x27;world&#39;</p>";
        assert_eq!(strip_html_tags(html3), "\"hello\" & 'world'");
    }

    #[test]
    fn strip_html_tags_nested_tags() {
        let html = "<div><p>A <em>nested <strong>deep</strong></em> tag</p></div>";
        assert_eq!(strip_html_tags(html), "A nested deep tag");
    }
}
355