| 1 |
use std::collections::HashMap; |
| 2 |
use std::path::Path; |
| 3 |
use std::sync::LazyLock; |
| 4 |
|
| 5 |
use regex::Regex; |
| 6 |
|
| 7 |
static LINK_RE: LazyLock<Regex> = LazyLock::new(|| { |
| 8 |
Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").expect("valid regex") |
| 9 |
}); |
| 10 |
|
| 11 |
|
| 12 |
/// Settings that control how [`DocLoader::load`] discovers and rewrites pages.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct DocLoaderConfig {
    /// Sections to load, as `(directory name, display name)` pairs.
    ///
    /// The directory name is joined onto the loader's base path; the display
    /// name is copied into the `section` field of every page found there.
    /// Sections are processed in the order given.
    pub sections: Vec<(String, String)>,

    /// Prefix for rewritten links: a relative link to a Markdown file is
    /// turned into `{link_prefix}/{file stem}`.
    pub link_prefix: String,

    /// Optional substring identifying links to unpublished documents.
    ///
    /// A relative link whose target contains this substring is replaced by
    /// its plain link text. `None` disables the check.
    pub unpublished_pattern: Option<String>,
}
| 20 |
|
| 21 |
|
| 22 |
/// A single documentation page after rendering.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct DocPage {
    /// Page title.
    pub title: String,
    /// Identifier the page is looked up by.
    pub slug: String,
    /// Display name of the section the page belongs to.
    pub section: String,
    /// Rendered HTML body of the page.
    pub html_content: String,
}
| 29 |
|
| 30 |
|
| 31 |
/// Lightweight listing entry for a page: everything except the rendered body.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct DocIndexEntry {
    /// Page title.
    pub title: String,
    /// Identifier the page is looked up by.
    pub slug: String,
    /// Display name of the section the page belongs to.
    pub section: String,
}
| 37 |
|
| 38 |
|
| 39 |
#[derive(Clone, Debug, serde::Serialize)] |
| 40 |
pub struct DocSearchEntry { |
| 41 |
pub slug: String, |
| 42 |
pub title: String, |
| 43 |
pub section: String, |
| 44 |
pub body_text: String, |
| 45 |
} |
| 46 |
|
| 47 |
|
| 48 |
#[derive(Clone, Debug)] |
| 49 |
pub struct DocLoader { |
| 50 |
pages: HashMap<String, DocPage>, |
| 51 |
index: Vec<DocIndexEntry>, |
| 52 |
} |
| 53 |
|
| 54 |
impl DocLoader { |
| 55 |
|
| 56 |
|
| 57 |
|
| 58 |
pub fn load(base_path: &Path, config: &DocLoaderConfig) -> Self { |
| 59 |
let mut pages = HashMap::new(); |
| 60 |
let mut index = Vec::new(); |
| 61 |
|
| 62 |
for (dir_name, section_display) in &config.sections { |
| 63 |
let section_path = base_path.join(dir_name); |
| 64 |
if !section_path.is_dir() { |
| 65 |
continue; |
| 66 |
} |
| 67 |
|
| 68 |
let read_dir = match std::fs::read_dir(§ion_path) { |
| 69 |
Ok(rd) => rd, |
| 70 |
Err(e) => { |
| 71 |
tracing::warn!(path = %section_path.display(), error = %e, "Failed to read docs section directory"); |
| 72 |
continue; |
| 73 |
} |
| 74 |
}; |
| 75 |
|
| 76 |
let mut entries: Vec<_> = read_dir |
| 77 |
.filter_map(|e| e.ok()) |
| 78 |
.filter(|e| { |
| 79 |
e.path() |
| 80 |
.extension() |
| 81 |
.map(|ext| ext == "md") |
| 82 |
.unwrap_or(false) |
| 83 |
}) |
| 84 |
.collect(); |
| 85 |
|
| 86 |
entries.sort_by_key(|e| e.file_name()); |
| 87 |
|
| 88 |
for entry in entries { |
| 89 |
let path = entry.path(); |
| 90 |
let slug = path |
| 91 |
.file_stem() |
| 92 |
.and_then(|s| s.to_str()) |
| 93 |
.unwrap_or_default() |
| 94 |
.to_string(); |
| 95 |
|
| 96 |
let raw_md = match std::fs::read_to_string(&path) { |
| 97 |
Ok(content) => content, |
| 98 |
Err(_) => continue, |
| 99 |
}; |
| 100 |
|
| 101 |
let title = |
| 102 |
crate::text::extract_title(&raw_md).unwrap_or_else(|| slug.clone()); |
| 103 |
let rewritten_md = rewrite_links( |
| 104 |
&raw_md, |
| 105 |
&config.link_prefix, |
| 106 |
config.unpublished_pattern.as_deref(), |
| 107 |
); |
| 108 |
let md_without_title = crate::text::strip_first_heading(&rewritten_md); |
| 109 |
let html_content = crate::render_permissive(&md_without_title); |
| 110 |
#[cfg(feature = "directives")] |
| 111 |
let html_content = crate::directives::post_process_directives(&html_content); |
| 112 |
|
| 113 |
let page = DocPage { |
| 114 |
title, |
| 115 |
slug, |
| 116 |
section: section_display.clone(), |
| 117 |
html_content, |
| 118 |
}; |
| 119 |
|
| 120 |
index.push(DocIndexEntry { |
| 121 |
title: page.title.clone(), |
| 122 |
slug: page.slug.clone(), |
| 123 |
section: page.section.clone(), |
| 124 |
}); |
| 125 |
|
| 126 |
let slug_key = page.slug.clone(); |
| 127 |
pages.insert(slug_key, page); |
| 128 |
} |
| 129 |
} |
| 130 |
|
| 131 |
DocLoader { pages, index } |
| 132 |
} |
| 133 |
|
| 134 |
|
| 135 |
pub fn get(&self, slug: &str) -> Option<&DocPage> { |
| 136 |
self.pages.get(slug) |
| 137 |
} |
| 138 |
|
| 139 |
|
| 140 |
pub fn index(&self) -> &[DocIndexEntry] { |
| 141 |
&self.index |
| 142 |
} |
| 143 |
|
| 144 |
|
| 145 |
pub fn search_index(&self) -> Vec<DocSearchEntry> { |
| 146 |
self.index |
| 147 |
.iter() |
| 148 |
.filter_map(|entry| { |
| 149 |
let page = self.pages.get(&entry.slug)?; |
| 150 |
Some(DocSearchEntry { |
| 151 |
slug: entry.slug.clone(), |
| 152 |
title: entry.title.clone(), |
| 153 |
section: entry.section.clone(), |
| 154 |
body_text: strip_html_tags(&page.html_content), |
| 155 |
}) |
| 156 |
}) |
| 157 |
.collect() |
| 158 |
} |
| 159 |
} |
| 160 |
|
| 161 |
|
| 162 |
|
| 163 |
/// Converts rendered HTML into plain text suitable for search indexing.
///
/// Everything between `<` and `>` is dropped, each closing `>` contributes a
/// separating space (so text from adjacent elements does not run together),
/// runs of whitespace are collapsed to single spaces with the ends trimmed,
/// and finally the common character entities (`&lt;`, `&gt;`, `&quot;`,
/// `&#x27;`, `&#39;`, `&apos;`, `&amp;`) are decoded.
///
/// This is a lightweight scanner, not an HTML parser: it does not understand
/// comments, `<script>` bodies, or a `>` inside an attribute value.
fn strip_html_tags(html: &str) -> String {
    let mut text = String::with_capacity(html.len());
    let mut in_tag = false;
    for ch in html.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => {
                in_tag = false;
                // Keep words from neighbouring elements apart.
                if !text.ends_with(' ') {
                    text.push(' ');
                }
            }
            _ if !in_tag => text.push(ch),
            _ => {}
        }
    }

    let collapsed = text.split_whitespace().collect::<Vec<_>>().join(" ");

    // Entities are decoded only after tag stripping, so an escaped `&lt;b&gt;`
    // in the text is not mistaken for markup. `&amp;` must be decoded LAST:
    // decoding it first would turn an escaped entity such as `&amp;lt;` into
    // `&lt;` and then, wrongly, into `<`.
    collapsed
        .replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#x27;", "'")
        .replace("&#39;", "'")
        .replace("&apos;", "'")
        .replace("&amp;", "&")
}
| 191 |
|
| 192 |
|
| 193 |
fn rewrite_links(markdown: &str, link_prefix: &str, unpublished_pattern: Option<&str>) -> String { |
| 194 |
LINK_RE |
| 195 |
.replace_all(markdown, |caps: ®ex::Captures| { |
| 196 |
let text = &caps[1]; |
| 197 |
let url = &caps[2]; |
| 198 |
|
| 199 |
|
| 200 |
if url.starts_with("http://") |
| 201 |
|| url.starts_with("https://") |
| 202 |
|| url.starts_with("mailto:") |
| 203 |
|| url.starts_with('/') |
| 204 |
{ |
| 205 |
return caps[0].to_string(); |
| 206 |
} |
| 207 |
|
| 208 |
|
| 209 |
if let Some(pattern) = unpublished_pattern { |
| 210 |
if url.contains(pattern) { |
| 211 |
return text.to_string(); |
| 212 |
} |
| 213 |
} |
| 214 |
|
| 215 |
|
| 216 |
if !url.contains(".md") { |
| 217 |
return caps[0].to_string(); |
| 218 |
} |
| 219 |
|
| 220 |
|
| 221 |
let (path_part, anchor): (&str, Option<&str>) = match url.split_once('#') { |
| 222 |
Some((p, a)) => (p, Some(a)), |
| 223 |
None => (url, None), |
| 224 |
}; |
| 225 |
|
| 226 |
|
| 227 |
let filename = path_part |
| 228 |
.rsplit('/') |
| 229 |
.next() |
| 230 |
.unwrap_or(path_part) |
| 231 |
.trim_end_matches(".md"); |
| 232 |
|
| 233 |
let mut new_url = format!("{link_prefix}/{filename}"); |
| 234 |
if let Some(anchor) = anchor { |
| 235 |
new_url.push('#'); |
| 236 |
new_url.push_str(anchor); |
| 237 |
} |
| 238 |
|
| 239 |
format!("[{text}]({new_url})") |
| 240 |
}) |
| 241 |
.to_string() |
| 242 |
} |
| 243 |
|
| 244 |
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn rewrite_same_section_link() {
        let md = "See [SLA](./guarantees.md) for details.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See [SLA](/docs/guarantees) for details.");
    }

    #[test]
    fn rewrite_cross_section_link() {
        let md = "Check [FAQ](../support/faq.md) for more.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "Check [FAQ](/docs/faq) for more.");
    }

    #[test]
    fn rewrite_unpublished_link_becomes_plain_text() {
        let md = "See [Content Moderation](../../unpublished/legal/moderation.md) for details.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See Content Moderation for details.");
    }

    #[test]
    fn rewrite_preserves_absolute_urls() {
        let md = "Visit [our site](https://example.com) today.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, md);
    }

    #[test]
    fn rewrite_preserves_mailto() {
        let md = "Email [us](mailto:test@example.com)";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, md);
    }

    #[test]
    fn rewrite_preserves_internal_routes() {
        let md = "Go to [pricing](/pricing) page.";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, md);
    }

    #[test]
    fn rewrite_link_with_anchor() {
        let md = "See [section](./faq.md#billing).";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See [section](/docs/faq#billing).");
    }

    #[test]
    fn rewrite_public_cross_ref() {
        let md = "See [Acceptable Use](../../public/legal/acceptable-use.md).";
        let result = rewrite_links(md, "/docs", Some("unpublished/"));
        assert_eq!(result, "See [Acceptable Use](/docs/acceptable-use).");
    }

    #[test]
    fn rewrite_custom_prefix() {
        let md = "See [FAQ](./faq.md) here.";
        let result = rewrite_links(md, "/help", None);
        assert_eq!(result, "See [FAQ](/help/faq) here.");
    }

    #[test]
    fn rewrite_no_unpublished_pattern() {
        let md = "See [doc](../../unpublished/foo.md).";
        let result = rewrite_links(md, "/docs", None);
        // Without a pattern, the link is rewritten like any other `.md` link.
        assert_eq!(result, "See [doc](/docs/foo).");
    }

    #[test]
    fn rewrite_non_md_link_preserved() {
        let md = "See [image](./photo.png) here.";
        let result = rewrite_links(md, "/docs", None);
        assert_eq!(result, md);
    }

    #[test]
    fn strip_html_tags_removes_tags() {
        let html = "<p>Hello <strong>world</strong></p>";
        assert_eq!(strip_html_tags(html), "Hello world");
    }

    #[test]
    fn strip_html_tags_empty_input() {
        assert_eq!(strip_html_tags(""), "");
    }

    #[test]
    fn strip_html_tags_decodes_entities() {
        let html = "<p>Price: $10 &amp; free</p>";
        assert_eq!(strip_html_tags(html), "Price: $10 & free");

        let html2 = "<p>a &lt; b &gt; c</p>";
        assert_eq!(strip_html_tags(html2), "a < b > c");

        let html3 = "<p>&quot;hello&quot; &amp; &#x27;world&#x27;</p>";
        assert_eq!(strip_html_tags(html3), "\"hello\" & 'world'");
    }

    #[test]
    fn strip_html_tags_nested_tags() {
        let html = "<div><p>A <em>nested <strong>deep</strong></em> tag</p></div>";
        assert_eq!(strip_html_tags(html), "A nested deep tag");
    }
}
| 355 |
|