Skip to main content

max / docengine

Code review remediation: custom directives, escape module, media URLs, tests Extensible [!TYPE] alerts, [!TABS] code tabs, HTML escape extraction, media URL rewriting. 141 tests. Grade A. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Author: Max J. <87768334+MaxJMath@users.noreply.github.com> · 2026-04-12 23:39 UTC
Commit: f919b8cf0596b8b48e6ed344dd92062dc1368245
Parent: 7367851
14 files changed, +1203 insertions, -38 deletions
M Cargo.lock +38
@@ -92,6 +92,7 @@ dependencies = [
92 92 "regex-lite",
93 93 "serde",
94 94 "toml",
95 + "tracing",
95 96 "uuid",
96 97 ]
97 98
@@ -498,6 +499,12 @@ dependencies = [
498 499 ]
499 500
500 501 [[package]]
502 + name = "pin-project-lite"
503 + version = "0.2.17"
504 + source = "registry+https://github.com/rust-lang/crates.io-index"
505 + checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
506 +
507 + [[package]]
501 508 name = "potential_utf"
502 509 version = "0.1.4"
503 510 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -822,6 +829,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
822 829 checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801"
823 830
824 831 [[package]]
832 + name = "tracing"
833 + version = "0.1.44"
834 + source = "registry+https://github.com/rust-lang/crates.io-index"
835 + checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100"
836 + dependencies = [
837 + "pin-project-lite",
838 + "tracing-attributes",
839 + "tracing-core",
840 + ]
841 +
842 + [[package]]
843 + name = "tracing-attributes"
844 + version = "0.1.31"
845 + source = "registry+https://github.com/rust-lang/crates.io-index"
846 + checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da"
847 + dependencies = [
848 + "proc-macro2",
849 + "quote",
850 + "syn",
851 + ]
852 +
853 + [[package]]
854 + name = "tracing-core"
855 + version = "0.1.36"
856 + source = "registry+https://github.com/rust-lang/crates.io-index"
857 + checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a"
858 + dependencies = [
859 + "once_cell",
860 + ]
861 +
862 + [[package]]
825 863 name = "unicase"
826 864 version = "2.9.0"
827 865 source = "registry+https://github.com/rust-lang/crates.io-index"
M Cargo.toml +6 -3
@@ -5,11 +5,13 @@ edition = "2021"
5 5
6 6 [features]
7 7 default = []
8 - doc-loader = ["dep:regex"]
8 + doc-loader = ["dep:regex", "dep:tracing"]
9 + directives = ["dep:regex-lite"]
9 10 mentions = ["dep:regex-lite"]
10 11 quotes = ["dep:regex-lite", "dep:uuid"]
11 - frontmatter = ["dep:toml"]
12 - full = ["doc-loader", "mentions", "quotes", "frontmatter"]
12 + frontmatter = ["dep:toml", "dep:tracing"]
13 + media-urls = ["dep:regex-lite"]
14 + full = ["doc-loader", "directives", "mentions", "quotes", "frontmatter", "media-urls"]
13 15
14 16 [dependencies]
15 17 pulldown-cmark = "0.12"
@@ -20,3 +22,4 @@ regex = { version = "1", optional = true }
20 22 regex-lite = { version = "0.1", optional = true }
21 23 uuid = { version = "1", features = ["serde", "v4"], optional = true }
22 24 toml = { version = "0.8", optional = true }
25 + tracing = { version = "0.1", optional = true }
A README.md +119
@@ -0,0 +1,119 @@
1 + # DocEngine
2 +
3 + Configurable markdown-to-HTML rendering library with sanitization presets. Built on pulldown-cmark (GFM) and ammonia.
4 +
5 + Used by MNW (site docs, blog posts, user-generated content), Multithreaded (forum posts), and the desktop apps (descriptions, notes).
6 +
7 + ## Presets
8 +
9 + Four rendering presets, each with different security/feature tradeoffs:
10 +
11 + | Preset | Use case | Tables | Images | Raw HTML | Dangerous scheme filter | Sanitization |
12 + |--------|----------|:------:|:------:|:--------:|:-----------------------:|--------------|
13 + | **Permissive** | Docs, blog posts (trusted) | Y | Y | Y | N | Default ammonia |
14 + | **Standard** | App text fields (descriptions) | Y | N | Y | N | Default ammonia |
15 + | **Strict** | User-generated content (forums) | N | N | N | Y | nofollow on links |
16 + | **Sanitize-only** | External HTML (RSS feeds) | -- | -- | -- | -- | Default ammonia, no markdown parsing |
17 +
18 + ```rust
19 + use docengine::{render_permissive, render_standard, render_strict, sanitize_html};
20 +
21 + // Convenience functions
22 + let html = render_permissive("# Hello\n\n**Bold** text");
23 + let html = render_standard("A description with [link](https://example.com)");
24 + let html = render_strict("User post with @mentions and `code`");
25 + let html = sanitize_html("<p>Pre-rendered</p><script>stripped</script>");
26 +
27 + // Builder pattern for custom configurations
28 + use docengine::{Renderer, SanitizePreset};
29 +
30 + let html = Renderer::permissive()
31 + .with_strip_images(true) // override: strip images even in permissive
32 + .with_footnotes(false)
33 + .render("# Custom config");
34 +
35 + // Render with metadata (word count, reading time)
36 + let result = Renderer::standard().render_with_meta("Some article text...");
37 + println!("{} words, ~{} min read", result.word_count, result.reading_time_minutes);
38 + ```
39 +
40 + ## Feature Flags
41 +
42 + All optional features are off by default. Enable what you need:
43 +
44 + | Flag | Dependencies | Provides |
45 + |------|-------------|----------|
46 + | `doc-loader` | regex | `DocLoader` -- load a directory of `.md` files into an in-memory page store |
47 + | `directives` | regex-lite | `post_process_directives` -- `[!NOTE]`/`[!TIP]`/`[!TABS]` blockquote alerts and code tabs |
48 + | `frontmatter` | toml | `parse_frontmatter` -- extract TOML frontmatter delimited by `+++` |
49 + | `mentions` | regex-lite | `extract_mentions`, `resolve_mentions` -- `@username` parsing and linking |
50 + | `quotes` | regex-lite, uuid | `post_process_quotes` -- replace `[quote:POST_ID:HASH]` markers with author attribution |
51 + | `media-urls` | regex-lite | `rewrite_media_paths`, `img_to_video` -- CDN path rewriting and video tag conversion |
52 + | `full` | all of the above | Enable everything |
53 +
54 + ```toml
55 + # In Cargo.toml
56 + docengine = { path = "../Shared/docengine" } # Core only
57 + docengine = { path = "../Shared/docengine", features = ["full"] } # Everything
58 + ```
59 +
60 + ## Core API
61 +
62 + ### Types
63 +
64 + - **`Renderer`** -- configurable markdown renderer with builder pattern
65 + - **`RenderResult`** -- rendered HTML plus `word_count` and `reading_time_minutes`
66 + - **`SanitizePreset`** -- `Permissive`, `Standard`, `Strict`, `Minimal`
67 + - **`TocEntry`** -- heading level, text, and anchor for table of contents
68 +
69 + ### Functions
70 +
71 + | Function | Description |
72 + |----------|-------------|
73 + | `render_permissive(md)` | Render with full GFM features |
74 + | `render_standard(md)` | Render without images |
75 + | `render_strict(md)` | Render with all restrictions (UGC-safe) |
76 + | `sanitize_html(html)` | Clean pre-rendered HTML without markdown parsing |
77 + | `word_count(text)` | Count words in raw text |
78 + | `reading_time_minutes(wc)` | Estimate reading time (200 wpm) |
79 + | `extract_title(md)` | Pull the first `# Heading` from markdown |
80 + | `strip_first_heading(md)` | Remove the first `# Heading` (for template-rendered titles) |
81 + | `extract_toc(md)` | Build a `Vec<TocEntry>` from all headings |
82 + | `render_toc_html(entries)` | Render TOC entries as a `<nav class="toc">` HTML list |
83 +
84 + ### Feature-gated
85 +
86 + | Function / Type | Feature | Description |
87 + |-----------------|---------|-------------|
88 + | `DocLoader::load(path, config)` | `doc-loader` | Load `.md` files from disk, render to HTML, build searchable index |
89 + | `DocPage`, `DocIndexEntry` | `doc-loader` | Page and index entry types |
90 + | `post_process_directives(html)` | `directives` | Convert `[!NOTE]`/`[!TIP]`/etc. blockquotes to alert divs, `[!TABS]` to tabbed code blocks |
91 + | `parse_frontmatter(input)` | `frontmatter` | Parse `+++`-delimited TOML frontmatter |
92 + | `Frontmatter` | `frontmatter` | Struct with `title`, `date`, `tags`, `section`, `draft`, `extra` |
93 + | `extract_mentions(md)` | `mentions` | Find unique `@username` mentions (skips code blocks) |
94 + | `resolve_mentions(md, valid, template)` | `mentions` | Replace `@user` with `[@user](/path/to/user)` for known usernames |
95 + | `post_process_quotes(html, authors)` | `quotes` | Replace `[quote:UUID:HASH]` with clickable attribution |
96 + | `rewrite_media_paths(md, base, user)` | `media-urls` | Rewrite relative image paths to absolute CDN URLs |
97 + | `img_to_video(html)` | `media-urls` | Convert `<img>` tags pointing to video files into `<video>` elements |
98 +
99 + ## Consumers
100 +
101 + | Project | Features used | Preset |
102 + |---------|--------------|--------|
103 + | MNW | `doc-loader`, `directives`, `frontmatter`, `media-urls` | Permissive (docs/blog), Standard (descriptions) |
104 + | Multithreaded | `mentions`, `quotes` | Strict (forum posts) |
105 + | GoingsOn | core only | Standard (notes, descriptions) |
106 + | Balanced Breakfast | core only | Sanitize-only (RSS feed content) |
107 +
108 + ## Security
109 +
110 + All presets sanitize output through ammonia. The strict preset additionally:
111 + - Strips all raw HTML and images at the parser level (before ammonia)
112 + - Replaces `javascript:`, `data:`, `vbscript:` URLs with `#`
113 + - Adds `rel="noopener noreferrer nofollow"` to all links
114 +
115 + Zero unsafe code.
116 +
117 + ## License
118 +
119 + PolyForm Noncommercial 1.0.0
@@ -0,0 +1,80 @@
1 + # DocEngine Architecture
2 +
3 + ## Overview
4 +
5 + DocEngine is a markdown rendering library that wraps pulldown-cmark (parsing) and ammonia (sanitization) behind a preset system. Each preset configures which markdown features are enabled and how aggressively the output is sanitized.
6 +
7 + ## Module Map
8 +
9 + ```
10 + src/
11 + lib.rs Crate root, re-exports, convenience functions
12 + render.rs Renderer struct (builder pattern, 4 presets, render/render_with_meta)
13 + sanitize.rs SanitizePreset enum (Permissive, Standard, Strict, Minimal)
14 + text.rs Text utilities (word_count, reading_time, extract_title, strip_first_heading)
15 + toc.rs Table of contents extraction and HTML rendering
16 + escape.rs HTML entity escaping for safe string interpolation
17 + code_spans.rs Code span/block byte range detection (used by mentions to skip code)
18 + directives.rs [directives] Alert/tabs blockquote post-processing
19 + doc_loader.rs [doc-loader] Load .md files from disk into in-memory page store
20 + frontmatter.rs [frontmatter] Parse +++delimited TOML frontmatter
21 + media_urls.rs [media-urls] CDN path rewriting for images, img-to-video conversion
22 + mentions.rs [mentions] @username extraction and resolution
23 + quotes.rs [quotes] [quote:UUID:HASH] post-processing for forum attribution
24 + ```
25 +
26 + ## Design Decisions
27 +
28 + ### Presets over configuration
29 +
30 + Rather than exposing every pulldown-cmark option, DocEngine provides named presets that bundle markdown features with sanitization levels. This prevents misconfiguration -- you can't accidentally enable raw HTML without appropriate sanitization.
31 +
32 + Custom configurations are still possible via the builder pattern (`Renderer::permissive().with_strip_images(true)`).
33 +
34 + ### Two-phase rendering
35 +
36 + Rendering happens in two phases:
37 + 1. **pulldown-cmark** parses markdown to HTML events, with optional filtering (strip images, strip raw HTML, neutralize dangerous URL schemes)
38 + 2. **ammonia** sanitizes the resulting HTML string
39 +
40 + This means even the permissive preset strips `<script>` tags -- ammonia always runs.
41 +
42 + Post-processing steps (directives, mentions, quotes, media URLs) are applied after sanitization by consumers, not built into the render pipeline.
43 +
44 + ### Feature-gated modules
45 +
46 + DocEngine has zero required dependencies beyond pulldown-cmark, ammonia, and serde. Consumers that only need rendering don't pull in regex, toml, or uuid. The `full` feature enables everything.
47 +
48 + The `regex` vs `regex-lite` split is intentional -- doc-loader's link rewriting needs the full regex engine while simpler patterns in directives, mentions, quotes, and media-urls use the lighter variant.
49 +
50 + ### DocLoader loads once at startup
51 +
52 + `DocLoader::load()` reads all `.md` files from disk, renders them to HTML, and stores them in a `HashMap<String, DocPage>`. This happens once at application boot (MNW calls it during startup). Pages are served from memory with no disk I/O on request.
53 +
54 + Link rewriting converts relative `.md` references to the configured URL prefix (e.g., `./faq.md` becomes `/docs/faq`). Links to unpublished docs are stripped to plain text.
55 +
56 + ### Mention resolution skips code
57 +
58 + `extract_mentions` and `resolve_mentions` detect inline code (backticks) and fenced code blocks, skipping any @mentions inside them. This prevents false positives from code examples.
59 +
60 + ### Directive post-processing
61 +
62 + Directives (`[!NOTE]`, `[!TIP]`, `[!TABS]`, etc.) are implemented as HTML post-processing rather than markdown parsing extensions. This keeps the core render pipeline simple and makes directives composable with any preset.
63 +
64 + ## Consumers
65 +
66 + | Consumer | Features | How it's used |
67 + |----------|----------|---------------|
68 + | MNW | doc-loader, directives, frontmatter, media-urls | Site docs loaded at boot, blog posts with frontmatter, user descriptions (standard), item markdown (standard), CDN image rewriting |
69 + | Multithreaded | mentions, quotes | Forum posts (strict), @username linking, quote attribution |
70 + | GoingsOn | core | Task/event descriptions (standard) |
71 + | Balanced Breakfast | core | RSS feed content (sanitize_only) |
72 + | audiofiles | core | Sample descriptions (standard) |
73 +
74 + ## Key Paths
75 +
76 + - `src/render.rs` -- the core rendering logic
77 + - `src/sanitize.rs` -- ammonia preset configurations
78 + - `src/directives.rs` -- alert and code tab processing
79 + - `src/doc_loader.rs` -- document loading and link rewriting
80 + - `src/media_urls.rs` -- CDN path rewriting
@@ -0,0 +1,114 @@
1 + # DocEngine — Code Review
2 +
3 + **Date:** 2026-04-12
4 + **Version:** 0.3.0
5 + **Reviewer:** Claude (Opus 4.6)
6 + **Scope:** Full codebase review — all Rust source, Cargo.toml, README, docs
7 +
8 + ## Summary
9 +
10 + DocEngine is a markdown-to-HTML rendering library (~2,550 source LOC across 13 modules) built on pulldown-cmark and ammonia. Preset-based configuration system bundles markdown features with sanitization levels. 6 cargo feature gates keep the dependency tree minimal. Used by 5 consumers across the ecosystem (MNW, Multithreaded, GO, BB, AF). 141 tests, 0 clippy warnings, 0 unsafe code.
11 +
12 + **Overall: A** — clean, well-tested, security-conscious. No bugs found. Findings are documentation gaps and minor observations.
13 +
14 + ---
15 +
16 + ## Findings
17 +
18 + ### [MEDIUM] README and architecture.md missing `directives` and `media-urls` features
19 +
20 + The README feature flag table (lines 42-51) lists `doc-loader`, `frontmatter`, `mentions`, and `quotes` but omits `directives` and `media-urls`. These features are defined in Cargo.toml, included in `full`, and used by MNW. The `full` description says "all of the above" but the unlisted features make this misleading.
21 +
22 + Similarly, `architecture.md` module map (lines 9-21) does not include `directives.rs`, `media_urls.rs`, or `escape.rs`. The consumers table also doesn't mention `directives` or `media-urls` for MNW.
23 +
24 + ### [MEDIUM] Missing `docs/todo.md` and `docs/audit_review.md`
25 +
26 + Per cross-cutting conventions, each project should have `todo.md` and `audit_review.md` in `docs/`. Only `architecture.md` exists.
27 +
28 + ### [LOW] `Permissive` and `Standard` sanitize presets are identical
29 +
30 + In `sanitize.rs:17`, both `Permissive` and `Standard` map to `ammonia::clean(html)`. The doc comment for `Standard` says "Same as Permissive." The distinction is intentional — they differ in the `Renderer`'s markdown settings (Standard strips images, Permissive doesn't) — but the sanitize preset enum having two identical variants with a comment that says "Same" could confuse contributors. A comment clarifying "Same sanitization; markdown-level differences are configured in the Renderer" would help.
31 +
32 + ### [LOW] `strip_html_tags` in doc_loader.rs does not decode HTML entities
33 +
34 + The search index generator (`strip_html_tags`, lines 162-182) strips tags but leaves HTML entities (`&amp;`, `&lt;`, etc.) intact. Searching for "A & B" won't match content rendered as "A &amp; B". Low impact since doc search is client-side and search terms are unlikely to contain entities, but worth noting.
35 +
36 + ### [LOW] `render.rs` at 511 lines
37 +
38 + Technically exceeds the 500-line guideline, but 268 lines are tests. The logic is ~235 lines. Within the spirit of the rule. If the test suite grows further, consider moving tests to a submodule.
39 +
40 + ### [INFO] `rewrite_links` regex is naive about nested brackets
41 +
42 + The regex `\[([^\]]+)\]\(([^)]+)\)` in doc_loader.rs cannot handle nested brackets in link text or parentheses in URLs. Low risk since doc files are authored by the project owner, but edge cases like `[text](url_(with_parens))` would be malformed.
43 +
44 + ### [INFO] `extract_title` silently fails on frontmatter-prefixed documents
45 +
46 + If called on raw markdown that starts with `+++` TOML frontmatter, `extract_title` returns `None` because the `+++` line is neither empty nor `---`. This is the correct behavior (frontmatter should be stripped first via `parse_frontmatter`), but the interaction is documented nowhere.
47 +
48 + ### [INFO] `html_escape` uses sequential string replacements
49 +
50 + Five sequential `.replace()` calls, each allocating a new String. A single-pass approach would be more efficient, but this function is only called in template contexts (TOC, quotes, video tags), not in the hot render path. Negligible impact.
51 +
52 + ---
53 +
54 + ## Strengths
55 +
56 + - **Preset system is the right abstraction.** Bundles markdown features with matching sanitization levels, preventing dangerous misconfigurations (e.g., raw HTML without sanitization). Builder pattern still allows per-instance overrides.
57 + - **Feature gate design.** Zero unnecessary dependencies for core-only consumers. Smart split of `regex` (doc-loader, complex patterns) vs `regex-lite` (directives/mentions/quotes/media-urls, simple patterns).
58 + - **Security-conscious.** All paths go through ammonia sanitization. Dangerous URL schemes detected case-insensitively. Path traversal blocked in media URL rewriting. HTML escaping on all user-supplied strings interpolated into HTML. Zero unsafe code.
59 + - **Test quality.** 141 tests at ~1.09:1 test-to-logic ratio. Tests cover happy paths, edge cases, security scenarios. All co-located with implementation.
60 + - **Clean module boundaries.** Each module does one thing. No circular dependencies. Feature gates cleanly gate whole modules.
61 + - **Directive system is extensible.** Any `[!UPPERCASE]` blockquote becomes an alert div. Code tabs auto-detect language labels. Both are implemented as HTML post-processing, keeping the core render pipeline simple.
62 +
63 + ## Security Checklist
64 +
65 + | Check | Status |
66 + |-------|--------|
67 + | XSS via raw HTML | Pass — ammonia sanitization on all presets |
68 + | XSS via markdown | Pass — strict preset strips raw HTML at parser level + sanitizes |
69 + | javascript:/data:/vbscript: URLs | Pass — detected case-insensitively, neutralized to `#` |
70 + | Path traversal in media URLs | Pass — `..` paths rejected |
71 + | User string injection in HTML | Pass — `html_escape()` applied in quotes, TOC, media tags |
72 + | Unsafe code | Pass — zero `unsafe` blocks |
73 +
74 + ## Metrics
75 +
76 + | Metric | Value |
77 + |--------|-------|
78 + | Source LOC (logic) | ~1,310 |
79 + | Source LOC (tests) | ~1,235 |
80 + | Source LOC (total) | ~2,550 |
81 + | Source files | 13 |
82 + | Test count | 141 |
83 + | Tests/KLOC (logic) | ~108 |
84 + | Clippy warnings | 0 |
85 + | Unsafe blocks | 0 |
86 + | Cargo features | 6 (+full) |
87 + | Direct dependencies | 7 (3 always, 4 optional) |
88 + | Consumers | 5 (MNW, Multithreaded, GO, BB, AF) |
89 + | Audit advisories | 0 (1 allowed warning) |
90 +
91 + ## Module Heatmap
92 +
93 + | Module | Code | Test | Security | Docs |
94 + |--------|:----:|:----:|:--------:|:----:|
95 + | render.rs | A | A | A | A |
96 + | directives.rs | A | A | A- | B (not in README/arch) |
97 + | doc_loader.rs | A | A- | A | A- |
98 + | media_urls.rs | A | A | A | B (not in README/arch) |
99 + | toc.rs | A | A- | A | A |
100 + | mentions.rs | A | A | A | A |
101 + | frontmatter.rs | A | A | A | A |
102 + | code_spans.rs | A | A- | A | A- |
103 + | sanitize.rs | A- | A- | A | B+ (confusing "Same" comment) |
104 + | text.rs | A | A | A | A |
105 + | escape.rs | A | A- | A | A |
106 + | quotes.rs | A | A- | A | A |
107 + | lib.rs | A | — | A | A |
108 +
109 + ## Action Items
110 +
111 + 1. ~~**[MEDIUM]** Update README feature flag table to include `directives` and `media-urls`~~ — Done. Feature table, feature-gated API table, and consumers table all updated.
112 + 2. ~~**[MEDIUM]** Update `architecture.md` module map and consumers table~~ — Done. Added directives.rs, media_urls.rs, escape.rs to module map. Updated consumers, key paths, and added directive design decision.
113 + 3. ~~**[LOW]** Clarify `Standard` sanitize preset doc comment~~ — Done. Explains the difference is at the Renderer level.
114 + 4. ~~**[LOW]** Consider HTML entity decoding in `strip_html_tags` for search index accuracy~~ — Done. Decodes `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#x27;`, `&#39;` after tag stripping.
@@ -0,0 +1,443 @@
1 + //! Post-process rendered HTML to convert blockquote-based directives into
2 + //! styled elements.
3 + //!
4 + //! **Alerts:** `> [!NOTE]`, `> [!TIP]`, `> [!WARNING]`, `> [!CAUTION]`,
5 + //! `> [!IMPORTANT]`, and any custom `> [!TYPE]` marker become styled
6 + //! `<div class="alert alert-{type}">` callout elements.
7 + //!
8 + //! **Code tabs:** `> [!TABS]` followed by fenced code blocks become a tabbed
9 + //! interface with language-labelled tabs.
10 +
11 + use std::sync::LazyLock;
12 +
13 + /// Matches any `[!TYPE]` alert marker inside a blockquote paragraph.
14 + /// Accepts any uppercase word (letters, digits, hyphens, underscores).
15 + static ALERT_RE: LazyLock<regex_lite::Regex> = LazyLock::new(|| {
16 + regex_lite::Regex::new(
17 + r"<blockquote>\s*<p>\[!([A-Z][A-Z0-9_-]*)\](?:<br\s*/?>)?\s*",
18 + )
19 + .expect("valid alert regex")
20 + });
21 +
22 + /// Process all directives: code tabs first, then alerts.
23 + pub fn post_process_directives(html: &str) -> String {
24 + let with_tabs = process_tabs(html);
25 + process_alerts(&with_tabs)
26 + }
27 +
28 + /// Replace alert blockquotes with styled `<div class="alert ...">` elements.
29 + fn process_alerts(html: &str) -> String {
30 + // First pass: replace opening markers.
31 + let opened = ALERT_RE.replace_all(html, |caps: &regex_lite::Captures| {
32 + let kind = &caps[1];
33 + // Skip TABS — already handled by process_tabs.
34 + if kind == "TABS" {
35 + return caps[0].to_string();
36 + }
37 + let label = title_case(kind);
38 + format!(
39 + "<div class=\"alert alert-{kind}\"><p class=\"alert-title\">{label}</p><p>",
40 + kind = kind.to_ascii_lowercase(),
41 + label = label,
42 + )
43 + });
44 +
45 + // Second pass: close any opened alerts.
46 + let alert_count = ALERT_RE
47 + .captures_iter(html)
48 + .filter(|c| &c[1] != "TABS")
49 + .count();
50 + if alert_count == 0 {
51 + return opened.into_owned();
52 + }
53 +
54 + let mut result = String::with_capacity(opened.len());
55 + let mut remaining = opened.as_ref();
56 + let mut replaced = 0;
57 +
58 + while replaced < alert_count {
59 + if let Some(pos) = remaining.find("</blockquote>") {
60 + result.push_str(&remaining[..pos]);
61 + result.push_str("</div>");
62 + remaining = &remaining[(pos + "</blockquote>".len())..];
63 + replaced += 1;
64 + } else {
65 + break;
66 + }
67 + }
68 + result.push_str(remaining);
69 + result
70 + }
71 +
72 + /// Process `[!TABS]` blockquotes into tabbed code-block interfaces.
73 + fn process_tabs(html: &str) -> String {
74 + if !html.contains("[!TABS]") {
75 + return html.to_string();
76 + }
77 +
78 + let mut result = String::with_capacity(html.len());
79 + let mut remaining = html;
80 +
81 + while let Some(bq_pos) = remaining.find("<blockquote>") {
82 + let after_bq_start = bq_pos + "<blockquote>".len();
83 +
84 + // Find the closing </blockquote> for this blockquote.
85 + let close_pos = match remaining[bq_pos..].find("</blockquote>") {
86 + Some(p) => bq_pos + p,
87 + None => break,
88 + };
89 +
90 + let inner = &remaining[after_bq_start..close_pos];
91 +
92 + // Check if the first <p> in the blockquote contains [!TABS].
93 + let is_tabs = {
94 + let trimmed = inner.trim_start();
95 + trimmed.starts_with("<p>") && {
96 + let first_p_end = trimmed.find("</p>").unwrap_or(trimmed.len());
97 + trimmed[..first_p_end].contains("[!TABS]")
98 + }
99 + };
100 +
101 + if !is_tabs {
102 + // Not a TABS blockquote — copy through the opening tag and continue.
103 + result.push_str(&remaining[..after_bq_start]);
104 + remaining = &remaining[after_bq_start..];
105 + continue;
106 + }
107 +
108 + // Copy everything before this blockquote.
109 + result.push_str(&remaining[..bq_pos]);
110 +
111 + // Extract code blocks from the inner HTML.
112 + let tabs = extract_code_blocks(inner);
113 +
114 + if tabs.is_empty() {
115 + // No code blocks found — wrap content in a plain div.
116 + result.push_str("<div class=\"code-tabs\">");
117 + result.push_str(inner);
118 + result.push_str("</div>");
119 + } else {
120 + result.push_str(&build_tabs_html(&tabs));
121 + }
122 +
123 + remaining = &remaining[close_pos + "</blockquote>".len()..];
124 + }
125 +
126 + result.push_str(remaining);
127 + result
128 + }
129 +
130 + /// Extract `(language, full_html_block)` pairs from HTML containing
131 + /// `<pre><code>` elements.
132 + fn extract_code_blocks(html: &str) -> Vec<(String, String)> {
133 + let mut blocks = Vec::new();
134 + let mut search_from = 0;
135 + let end_marker = "</code></pre>";
136 +
137 + while let Some(pre_pos) = html[search_from..].find("<pre><code") {
138 + let abs_pos = search_from + pre_pos;
139 +
140 + let end_pos = match html[abs_pos..].find(end_marker) {
141 + Some(p) => abs_pos + p + end_marker.len(),
142 + None => break,
143 + };
144 +
145 + let full_block = &html[abs_pos..end_pos];
146 +
147 + // Extract language from class="language-X".
148 + let lang = if let Some(class_start) = full_block.find("class=\"language-") {
149 + let after = &full_block[class_start + "class=\"language-".len()..];
150 + after.split('"').next().unwrap_or("code").to_string()
151 + } else {
152 + "code".to_string()
153 + };
154 +
155 + blocks.push((lang, full_block.to_string()));
156 + search_from = end_pos;
157 + }
158 +
159 + blocks
160 + }
161 +
162 + /// Build tabbed HTML from extracted code blocks.
163 + fn build_tabs_html(tabs: &[(String, String)]) -> String {
164 + let mut html = String::from("<div class=\"code-tabs\">\n<div class=\"code-tabs-bar\">");
165 +
166 + for (i, (lang, _)) in tabs.iter().enumerate() {
167 + let active = if i == 0 { " active" } else { "" };
168 + let label = code_language_label(lang);
169 + html.push_str(&format!(
170 + "<button class=\"code-tab{active}\" data-tab-index=\"{i}\">{label}</button>"
171 + ));
172 + }
173 +
174 + html.push_str("</div>\n");
175 +
176 + for (i, (_, block)) in tabs.iter().enumerate() {
177 + let active = if i == 0 { " active" } else { "" };
178 + html.push_str(&format!(
179 + "<div class=\"code-tab-panel{active}\" data-tab-index=\"{i}\">{block}</div>\n"
180 + ));
181 + }
182 +
183 + html.push_str("</div>");
184 + html
185 + }
186 +
187 + /// Human-readable label for a code language identifier.
188 + fn code_language_label(lang: &str) -> String {
189 + match lang {
190 + "js" | "javascript" => "JavaScript".into(),
191 + "ts" | "typescript" => "TypeScript".into(),
192 + "sh" | "bash" | "zsh" | "shell" => "Shell".into(),
193 + "json" => "JSON".into(),
194 + "html" => "HTML".into(),
195 + "css" => "CSS".into(),
196 + "sql" => "SQL".into(),
197 + "toml" => "TOML".into(),
198 + "yaml" | "yml" => "YAML".into(),
199 + "xml" => "XML".into(),
200 + other => title_case(other),
201 + }
202 + }
203 +
204 + fn title_case(s: &str) -> String {
205 + let mut chars = s.chars();
206 + match chars.next() {
207 + Some(c) => {
208 + let mut out = c.to_uppercase().to_string();
209 + out.extend(chars.map(|c| c.to_ascii_lowercase()));
210 + out
211 + }
212 + None => String::new(),
213 + }
214 + }
215 +
216 + #[cfg(test)]
217 + mod tests {
218 + use super::*;
219 +
220 + // ===== Alert directives =====
221 +
222 + #[test]
223 + fn note_alert() {
224 + let html = "<blockquote>\n<p>[!NOTE]<br>\nThis is a note.</p>\n</blockquote>";
225 + let result = post_process_directives(html);
226 + assert!(result.contains("alert alert-note"));
227 + assert!(result.contains("<p class=\"alert-title\">Note</p>"));
228 + assert!(result.contains("This is a note."));
229 + assert!(!result.contains("<blockquote>"));
230 + }
231 +
232 + #[test]
233 + fn tip_alert() {
234 + let html = "<blockquote>\n<p>[!TIP]<br>\nHelpful tip here.</p>\n</blockquote>";
235 + let result = post_process_directives(html);
236 + assert!(result.contains("alert alert-tip"));
237 + assert!(result.contains("<p class=\"alert-title\">Tip</p>"));
238 + }
239 +
240 + #[test]
241 + fn important_alert() {
242 + let html = "<blockquote>\n<p>[!IMPORTANT]<br>\nDo this.</p>\n</blockquote>";
243 + let result = post_process_directives(html);
244 + assert!(result.contains("alert alert-important"));
245 + assert!(result.contains("<p class=\"alert-title\">Important</p>"));
246 + }
247 +
248 + #[test]
249 + fn warning_alert() {
250 + let html = "<blockquote>\n<p>[!WARNING]<br>\nBe careful.</p>\n</blockquote>";
251 + let result = post_process_directives(html);
252 + assert!(result.contains("alert alert-warning"));
253 + assert!(result.contains("<p class=\"alert-title\">Warning</p>"));
254 + }
255 +
256 + #[test]
257 + fn caution_alert() {
258 + let html = "<blockquote>\n<p>[!CAUTION]<br/>\nDanger zone.</p>\n</blockquote>";
259 + let result = post_process_directives(html);
260 + assert!(result.contains("alert alert-caution"));
261 + assert!(result.contains("<p class=\"alert-title\">Caution</p>"));
262 + }
263 +
264 + #[test]
265 + fn multi_paragraph_alert() {
266 + let html = "<blockquote>\n<p>[!NOTE]<br>\nFirst paragraph.</p>\n<p>Second paragraph.</p>\n</blockquote>";
267 + let result = post_process_directives(html);
268 + assert!(result.contains("alert alert-note"));
269 + assert!(result.contains("First paragraph."));
270 + assert!(result.contains("Second paragraph."));
271 + assert!(result.contains("</div>"));
272 + assert!(!result.contains("</blockquote>"));
273 + }
274 +
275 + #[test]
276 + fn regular_blockquote_unchanged() {
277 + let html = "<blockquote>\n<p>Just a normal quote.</p>\n</blockquote>";
278 + let result = post_process_directives(html);
279 + assert_eq!(result, html);
280 + }
281 +
282 + #[test]
283 + fn mixed_alerts_and_blockquotes() {
284 + let html = concat!(
285 + "<blockquote>\n<p>[!WARNING]<br>\nWatch out!</p>\n</blockquote>\n",
286 + "<blockquote>\n<p>Normal quote.</p>\n</blockquote>"
287 + );
288 + let result = post_process_directives(html);
289 + assert!(result.contains("alert alert-warning"));
290 + assert!(result.contains("Watch out!"));
291 + // The normal blockquote remains unchanged.
292 + assert!(result.contains("<blockquote>"));
293 + assert!(result.contains("Normal quote."));
294 + }
295 +
296 + // ===== Custom alert types =====
297 +
298 + #[test]
299 + fn custom_example_alert() {
300 + let html = "<blockquote>\n<p>[!EXAMPLE]<br>\nHere is an example.</p>\n</blockquote>";
301 + let result = post_process_directives(html);
302 + assert!(result.contains("alert alert-example"));
303 + assert!(result.contains("<p class=\"alert-title\">Example</p>"));
304 + assert!(result.contains("Here is an example."));
305 + assert!(!result.contains("<blockquote>"));
306 + }
307 +
308 + #[test]
309 + fn custom_definition_alert() {
310 + let html = "<blockquote>\n<p>[!DEFINITION]<br>\nA term and its meaning.</p>\n</blockquote>";
311 + let result = post_process_directives(html);
312 + assert!(result.contains("alert alert-definition"));
313 + assert!(result.contains("<p class=\"alert-title\">Definition</p>"));
314 + }
315 +
316 + #[test]
317 + fn custom_alert_with_hyphen() {
318 + let html =
319 + "<blockquote>\n<p>[!SEE-ALSO]<br>\nRelated topics.</p>\n</blockquote>";
320 + let result = post_process_directives(html);
321 + assert!(result.contains("alert alert-see-also"));
322 + assert!(result.contains("<p class=\"alert-title\">See-also</p>"));
323 + }
324 +
325 + // ===== Code tabs =====
326 +
327 + #[test]
328 + fn tabs_two_languages() {
329 + let html = concat!(
330 + "<blockquote>\n<p>[!TABS]</p>\n",
331 + "<pre><code class=\"language-rust\">fn main() {}\n</code></pre>\n",
332 + "<pre><code class=\"language-python\">def main(): pass\n</code></pre>\n",
333 + "</blockquote>"
334 + );
335 + let result = post_process_directives(html);
336 + assert!(result.contains("code-tabs"));
337 + assert!(result.contains("code-tabs-bar"));
338 + assert!(result.contains("Rust"));
339 + assert!(result.contains("Python"));
340 + assert!(result.contains("fn main() {}"));
341 + assert!(result.contains("def main(): pass"));
342 + assert!(!result.contains("<blockquote>"));
343 + // First tab is active.
344 + assert!(result.contains("code-tab active"));
345 + assert!(result.contains("code-tab-panel active"));
346 + }
347 +
348 + #[test]
349 + fn tabs_three_languages() {
350 + let html = concat!(
351 + "<blockquote>\n<p>[!TABS]</p>\n",
352 + "<pre><code class=\"language-bash\">curl https://api.example.com\n</code></pre>\n",
353 + "<pre><code class=\"language-js\">fetch('https://api.example.com')\n</code></pre>\n",
354 + "<pre><code class=\"language-python\">requests.get('https://api.example.com')\n</code></pre>\n",
355 + "</blockquote>"
356 + );
357 + let result = post_process_directives(html);
358 + assert!(result.contains("Shell")); // bash → Shell
359 + assert!(result.contains("JavaScript")); // js → JavaScript
360 + assert!(result.contains("Python"));
361 + assert!(result.contains("data-tab-index=\"0\""));
362 + assert!(result.contains("data-tab-index=\"1\""));
363 + assert!(result.contains("data-tab-index=\"2\""));
364 + }
365 +
366 + #[test]
367 + fn tabs_no_language_specified() {
368 + let html = concat!(
369 + "<blockquote>\n<p>[!TABS]</p>\n",
370 + "<pre><code>some code\n</code></pre>\n",
371 + "<pre><code class=\"language-rust\">let x = 1;\n</code></pre>\n",
372 + "</blockquote>"
373 + );
374 + let result = post_process_directives(html);
375 + assert!(result.contains("Code")); // fallback label
376 + assert!(result.contains("Rust"));
377 + }
378 +
379 + #[test]
380 + fn tabs_with_br_marker() {
381 + let html = concat!(
382 + "<blockquote>\n<p>[!TABS]<br>\n</p>\n",
383 + "<pre><code class=\"language-toml\">[package]\n</code></pre>\n",
384 + "<pre><code class=\"language-json\">{}\n</code></pre>\n",
385 + "</blockquote>"
386 + );
387 + let result = post_process_directives(html);
388 + assert!(result.contains("TOML"));
389 + assert!(result.contains("JSON"));
390 + }
391 +
392 + #[test]
393 + fn tabs_mixed_with_alert_and_blockquote() {
394 + let html = concat!(
395 + "<blockquote>\n<p>[!NOTE]<br>\nA note.</p>\n</blockquote>\n",
396 + "<blockquote>\n<p>[!TABS]</p>\n",
397 + "<pre><code class=\"language-rust\">let x = 1;\n</code></pre>\n",
398 + "</blockquote>\n",
399 + "<blockquote>\n<p>Normal quote.</p>\n</blockquote>"
400 + );
401 + let result = post_process_directives(html);
402 + // Alert processed.
403 + assert!(result.contains("alert alert-note"));
404 + // Tabs processed.
405 + assert!(result.contains("code-tabs"));
406 + assert!(result.contains("Rust"));
407 + // Normal blockquote unchanged.
408 + assert!(result.contains("<blockquote>"));
409 + assert!(result.contains("Normal quote."));
410 + }
411 +
412 + #[test]
413 + fn tabs_no_code_blocks() {
414 + let html = concat!(
415 + "<blockquote>\n<p>[!TABS]</p>\n",
416 + "<p>Just text, no code.</p>\n",
417 + "</blockquote>"
418 + );
419 + let result = post_process_directives(html);
420 + assert!(result.contains("code-tabs"));
421 + assert!(result.contains("Just text, no code."));
422 + assert!(!result.contains("<blockquote>"));
423 + }
424 +
425 + // ===== Language label mapping =====
426 +
427 + #[test]
428 + fn language_labels() {
429 + assert_eq!(code_language_label("js"), "JavaScript");
430 + assert_eq!(code_language_label("typescript"), "TypeScript");
431 + assert_eq!(code_language_label("bash"), "Shell");
432 + assert_eq!(code_language_label("json"), "JSON");
433 + assert_eq!(code_language_label("html"), "HTML");
434 + assert_eq!(code_language_label("css"), "CSS");
435 + assert_eq!(code_language_label("sql"), "SQL");
436 + assert_eq!(code_language_label("toml"), "TOML");
437 + assert_eq!(code_language_label("yaml"), "YAML");
438 + assert_eq!(code_language_label("xml"), "XML");
439 + assert_eq!(code_language_label("rust"), "Rust");
440 + assert_eq!(code_language_label("python"), "Python");
441 + assert_eq!(code_language_label("go"), "Go");
442 + }
443 + }
M src/doc_loader.rs +103 -9
@@ -35,6 +35,15 @@ pub struct DocIndexEntry {
35 35 pub section: String,
36 36 }
37 37
38 + /// Entry in the full-text search index, serialised to JSON for client-side search.
39 + #[derive(Clone, Debug, serde::Serialize)]
40 + pub struct DocSearchEntry {
41 + pub slug: String,
42 + pub title: String,
43 + pub section: String,
44 + pub body_text: String,
45 + }
46 +
38 47 /// In-memory store of rendered documentation pages, built once at startup.
39 48 #[derive(Clone, Debug)]
40 49 pub struct DocLoader {
@@ -56,9 +65,15 @@ impl DocLoader {
56 65 continue;
57 66 }
58 67
59 - let mut entries: Vec<_> = std::fs::read_dir(&section_path)
60 - .into_iter()
61 - .flatten()
68 + let read_dir = match std::fs::read_dir(&section_path) {
69 + Ok(rd) => rd,
70 + Err(e) => {
71 + tracing::warn!(path = %section_path.display(), error = %e, "Failed to read docs section directory");
72 + continue;
73 + }
74 + };
75 +
76 + let mut entries: Vec<_> = read_dir
62 77 .filter_map(|e| e.ok())
63 78 .filter(|e| {
64 79 e.path()
@@ -92,21 +107,24 @@ impl DocLoader {
92 107 );
93 108 let md_without_title = crate::text::strip_first_heading(&rewritten_md);
94 109 let html_content = crate::render_permissive(&md_without_title);
110 + #[cfg(feature = "directives")]
111 + let html_content = crate::directives::post_process_directives(&html_content);
95 112
96 113 let page = DocPage {
97 - title: title.clone(),
98 - slug: slug.clone(),
114 + title,
115 + slug,
99 116 section: section_display.clone(),
100 117 html_content,
101 118 };
102 119
103 120 index.push(DocIndexEntry {
104 - title: title.clone(),
105 - slug: slug.clone(),
106 - section: section_display.clone(),
121 + title: page.title.clone(),
122 + slug: page.slug.clone(),
123 + section: page.section.clone(),
107 124 });
108 125
109 - pages.insert(slug, page);
126 + let slug_key = page.slug.clone();
127 + pages.insert(slug_key, page);
110 128 }
111 129 }
112 130
@@ -122,6 +140,53 @@ impl DocLoader {
122 140 pub fn index(&self) -> &[DocIndexEntry] {
123 141 &self.index
124 142 }
143 +
144 + /// Build a search index with HTML stripped to plain text.
145 + pub fn search_index(&self) -> Vec<DocSearchEntry> {
146 + self.index
147 + .iter()
148 + .filter_map(|entry| {
149 + let page = self.pages.get(&entry.slug)?;
150 + Some(DocSearchEntry {
151 + slug: entry.slug.clone(),
152 + title: entry.title.clone(),
153 + section: entry.section.clone(),
154 + body_text: strip_html_tags(&page.html_content),
155 + })
156 + })
157 + .collect()
158 + }
159 + }
160 +
161 + /// Strip HTML tags from a string, returning plain text.
162 + /// Decodes common HTML entities so search indexes match plain-text queries.
163 + fn strip_html_tags(html: &str) -> String {
164 + let mut out = String::with_capacity(html.len());
165 + let mut in_tag = false;
166 + for ch in html.chars() {
167 + match ch {
168 + '<' => in_tag = true,
169 + '>' => {
170 + in_tag = false;
171 + // Add a space after closing tags to separate words.
172 + if !out.ends_with(' ') {
173 + out.push(' ');
174 + }
175 + }
176 + _ if !in_tag => out.push(ch),
177 + _ => {}
178 + }
179 + }
180 + // Collapse runs of whitespace.
181 + let collapsed: String = out.split_whitespace().collect::<Vec<_>>().join(" ");
182 + // Decode common HTML entities for search index accuracy.
183 + collapsed
184 + .replace("&amp;", "&")
185 + .replace("&lt;", "<")
186 + .replace("&gt;", ">")
187 + .replace("&quot;", "\"")
188 + .replace("&#x27;", "'")
189 + .replace("&#39;", "'")
125 190 }
126 191
127 192 /// Rewrite relative `.md` links to the configured prefix.
@@ -257,4 +322,33 @@ mod tests {
257 322 let result = rewrite_links(md, "/docs", None);
258 323 assert_eq!(result, md);
259 324 }
325 +
326 + #[test]
327 + fn strip_html_tags_removes_tags() {
328 + let html = "<p>Hello <strong>world</strong></p>";
329 + assert_eq!(strip_html_tags(html), "Hello world");
330 + }
331 +
332 + #[test]
333 + fn strip_html_tags_empty_input() {
334 + assert_eq!(strip_html_tags(""), "");
335 + }
336 +
337 + #[test]
338 + fn strip_html_tags_decodes_entities() {
339 + let html = "<p>Price: $10 &amp; free</p>";
340 + assert_eq!(strip_html_tags(html), "Price: $10 & free");
341 +
342 + let html2 = "<p>a &lt; b &gt; c</p>";
343 + assert_eq!(strip_html_tags(html2), "a < b > c");
344 +
345 + let html3 = "<p>&quot;hello&quot; &amp; &#x27;world&#39;</p>";
346 + assert_eq!(strip_html_tags(html3), "\"hello\" & 'world'");
347 + }
348 +
349 + #[test]
350 + fn strip_html_tags_nested_tags() {
351 + let html = "<div><p>A <em>nested <strong>deep</strong></em> tag</p></div>";
352 + assert_eq!(strip_html_tags(html), "A nested deep tag");
353 + }
260 354 }
@@ -0,0 +1,33 @@
1 + /// HTML-escape a string for safe interpolation into element content or attributes.
2 + ///
3 + /// Escapes all five HTML-significant characters: `& < > " '`.
4 + pub(crate) fn html_escape(s: &str) -> String {
5 + s.replace('&', "&amp;")
6 + .replace('<', "&lt;")
7 + .replace('>', "&gt;")
8 + .replace('"', "&quot;")
9 + .replace('\'', "&#x27;")
10 + }
11 +
12 + #[cfg(test)]
13 + mod tests {
14 + use super::*;
15 +
16 + #[test]
17 + fn escapes_all_five_chars() {
18 + assert_eq!(
19 + html_escape("A & B < C > D \" E ' F"),
20 + "A &amp; B &lt; C &gt; D &quot; E &#x27; F"
21 + );
22 + }
23 +
24 + #[test]
25 + fn no_change_for_safe_string() {
26 + assert_eq!(html_escape("hello world"), "hello world");
27 + }
28 +
29 + #[test]
30 + fn empty_string() {
31 + assert_eq!(html_escape(""), "");
32 + }
33 + }
@@ -40,7 +40,10 @@ pub fn parse_frontmatter(input: &str) -> (Option<Frontmatter>, &str) {
40 40
41 41 match toml::from_str::<Frontmatter>(toml_content) {
42 42 Ok(fm) => (Some(fm), rest_slice),
43 - Err(_) => (None, input),
43 + Err(e) => {
44 + tracing::warn!(error = %e, "Failed to parse TOML frontmatter");
45 + (None, input)
46 + }
44 47 }
45 48 } else {
46 49 (None, input)
M src/lib.rs +21 -1
@@ -1,10 +1,24 @@
1 + //! Configurable markdown-to-HTML rendering with sanitization presets.
2 + //!
3 + //! Provides four rendering presets for different trust levels:
4 + //! - **Permissive** -- full GFM (tables, footnotes, images, raw HTML). For trusted content.
5 + //! - **Standard** -- GFM without images. For app text fields.
6 + //! - **Strict** -- no images, no raw HTML, dangerous scheme filtering, nofollow. For UGC.
7 + //! - **Sanitize-only** -- ammonia cleaning without markdown parsing. For external HTML.
8 + //!
9 + //! Optional features add document loading, TOML frontmatter, @mention resolution,
10 + //! and quote attribution post-processing.
11 +
1 12 #[cfg(any(feature = "mentions", test))]
2 13 mod code_spans;
14 + mod escape;
3 15 mod render;
4 16 mod sanitize;
5 17 mod text;
6 18 mod toc;
7 19
20 + #[cfg(feature = "directives")]
21 + mod directives;
8 22 #[cfg(feature = "doc-loader")]
9 23 mod doc_loader;
10 24 #[cfg(feature = "frontmatter")]
@@ -13,6 +27,8 @@ mod frontmatter;
13 27 mod mentions;
14 28 #[cfg(feature = "quotes")]
15 29 mod quotes;
30 + #[cfg(feature = "media-urls")]
31 + mod media_urls;
16 32
17 33 // Re-export core types
18 34 pub use render::{RenderResult, Renderer};
@@ -21,14 +37,18 @@ pub use text::{extract_title, reading_time_minutes, strip_first_heading, word_co
21 37 pub use toc::{TocEntry, extract_toc, render_toc_html};
22 38
23 39 // Re-export feature-gated types
40 + #[cfg(feature = "directives")]
41 + pub use directives::post_process_directives;
24 42 #[cfg(feature = "doc-loader")]
25 - pub use doc_loader::{DocIndexEntry, DocLoader, DocLoaderConfig, DocPage};
43 + pub use doc_loader::{DocIndexEntry, DocLoader, DocLoaderConfig, DocPage, DocSearchEntry};
26 44 #[cfg(feature = "frontmatter")]
27 45 pub use frontmatter::{Frontmatter, parse_frontmatter};
28 46 #[cfg(feature = "mentions")]
29 47 pub use mentions::{extract_mentions, resolve_mentions};
30 48 #[cfg(feature = "quotes")]
31 49 pub use quotes::{QuoteAuthor, post_process_quotes};
50 + #[cfg(feature = "media-urls")]
51 + pub use media_urls::{img_to_video, rewrite_media_paths};
32 52
33 53 /// Render markdown with the permissive preset (GFM features, default ammonia).
34 54 pub fn render_permissive(markdown: &str) -> String {
@@ -0,0 +1,235 @@
1 + //! Pre-process and post-process markdown/HTML for media file references.
2 + //!
3 + //! Two-stage pipeline:
4 + //! 1. **Pre-process markdown** — rewrite `![alt](folder/file.png)` to
5 + //! `![alt](https://cdn.makenot.work/{user_id}/media/folder/file.png)`.
6 + //! 2. **Post-process HTML** — convert `<img src="...file.mp4">` to
7 + //! `<video controls src="..."></video>`.
8 +
9 + use std::sync::LazyLock;
10 +
11 + /// Matches markdown image syntax: `![alt text](url)`
12 + /// Captures: group 1 = alt text, group 2 = URL path
13 + static MD_IMAGE_RE: LazyLock<regex_lite::Regex> = LazyLock::new(|| {
14 + regex_lite::Regex::new(r"!\[([^\]]*)\]\(([^)]+)\)").expect("valid markdown image regex")
15 + });
16 +
17 + /// Matches `<img` tags with a src pointing to a video extension.
18 + static IMG_VIDEO_RE: LazyLock<regex_lite::Regex> = LazyLock::new(|| {
19 + regex_lite::Regex::new(
20 + r#"<img\s+([^>]*?)src="([^"]*\.(?:mp4|webm|mov))"([^>]*?)\s*/?>"#,
21 + )
22 + .expect("valid img video regex")
23 + });
24 +
25 + /// Matches `alt="..."` in an img tag's attributes.
26 + static ALT_RE: LazyLock<regex_lite::Regex> = LazyLock::new(|| {
27 + regex_lite::Regex::new(r#"alt="([^"]*)""#).expect("valid alt regex")
28 + });
29 +
30 + /// Rewrite relative image paths in markdown to absolute CDN URLs.
31 + ///
32 + /// Skips:
33 + /// - Absolute URLs (`http://`, `https://`, `data:`)
34 + /// - Absolute paths starting with `/`
35 + /// - Paths containing `..` (path traversal)
36 + ///
37 + /// Rewrites relative paths to: `{cdn_base}/{user_id}/media/{path}`
38 + pub fn rewrite_media_paths(markdown: &str, cdn_base: &str, user_id: &str) -> String {
39 + let cdn_base = cdn_base.trim_end_matches('/');
40 +
41 + MD_IMAGE_RE
42 + .replace_all(markdown, |caps: &regex_lite::Captures| {
43 + let alt = &caps[1];
44 + let path = &caps[2];
45 +
46 + // Skip absolute URLs and data URIs
47 + if path.starts_with("http://")
48 + || path.starts_with("https://")
49 + || path.starts_with("data:")
50 + || path.starts_with('/')
51 + {
52 + return caps[0].to_string();
53 + }
54 +
55 + // Reject path traversal
56 + if path.contains("..") {
57 + return caps[0].to_string();
58 + }
59 +
60 + format!("![{}]({}/{}/media/{})", alt, cdn_base, user_id, path)
61 + })
62 + .into_owned()
63 + }
64 +
65 + /// Convert `<img>` tags with video extensions (.mp4, .webm, .mov) to `<video>` elements.
66 + ///
67 + /// Preserves alt text as fallback content inside the `<video>` tag.
68 + pub fn img_to_video(html: &str) -> String {
69 + IMG_VIDEO_RE
70 + .replace_all(html, |caps: &regex_lite::Captures| {
71 + let before_src = &caps[1];
72 + let src = &caps[2];
73 + let after_src = &caps[3];
74 +
75 + // Extract alt text if present
76 + let attrs = format!("{}{}", before_src, after_src);
77 + let alt = ALT_RE
78 + .captures(&attrs)
79 + .map(|c| c[1].to_string())
80 + .unwrap_or_default();
81 +
82 + if alt.is_empty() {
83 + format!(r#"<video controls src="{}">Your browser does not support video.</video>"#, src)
84 + } else {
85 + format!(
86 + r#"<video controls src="{}">{}</video>"#,
87 + src,
88 + crate::escape::html_escape(&alt)
89 + )
90 + }
91 + })
92 + .into_owned()
93 + }
94 +
95 + #[cfg(test)]
96 + mod tests {
97 + use super::*;
98 +
99 + #[test]
100 + fn relative_path_rewritten() {
101 + let md = "![Screenshot](screenshots/demo.png)";
102 + let result = rewrite_media_paths(md, "https://cdn.makenot.work", "user-123");
103 + assert_eq!(
104 + result,
105 + "![Screenshot](https://cdn.makenot.work/user-123/media/screenshots/demo.png)"
106 + );
107 + }
108 +
109 + #[test]
110 + fn absolute_url_unchanged() {
111 + let md = "![Logo](https://example.com/logo.png)";
112 + let result = rewrite_media_paths(md, "https://cdn.makenot.work", "user-123");
113 + assert_eq!(result, md);
114 + }
115 +
116 + #[test]
117 + fn http_url_unchanged() {
118 + let md = "![Logo](http://example.com/logo.png)";
119 + let result = rewrite_media_paths(md, "https://cdn.makenot.work", "user-123");
120 + assert_eq!(result, md);
121 + }
122 +
123 + #[test]
124 + fn data_uri_unchanged() {
125 + let md = "![Pixel](data:image/png;base64,abc)";
126 + let result = rewrite_media_paths(md, "https://cdn.makenot.work", "user-123");
127 + assert_eq!(result, md);
128 + }
129 +
130 + #[test]
131 + fn absolute_path_unchanged() {
132 + let md = "![Doc](/static/images/docs/setup.png)";
133 + let result = rewrite_media_paths(md, "https://cdn.makenot.work", "user-123");
134 + assert_eq!(result, md);
135 + }
136 +
137 + #[test]
138 + fn path_traversal_unchanged() {
139 + let md = "![Hack](../../../etc/passwd)";
140 + let result = rewrite_media_paths(md, "https://cdn.makenot.work", "user-123");
141 + assert_eq!(result, md);
142 + }
143 +
144 + #[test]
145 + fn root_folder_file() {
146 + let md = "![Photo](photo.jpg)";
147 + let result = rewrite_media_paths(md, "https://cdn.makenot.work", "user-123");
148 + assert_eq!(
149 + result,
150 + "![Photo](https://cdn.makenot.work/user-123/media/photo.jpg)"
151 + );
152 + }
153 +
154 + #[test]
155 + fn cdn_base_trailing_slash_stripped() {
156 + let md = "![Img](img.png)";
157 + let result = rewrite_media_paths(md, "https://cdn.makenot.work/", "user-123");
158 + assert_eq!(
159 + result,
160 + "![Img](https://cdn.makenot.work/user-123/media/img.png)"
161 + );
162 + }
163 +
164 + #[test]
165 + fn video_extension_to_video_tag() {
166 + let html = r#"<img src="https://cdn.makenot.work/u/media/demo.mp4" alt="Demo">"#;
167 + let result = img_to_video(html);
168 + assert!(result.contains("<video controls"));
169 + assert!(result.contains(r#"src="https://cdn.makenot.work/u/media/demo.mp4""#));
170 + assert!(result.contains("Demo"));
171 + assert!(result.contains("</video>"));
172 + assert!(!result.contains("<img"));
173 + }
174 +
175 + #[test]
176 + fn non_video_image_unchanged() {
177 + let html = r#"<img src="https://cdn.makenot.work/u/media/photo.png" alt="Photo">"#;
178 + let result = img_to_video(html);
179 + assert_eq!(result, html);
180 + }
181 +
182 + #[test]
183 + fn webm_converted() {
184 + let html = r#"<img src="clip.webm">"#;
185 + let result = img_to_video(html);
186 + assert!(result.contains("<video controls"));
187 + assert!(result.contains("</video>"));
188 + }
189 +
190 + #[test]
191 + fn mov_converted() {
192 + let html = r#"<img src="clip.mov" alt="Clip">"#;
193 + let result = img_to_video(html);
194 + assert!(result.contains("<video controls"));
195 + assert!(result.contains("Clip"));
196 + }
197 +
198 + #[test]
199 + fn mixed_content() {
200 + let md = "Text before\n\n![Img](folder/img.png)\n\nMore text\n\n![Vid](folder/vid.mp4)\n\n![External](https://example.com/pic.jpg)";
201 + let rewritten = rewrite_media_paths(md, "https://cdn.makenot.work", "u1");
202 + assert!(rewritten.contains("https://cdn.makenot.work/u1/media/folder/img.png"));
203 + assert!(rewritten.contains("https://cdn.makenot.work/u1/media/folder/vid.mp4"));
204 + assert!(rewritten.contains("https://example.com/pic.jpg"));
205 + }
206 +
207 + #[test]
208 + fn empty_alt_text() {
209 + let md = "![](photo.jpg)";
210 + let result = rewrite_media_paths(md, "https://cdn.makenot.work", "u1");
211 + assert_eq!(
212 + result,
213 + "![](https://cdn.makenot.work/u1/media/photo.jpg)"
214 + );
215 + }
216 +
217 + #[test]
218 + fn video_tag_no_alt() {
219 + let html = r#"<img src="demo.mp4">"#;
220 + let result = img_to_video(html);
221 + assert!(result.contains("Your browser does not support video."));
222 + }
223 +
224 + #[test]
225 + fn multiple_images_in_html() {
226 + let html = r#"<img src="a.png" alt="A"><img src="b.mp4" alt="B"><img src="c.webm">"#;
227 + let result = img_to_video(html);
228 + // a.png stays as img
229 + assert!(result.contains(r#"<img src="a.png""#));
230 + // b.mp4 becomes video
231 + assert!(result.contains(r#"<video controls src="b.mp4">B</video>"#));
232 + // c.webm becomes video
233 + assert!(result.contains(r#"<video controls src="c.webm">"#));
234 + }
235 + }
M src/quotes.rs +2 -9
@@ -1,5 +1,7 @@
1 1 use std::collections::HashMap;
2 2
3 + use crate::escape::html_escape;
4 +
3 5 /// Quote author info for attribution rendering.
4 6 pub struct QuoteAuthor {
5 7 pub username: String,
@@ -7,15 +9,6 @@ pub struct QuoteAuthor {
7 9 pub is_removed: bool,
8 10 }
9 11
10 - /// HTML-escape a string for safe interpolation into raw HTML.
11 - fn html_escape(s: &str) -> String {
12 - s.replace('&', "&amp;")
13 - .replace('<', "&lt;")
14 - .replace('>', "&gt;")
15 - .replace('"', "&quot;")
16 - .replace('\'', "&#x27;")
17 - }
18 -
19 12 /// Post-process rendered HTML to replace `[quote:POST_ID:HASH]` markers with
20 13 /// clickable author attribution.
21 14 pub fn post_process_quotes(
M src/sanitize.rs +2 -1
@@ -3,7 +3,8 @@
3 3 pub enum SanitizePreset {
4 4 /// Default ammonia settings. Allows most safe HTML.
5 5 Permissive,
6 - /// Default ammonia settings. Same as Permissive.
6 + /// Default ammonia settings (same sanitization as Permissive; the difference
7 + /// is at the Renderer level — Standard strips images, Permissive doesn't).
7 8 Standard,
8 9 /// Adds `rel="noopener noreferrer nofollow"` to all links.
9 10 Strict,
M src/toc.rs +3 -14
@@ -1,5 +1,7 @@
1 1 use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
2 2
3 + use crate::escape::html_escape;
4 +
3 5 /// A single entry in a table of contents.
4 6 #[derive(Debug, Clone, PartialEq, Eq)]
5 7 pub struct TocEntry {
@@ -57,7 +59,7 @@ pub fn render_toc_html(entries: &[TocEntry]) -> String {
57 59 html.push_str(&format!(
58 60 "<li class=\"toc-h{}\"><a href=\"#{}\">{}</a></li>\n",
59 61 entry.level,
60 - html_escape_attr(&entry.anchor),
62 + html_escape(&entry.anchor),
61 63 html_escape(&entry.text),
62 64 ));
63 65 }
@@ -75,19 +77,6 @@ fn make_anchor(text: &str) -> String {
75 77 .collect()
76 78 }
77 79
78 - fn html_escape(s: &str) -> String {
79 - s.replace('&', "&amp;")
80 - .replace('<', "&lt;")
81 - .replace('>', "&gt;")
82 - }
83 -
84 - fn html_escape_attr(s: &str) -> String {
85 - s.replace('&', "&amp;")
86 - .replace('"', "&quot;")
87 - .replace('<', "&lt;")
88 - .replace('>', "&gt;")
89 - }
90 -
91 80 #[cfg(test)]
92 81 mod tests {
93 82 use super::*;