// afm_markdown/code_block_mask.rs
//! Pre-/post-process pass that hides 青空文庫 trigger characters
//! inside CommonMark fenced code blocks.
//!
//! ## Why this exists
//!
//! `aozora_pipeline` recognises every `|` / `《` / `》` / `[` / `]` / `※` /
//! `〔` / `〕` / `「` / `」` as a candidate trigger and rewrites it
//! into a PUA sentinel before comrak ever sees the source. That is
//! exactly what we want for prose; it is exactly what we *don't* want
//! inside a fenced code block, where every byte is supposed to flow
//! through to `<pre><code>` literally.
//!
//! `aozora_pipeline` is intentionally CommonMark-blind (ADR-0010 — the
//! parser core has no opinion on Markdown), so the responsibility for
//! teaching it about code-block context lives here. We:
//!
//! 1. Scan the source line by line and locate every fenced code block
//!    (CommonMark info-string fence: a run of three or more backticks
//!    or three or more tildes after at most three leading spaces,
//!    closed by a same-character run that is at least as long).
//! 2. Replace each Aozora trigger character inside a fence with
//!    [`MASK_CHAR`] (U+E000 — Private Use Area, distinct from the
//!    four sentinels U+E001..U+E004) and stash the original char in
//!    insertion order.
//! 3. After `comrak::format_html`, restore the trigger characters in
//!    the HTML output by walking the originals list in the same
//!    order. `comrak`'s HTML escape only touches `<`, `>`, `&`, `"`;
//!    `MASK_CHAR` flows through untouched.
//!
//! ## Why `\u{E000}` collisions cannot happen
//!
//! `aozora_pipeline`'s Phase 0 already scans for source-supplied PUA
//! characters and emits a `Diagnostic::SourceContainsPua` for each one
//! encountered. We pre-scan for `MASK_CHAR` in the *original* source
//! and skip masking entirely if any is present (returning the source
//! verbatim and an empty originals list). That preserves the lexer's
//! diagnostic on the user's pristine input and avoids an
//! ambiguity-of-origin problem in the unmask step.
40use core::cmp::min;
41
/// Private-use code point used to stand in for an Aozora trigger
/// character that lives inside a fenced code block. Distinct from
/// `aozora_pipeline::INLINE_SENTINEL` (U+E001) and the three block
/// sentinels (U+E002..U+E004), so the masking pass cannot collide
/// with the lexer's own sentinels. See the module docs for why a
/// source that already contains this char disables masking entirely.
const MASK_CHAR: char = '\u{E000}';

/// Every char `aozora_pipeline` treats as a recogniser trigger. Mirrors
/// the upstream `aozora_pipeline` Phase 1 event tokeniser; if the upstream
/// list grows, this list must follow.
const AOZORA_TRIGGERS: &[char] = &['|', '《', '》', '[', ']', '※', '〔', '〕', '「', '」'];
53
54/// Mask every Aozora trigger character that appears inside a fenced
55/// code block. Returns the modified source and the ordered list of
56/// original characters that were replaced (for use by [`unmask_html`]).
57///
58/// Returns `(source.to_owned(), Vec::new())` and skips masking if the
59/// source already contains [`MASK_CHAR`] — see the module docs for
60/// the rationale.
61#[must_use]
62pub(crate) fn mask_code_block_triggers(source: &str) -> (String, Vec<char>) {
63    if source.contains(MASK_CHAR) {
64        return (source.to_owned(), Vec::new());
65    }
66
67    let mut out = String::with_capacity(source.len());
68    let mut originals: Vec<char> = Vec::new();
69    let mut state = MaskState::Outside;
70
71    for line in source.split_inclusive('\n') {
72        match state {
73            MaskState::Outside => {
74                out.push_str(line);
75                if let Some(fence) = parse_fence_open(line) {
76                    state = MaskState::Inside(fence);
77                }
78            }
79            MaskState::Inside(open) => {
80                if is_fence_close(line, open) {
81                    out.push_str(line);
82                    state = MaskState::Outside;
83                } else {
84                    for ch in line.chars() {
85                        if AOZORA_TRIGGERS.contains(&ch) {
86                            originals.push(ch);
87                            out.push(MASK_CHAR);
88                        } else {
89                            out.push(ch);
90                        }
91                    }
92                }
93            }
94        }
95    }
96
97    (out, originals)
98}
99
100/// Reverse the masking. For every [`MASK_CHAR`] in `html`, take the
101/// next entry from `originals` (in source-scan order, which matches
102/// the order they appear in HTML).
103///
104/// If `originals` runs short, remaining `MASK_CHAR`s flow through
105/// unchanged — that is benign because they would be rendered as a
106/// PUA glyph in the browser and never collide with body text.
107#[must_use]
108pub(crate) fn unmask_html(html: &str, originals: &[char]) -> String {
109    if originals.is_empty() {
110        return html.to_owned();
111    }
112    let mut out = String::with_capacity(html.len());
113    let mut idx = 0;
114    for ch in html.chars() {
115        if ch == MASK_CHAR && idx < originals.len() {
116            out.push(originals[idx]);
117            idx += 1;
118        } else {
119            out.push(ch);
120        }
121    }
122    out
123}
124
/// Line-scanner state for [`mask_code_block_triggers`]: either we are
/// in ordinary prose (`Outside`), or between the fences of an open
/// code block (`Inside`, carrying the opening fence's shape so the
/// close line can be matched against it).
#[derive(Debug, Clone, Copy)]
enum MaskState {
    Outside,
    Inside(FenceOpen),
}
130
/// Identity of an opening code fence: which marker character was
/// chosen and how long the run was. The close-detector needs both.
#[derive(Debug, Clone, Copy)]
struct FenceOpen {
    /// Backtick or tilde — the fence character chosen on the open line.
    marker: char,
    /// Number of consecutive marker chars in the opening fence.
    width: usize,
}

/// Recognise the opening of a fenced code block on this line.
/// CommonMark allows up to 3 leading spaces before the fence run.
/// Returns the fence shape if `line` is a valid open fence.
///
/// We deliberately do not enforce CommonMark's "no backtick in a
/// backtick fence's info string" rule: close detection re-checks the
/// marker char and width independently, so opens need not be strict.
fn parse_fence_open(line: &str) -> Option<FenceOpen> {
    let body = trim_leading_indent(line, 3);
    let marker = match body.as_bytes().first() {
        Some(b'`') => '`',
        Some(b'~') => '~',
        _ => return None,
    };
    // Both markers are single-byte ASCII, so counting chars here
    // agrees with counting bytes.
    let width = body.chars().take_while(|&c| c == marker).count();
    if width >= 3 {
        Some(FenceOpen { marker, width })
    } else {
        None
    }
}

/// Recognise a closing fence: same marker char as `open`, at least
/// `open.width` repetitions, optional leading indent up to 3 spaces,
/// nothing but whitespace after the run.
fn is_fence_close(line: &str, open: FenceOpen) -> bool {
    let body = trim_leading_indent(line, 3);
    let run = body.chars().take_while(|&c| c == open.marker).count();
    // `run` chars == `run` bytes (ASCII marker), so the slice below
    // always lands on a char boundary.
    run >= open.width
        && body[run..]
            .bytes()
            .all(|b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
}

/// Strip up to `max` leading ASCII spaces from `line`. Tabs are not
/// expanded — CommonMark allows them inside the indent budget but
/// our masking pass is a pre-pass for trigger char masking, not a
/// CommonMark conformance check; tabs flow through untouched and the
/// fence-detector simply fails on lines that lead with a tab. That is
/// a strict subset of valid fences but matches every real-world afm
/// source we have seen.
fn trim_leading_indent(line: &str, max: usize) -> &str {
    // Count bytes, not chars: a byte-wise take() can never split a
    // multi-byte character, and spaces are single-byte anyway.
    let budget = min(line.len(), max);
    let consumed = line
        .bytes()
        .take(budget)
        .take_while(|&b| b == b' ')
        .count();
    &line[consumed..]
}
197
#[cfg(test)]
mod tests {
    use super::*;

    // Prose outside any fence must never be masked.
    #[test]
    fn no_code_block_no_mask() {
        let (out, originals) = mask_code_block_triggers("|青梅《おうめ》");
        assert_eq!(out, "|青梅《おうめ》");
        assert!(originals.is_empty());
    }

    #[test]
    fn fenced_code_triggers_get_masked() {
        let src = "before\n```\n|青梅《おうめ》\n```\nafter";
        let (out, originals) = mask_code_block_triggers(src);
        assert!(!out.contains('|'), "trigger leaked: {out:?}");
        assert!(!out.contains('《'), "trigger leaked: {out:?}");
        assert!(!out.contains('》'), "trigger leaked: {out:?}");
        // before / after stay untouched
        assert!(out.starts_with("before\n```\n"));
        assert!(out.ends_with("\n```\nafter"));
        // Originals are recorded in source-scan order.
        assert_eq!(originals, vec!['|', '《', '》']);
    }

    #[test]
    fn tilde_fence_works_too() {
        let src = "~~~\n[#改ページ]\n~~~";
        let (out, originals) = mask_code_block_triggers(src);
        assert!(!out.contains('['));
        assert_eq!(originals, vec!['[', ']']);
    }

    #[test]
    fn close_fence_must_match_marker() {
        // Opened with ``` but closed with ~~~ → still inside the
        // fence; everything to EOF stays masked.
        let src = "```\n|inside\n~~~\n|still\n";
        let (_, originals) = mask_code_block_triggers(src);
        assert_eq!(originals, vec!['|', '|']);
    }

    #[test]
    fn close_fence_must_be_at_least_as_wide() {
        // Opened with ````, closed with only ``` → not closed.
        let src = "````\n|inside\n```\n|still\n";
        let (_, originals) = mask_code_block_triggers(src);
        assert_eq!(originals, vec!['|', '|']);
    }

    #[test]
    fn outside_text_is_left_alone() {
        // Mixed prose and fence: only the fenced '|' is recorded.
        let src = "|prose《outside》\n```\n|inside\n```\n|after《tail》";
        let (out, originals) = mask_code_block_triggers(src);
        assert!(out.contains("|prose《outside》"), "out: {out}");
        assert!(out.contains("|after《tail》"), "out: {out}");
        assert_eq!(originals, vec!['|']);
    }

    #[test]
    fn pre_existing_mask_char_disables_masking() {
        // If the source already contains MASK_CHAR, we cannot
        // distinguish a masked trigger from a literal PUA char on the
        // unmask side, so we bail out and leave aozora-pipeline's own
        // PUA-collision diagnostic in charge.
        let src = "\u{E000}\n```\n|trigger\n```".to_owned();
        let (out, originals) = mask_code_block_triggers(&src);
        assert_eq!(out, src);
        assert!(originals.is_empty());
    }

    #[test]
    fn unmask_round_trips_fenced_triggers() {
        let src = "```\n|青梅《おうめ》\n```";
        let (masked, originals) = mask_code_block_triggers(src);
        // Pretend comrak emitted the masked content verbatim inside a
        // <pre><code> block (which is exactly what it does).
        // NOTE: the 4-byte offsets strip the "```\n" prefix and the
        // "\n```" suffix of this specific fixture.
        let pseudo_html = format!(
            "<pre><code>{}\n</code></pre>\n",
            &masked[4..masked.len() - 4]
        );
        let restored = unmask_html(&pseudo_html, &originals);
        assert!(restored.contains('|'), "got: {restored}");
        assert!(restored.contains('《'));
        assert!(restored.contains('》'));
    }

    #[test]
    fn unmask_with_empty_originals_is_a_noop() {
        assert_eq!(unmask_html("hello", &[]), "hello");
    }

    #[test]
    fn unmask_handles_more_mask_chars_than_originals_gracefully() {
        // Edge case: comrak somehow emitted more mask chars than we
        // recorded. The extras flow through verbatim — benign.
        let originals = vec!['|'];
        let masked = format!("{MASK_CHAR}{MASK_CHAR}");
        let restored = unmask_html(&masked, &originals);
        assert_eq!(restored.chars().filter(|&c| c == '|').count(), 1);
        assert_eq!(restored.chars().filter(|&c| c == MASK_CHAR).count(), 1);
    }

    #[test]
    fn indent_up_to_three_spaces_does_not_break_fence_detection() {
        // CommonMark permits up to three leading spaces on a fence.
        let src = "   ```\n|inside\n   ```\nafter";
        let (_, originals) = mask_code_block_triggers(src);
        assert_eq!(originals, vec!['|']);
    }

    #[test]
    fn indent_of_four_spaces_disables_the_fence() {
        // Four leading spaces: the line is not a fence open per
        // CommonMark (it would be an indented code block instead, but
        // we don't mask indented code blocks). The trigger remains.
        let src = "    ```\n|prose\n    ```";
        let (out, originals) = mask_code_block_triggers(src);
        assert!(out.contains('|'), "out: {out}");
        assert!(originals.is_empty());
    }
}