// afm_markdown/code_block_mask.rs
1//! Pre-/post-process pass that hides 青空文庫 trigger characters
2//! inside CommonMark fenced code blocks.
3//!
4//! ## Why this exists
5//!
6//! `aozora_pipeline` recognises every `|` / `《` / `》` / `[` / `]` / `※` /
7//! `〔` / `〕` / `「` / `」` as a candidate trigger and rewrites it
8//! into a PUA sentinel before comrak ever sees the source. That is
9//! exactly what we want for prose; it is exactly what we *don't* want
10//! inside a fenced code block, where every byte is supposed to flow
11//! through to `<pre><code>` literally.
12//!
13//! `aozora_pipeline` is intentionally CommonMark-blind (ADR-0010 — the
14//! parser core has no opinion on Markdown), so the responsibility for
15//! teaching it about code-block context lives here. We:
16//!
17//! 1. Scan the source line by line and locate every fenced code block
18//! (CommonMark info-string fence: a run of three or more backticks
19//! or three or more tildes after at most three leading spaces,
20//! closed by a same-character run that is at least as long).
21//! 2. Replace each Aozora trigger character inside a fence with
22//! [`MASK_CHAR`] (U+E000 — Private Use Area, distinct from the
23//! four sentinels U+E001..U+E004) and stash the original char in
24//! insertion order.
25//! 3. After `comrak::format_html`, restore the trigger characters in
26//! the HTML output by walking the originals list in the same
27//! order. `comrak`'s HTML escape only touches `<`, `>`, `&`, `"`;
28//! `MASK_CHAR` flows through untouched.
29//!
30//! ## Why not `\u{E000}` collisions?
31//!
32//! `aozora_pipeline`'s Phase 0 already scans for source-supplied PUA
33//! characters and emits a `Diagnostic::SourceContainsPua` for any
34//! encountered. We pre-scan for `MASK_CHAR` in the *original* source
35//! and skip masking entirely if any is present (returning the source
36//! verbatim and an empty originals list). That preserves the lexer's
37//! diagnostic on the user's pristine input and avoids an
38//! ambiguity-of-origin in the unmask step.
39
40use core::cmp::min;
41
/// Private-use code point used to stand in for an Aozora trigger
/// character that lives inside a fenced code block. Distinct from
/// `aozora_pipeline::INLINE_SENTINEL` (U+E001) and the three block
/// sentinels (U+E002..U+E004), so the masking pass cannot collide
/// with the lexer's own sentinels. Chosen from the PUA so comrak's
/// HTML escaping passes it through verbatim (see module docs).
const MASK_CHAR: char = '\u{E000}';
48
/// Every char `aozora_pipeline` treats as a recogniser trigger. Mirrors
/// the upstream `aozora_pipeline` Phase 1 event tokeniser; if the upstream
/// list grows, this list must follow.
// NOTE(review): this duplication is deliberate (ADR-0010 keeps the
// parser core CommonMark-blind), but it is a coupling point — keep in
// sync with the upstream trigger set.
const AOZORA_TRIGGERS: &[char] = &['|', '《', '》', '[', ']', '※', '〔', '〕', '「', '」'];
53
54/// Mask every Aozora trigger character that appears inside a fenced
55/// code block. Returns the modified source and the ordered list of
56/// original characters that were replaced (for use by [`unmask_html`]).
57///
58/// Returns `(source.to_owned(), Vec::new())` and skips masking if the
59/// source already contains [`MASK_CHAR`] — see the module docs for
60/// the rationale.
61#[must_use]
62pub(crate) fn mask_code_block_triggers(source: &str) -> (String, Vec<char>) {
63 if source.contains(MASK_CHAR) {
64 return (source.to_owned(), Vec::new());
65 }
66
67 let mut out = String::with_capacity(source.len());
68 let mut originals: Vec<char> = Vec::new();
69 let mut state = MaskState::Outside;
70
71 for line in source.split_inclusive('\n') {
72 match state {
73 MaskState::Outside => {
74 out.push_str(line);
75 if let Some(fence) = parse_fence_open(line) {
76 state = MaskState::Inside(fence);
77 }
78 }
79 MaskState::Inside(open) => {
80 if is_fence_close(line, open) {
81 out.push_str(line);
82 state = MaskState::Outside;
83 } else {
84 for ch in line.chars() {
85 if AOZORA_TRIGGERS.contains(&ch) {
86 originals.push(ch);
87 out.push(MASK_CHAR);
88 } else {
89 out.push(ch);
90 }
91 }
92 }
93 }
94 }
95 }
96
97 (out, originals)
98}
99
100/// Reverse the masking. For every [`MASK_CHAR`] in `html`, take the
101/// next entry from `originals` (in source-scan order, which matches
102/// the order they appear in HTML).
103///
104/// If `originals` runs short, remaining `MASK_CHAR`s flow through
105/// unchanged — that is benign because they would be rendered as a
106/// PUA glyph in the browser and never collide with body text.
107#[must_use]
108pub(crate) fn unmask_html(html: &str, originals: &[char]) -> String {
109 if originals.is_empty() {
110 return html.to_owned();
111 }
112 let mut out = String::with_capacity(html.len());
113 let mut idx = 0;
114 for ch in html.chars() {
115 if ch == MASK_CHAR && idx < originals.len() {
116 out.push(originals[idx]);
117 idx += 1;
118 } else {
119 out.push(ch);
120 }
121 }
122 out
123}
124
/// Line-scanner state for `mask_code_block_triggers`: either we are in
/// ordinary prose, or inside a fenced code block whose opening shape
/// we remember so the closing line can be matched against it.
#[derive(Debug, Clone, Copy)]
enum MaskState {
    /// In prose — lines flow through untouched.
    Outside,
    /// Inside a fence opened with the recorded marker/width.
    Inside(FenceOpen),
}
130
/// Shape of an opening code fence, kept so the close line can be
/// matched (same marker char, at least the same run width).
#[derive(Debug, Clone, Copy)]
struct FenceOpen {
    /// Backtick or tilde — the fence character chosen on the open line.
    marker: char,
    /// Number of consecutive marker chars in the opening fence.
    width: usize,
}
138
139/// Recognise the opening of a fenced code block on this line.
140/// CommonMark allows up to 3 leading spaces before the fence run.
141/// Returns the fence shape if `line` is a valid open fence.
142fn parse_fence_open(line: &str) -> Option<FenceOpen> {
143 let stripped = trim_leading_indent(line, 3);
144 let bytes = stripped.as_bytes();
145 if bytes.is_empty() {
146 return None;
147 }
148 let marker = match bytes[0] {
149 b'`' => '`',
150 b'~' => '~',
151 _ => return None,
152 };
153 let width = bytes.iter().take_while(|&&b| b == bytes[0]).count();
154 if width < 3 {
155 return None;
156 }
157 // CommonMark forbids backticks in the info string of a backtick
158 // fence (it would defeat closure detection). We don't need to
159 // honour that to detect *opens* — the close-detector below
160 // re-checks marker char + width independently.
161 Some(FenceOpen { marker, width })
162}
163
164/// Recognise a closing fence: same marker char as `open`, at least
165/// `open.width` repetitions, optional leading indent up to 3 spaces,
166/// nothing but whitespace after the run.
167fn is_fence_close(line: &str, open: FenceOpen) -> bool {
168 let stripped = trim_leading_indent(line, 3);
169 let bytes = stripped.as_bytes();
170 let want = match open.marker {
171 '`' => b'`',
172 '~' => b'~',
173 _ => return false,
174 };
175 let run = bytes.iter().take_while(|&&b| b == want).count();
176 if run < open.width {
177 return false;
178 }
179 bytes[run..]
180 .iter()
181 .all(|&b| matches!(b, b' ' | b'\t' | b'\n' | b'\r'))
182}
183
/// Strip up to `max` leading ASCII spaces from `line`. Tabs are not
/// expanded — CommonMark allows them inside the indent budget but
/// our masking pass is a pre-pass for trigger char masking, not a
/// CommonMark conformance check; tabs flow through untouched and the
/// fence-detector simply fails on lines that lead with a tab. That is
/// a strict subset of valid fences but matches every real-world afm
/// source we have seen.
fn trim_leading_indent(line: &str, max: usize) -> &str {
    let budget = min(line.len(), max);
    let mut eaten = 0;
    while eaten < budget && line.as_bytes()[eaten] == b' ' {
        eaten += 1;
    }
    // Byte slicing is safe: everything consumed was an ASCII space,
    // so `eaten` always lands on a char boundary.
    &line[eaten..]
}
197
// Unit tests: each drives the crate-visible masking API with a literal
// source string — no fixtures, no I/O.
#[cfg(test)]
mod tests {
    use super::*;

    // A source with no fence at all must flow through untouched.
    #[test]
    fn no_code_block_no_mask() {
        let (out, originals) = mask_code_block_triggers("|青梅《おうめ》");
        assert_eq!(out, "|青梅《おうめ》");
        assert!(originals.is_empty());
    }

    #[test]
    fn fenced_code_triggers_get_masked() {
        let src = "before\n```\n|青梅《おうめ》\n```\nafter";
        let (out, originals) = mask_code_block_triggers(src);
        assert!(!out.contains('|'), "trigger leaked: {out:?}");
        assert!(!out.contains('《'), "trigger leaked: {out:?}");
        assert!(!out.contains('》'), "trigger leaked: {out:?}");
        // before / after stay untouched
        assert!(out.starts_with("before\n```\n"));
        assert!(out.ends_with("\n```\nafter"));
        // Originals come back in source-scan order.
        assert_eq!(originals, vec!['|', '《', '》']);
    }

    // Tilde fences are first-class CommonMark fences too.
    #[test]
    fn tilde_fence_works_too() {
        let src = "~~~\n[#改ページ]\n~~~";
        let (out, originals) = mask_code_block_triggers(src);
        assert!(!out.contains('['));
        assert_eq!(originals, vec!['[', ']']);
    }

    #[test]
    fn close_fence_must_match_marker() {
        // Opened with ``` but closed with ~~~ → still inside the
        // fence; everything to EOF stays masked.
        let src = "```\n|inside\n~~~\n|still\n";
        let (_, originals) = mask_code_block_triggers(src);
        assert_eq!(originals, vec!['|', '|']);
    }

    #[test]
    fn close_fence_must_be_at_least_as_wide() {
        // Opened with ````, closed with only ``` → not closed.
        let src = "````\n|inside\n```\n|still\n";
        let (_, originals) = mask_code_block_triggers(src);
        assert_eq!(originals, vec!['|', '|']);
    }

    // Masking is strictly scoped to fence interiors.
    #[test]
    fn outside_text_is_left_alone() {
        let src = "|prose《outside》\n```\n|inside\n```\n|after《tail》";
        let (out, originals) = mask_code_block_triggers(src);
        assert!(out.contains("|prose《outside》"), "out: {out}");
        assert!(out.contains("|after《tail》"), "out: {out}");
        assert_eq!(originals, vec!['|']);
    }

    #[test]
    fn pre_existing_mask_char_disables_masking() {
        // If the source already contains MASK_CHAR, we cannot
        // distinguish a masked trigger from a literal PUA char on the
        // unmask side, so we bail out and leave aozora-pipeline's own
        // PUA-collision diagnostic in charge.
        let src = "\u{E000}\n```\n|trigger\n```".to_owned();
        let (out, originals) = mask_code_block_triggers(&src);
        assert_eq!(out, src);
        assert!(originals.is_empty());
    }

    #[test]
    fn unmask_round_trips_fenced_triggers() {
        let src = "```\n|青梅《おうめ》\n```";
        let (masked, originals) = mask_code_block_triggers(src);
        // Pretend comrak emitted the masked content verbatim inside a
        // <pre><code> block (which is exactly what it does).
        // The slice drops the 4-byte "```\n" prefix and "\n```" suffix,
        // leaving only the masked interior line.
        let pseudo_html = format!(
            "<pre><code>{}\n</code></pre>\n",
            &masked[4..masked.len() - 4]
        );
        let restored = unmask_html(&pseudo_html, &originals);
        assert!(restored.contains('|'), "got: {restored}");
        assert!(restored.contains('《'));
        assert!(restored.contains('》'));
    }

    #[test]
    fn unmask_with_empty_originals_is_a_noop() {
        assert_eq!(unmask_html("hello", &[]), "hello");
    }

    #[test]
    fn unmask_handles_more_mask_chars_than_originals_gracefully() {
        // Edge case: comrak somehow emitted more mask chars than we
        // recorded. The extras flow through verbatim — benign.
        let originals = vec!['|'];
        let masked = format!("{MASK_CHAR}{MASK_CHAR}");
        let restored = unmask_html(&masked, &originals);
        assert_eq!(restored.chars().filter(|&c| c == '|').count(), 1);
        assert_eq!(restored.chars().filter(|&c| c == MASK_CHAR).count(), 1);
    }

    // CommonMark grants fences an indent budget of up to 3 spaces.
    #[test]
    fn indent_up_to_three_spaces_does_not_break_fence_detection() {
        let src = "   ```\n|inside\n   ```\nafter";
        let (_, originals) = mask_code_block_triggers(src);
        assert_eq!(originals, vec!['|']);
    }

    #[test]
    fn indent_of_four_spaces_disables_the_fence() {
        // Four leading spaces: the line is not a fence open per
        // CommonMark (it would be an indented code block instead, but
        // we don't mask indented code blocks). The trigger remains.
        let src = "    ```\n|prose\n    ```";
        let (out, originals) = mask_code_block_triggers(src);
        assert!(out.contains('|'), "out: {out}");
        assert!(originals.is_empty());
    }
}
317}