Skip to main content

afm_markdown/
post_process.rs

1//! HTML post-processing: splice Aozora sentinels into rendered comrak HTML.
2//!
3//! The afm pipeline runs comrak verbatim against the lexer's normalized
4//! text. Comrak emits ordinary `<p>...</p>` paragraphs for the lines
5//! the lexer planted with PUA sentinels (U+E001..U+E004 are not in
6//! CommonMark's HTML escape set, so they survive `format_html` verbatim).
7//! This module rewrites that HTML so each sentinel becomes its real
8//! Aozora HTML, while plain comrak output passes through unchanged.
9//!
10//! ## Sentinel taxonomy
11//!
12//! | Sentinel               | Source shape       | comrak emits            | We rewrite to                                    |
13//! |------------------------|--------------------|-------------------------|---------------------------------------------------|
14//! | `INLINE` (U+E001)      | inline `|...《》` | text inside a paragraph | `aozora_render::render_node::render` of the node |
15//! | `BLOCK_LEAF` (U+E002)  | leaf annotation    | `<p>U+E002</p>`         | `render_node` output (no surrounding `<p>`)      |
16//! | `BLOCK_OPEN` (U+E003)  | container start    | `<p>U+E003</p>`         | `render_node` open-pass output                   |
17//! | `BLOCK_CLOSE` (U+E004) | container end      | `<p>U+E004</p>`         | `render_node` close-pass output                  |
18//!
19//! ## Paragraph-aware splice
20//!
21//! Two cases beyond the sentinel-substitution above are handled per
22//! paragraph:
23//!
24//! - **Heading promotion** — a paragraph carrying a `HeadingHint`
25//!   inline sentinel (`[#「X」は大見出し]`) becomes
26//!   `<h{level}>{target}</h{level}>`. Other Aozora sentinels in the
27//!   same paragraph are consumed for registry lockstep but their HTML
28//!   is dropped, since the heading body is the hint's `target` field.
29//! - **Stack-balanced container close** — a `BlockClose` paragraph
30//!   without a matching open is silently discarded so we don't emit
31//!   orphan `</div>` tags. This protects the Tier-D tag-balance
32//!   invariant against pathological inputs.
33//!
34//! ## Order-based dispatch
35//!
36//! `aozora_pipeline` writes sentinels into `normalized` in source order,
37//! and the registry tables are sorted by byte position by
38//! construction. comrak preserves text order across `<p>...</p>`
39//! boundaries, so the order we encounter sentinels in the rendered
40//! HTML matches the order of the corresponding registry entries.
41//! We therefore pre-flatten the registry into an ordered
42//! `Vec<NodeRef<'_>>` keyed by source position and dispatch
43//! sequentially. No byte-position lookup is needed at HTML-rewrite
44//! time.
45
46use core::fmt;
47
48use aozora_pipeline::{BorrowedLexOutput, INLINE_SENTINEL};
49use aozora_render::render_node;
50use aozora_syntax::borrowed::{AozoraNode, HeadingHint, NodeRef};
51use aozora_syntax::{Container, ContainerKind};
52
53use crate::sentinels::{
54    BlockSentinelKind, SentinelCursor, flatten_registry_in_source_order, is_sentinel_char,
55    sole_block_sentinel,
56};
57
58/// Splice every Aozora sentinel in `comrak_html` into its real HTML
59/// rendering, using the registry inside `lex_out`.
60#[must_use]
61pub(crate) fn splice_aozora_html(comrak_html: &str, lex_out: &BorrowedLexOutput<'_>) -> String {
62    let nodes = flatten_registry_in_source_order(lex_out);
63    let mut state = SpliceState {
64        cursor: SentinelCursor::new(nodes.as_slice()),
65        container_stack: Vec::new(),
66    };
67
68    let mut out = String::with_capacity(comrak_html.len());
69    splice_into(comrak_html, &mut state, &mut out);
70    // Close any container that was opened but never closed in the
71    // source. Without this, malformed inputs produce an HTML tree
72    // with orphan `<div>` tags and Tier-D (tag balance) breaks.
73    while let Some(kind) = state.container_stack.pop() {
74        render_node_into(AozoraNode::Container(Container { kind }), false, &mut out);
75    }
76    // Brand boundary: the upstream `aozora-render` crate emits
77    // `aozora-*` CSS classes (its own brand for pure 青空文庫記法
78    // output). afm-markdown is a different surface — Aozora Flavored
79    // Markdown — and its output uses the `afm-*` brand. Rewrite every
80    // `aozora-*` class token to its `afm-*` counterpart before emit.
81    let rebranded = rebrand_aozora_classes_to_afm(&out);
82    // Defensive Tier-A guard: every `[#…]` that the upstream lexer
83    // failed to claim (e.g. an empty annotation `[#]` nested inside
84    // a baseless ruby pair `《》`, which the aozora-pipeline Phase 3
85    // replay path drops on the floor) gets wrapped in an
86    // `afm-annotation` hidden span here so the canary can't leak.
87    // No-op on the happy path because clean inputs leave no bare
88    // `[#` in the spliced HTML.
89    let bracket_safe = wrap_orphan_brackets_in_place(&rebranded);
90    // Defensive Tier-D guard: aozora's `[#…]` annotation claim can
91    // split a CommonMark emphasis run (e.g. `____` continued past the
92    // annotation), leaving `<strong>` opens unmatched at `</p>` time.
93    // We scan each `<p>...</p>` and prepend the missing inline closes
94    // before `</p>` so HTML tag balance survives even on those inputs.
95    balance_inline_tags_in_paragraphs(&bracket_safe)
96}
97
/// Per-paragraph inline-tag balancer.
///
/// Walks each paragraph (from `<p>` / `<p …>` up to the next `</p>`)
/// once, tracks the emphasis-family inline tags that are still open at
/// the end of the paragraph, and inserts the missing closes right
/// before `</p>`. Text outside paragraphs is copied verbatim; if a
/// paragraph open has no `</p>` after it, the remainder is copied
/// verbatim as well.
///
/// The missing closes are emitted innermost-first (reverse order of
/// the still-open tags), so `<strong><em>x` is completed as
/// `…</em></strong>` rather than in a fixed table order that would
/// mis-nest them.
///
/// Per tag, the number of closes inserted is `max(0, opens - closes)`
/// counted over the paragraph: a stray close that appears before an
/// open still offsets that open, so the open/close tag counts of the
/// output are balanced whenever opens outnumber closes.
///
/// Inline-tag-name list is intentionally narrow (`strong` / `em` /
/// `code` / `del` / `s` / `sup` / `sub`). `span`, `ruby`, `a`, etc.
/// stay out of this pass to avoid double-closing.
fn balance_inline_tags_in_paragraphs(html: &str) -> String {
    /// `(open_exact, open_with_attr, close)` for each inline tag we
    /// rebalance. No pattern is a prefix of another, so at most one
    /// entry can match at any given `<`.
    const INLINE_TAGS: &[(&str, &str, &str)] = &[
        ("<strong>", "<strong ", "</strong>"),
        ("<em>", "<em ", "</em>"),
        ("<code>", "<code ", "</code>"),
        ("<del>", "<del ", "</del>"),
        ("<s>", "<s ", "</s>"),
        ("<sup>", "<sup ", "</sup>"),
        ("<sub>", "<sub ", "</sub>"),
    ];
    const P_CLOSE: &str = "</p>";

    /// Byte offset of the earliest paragraph open tag in `s`, whether
    /// bare (`<p>`) or attribute-bearing (`<p …>`).
    fn next_paragraph_open(s: &str) -> Option<usize> {
        match (s.find("<p>"), s.find("<p ")) {
            (Some(bare), Some(with_attr)) => Some(bare.min(with_attr)),
            (bare, with_attr) => bare.or(with_attr),
        }
    }

    let mut out = String::with_capacity(html.len());
    let mut rest = html;
    // Indices into `INLINE_TAGS` of the tags currently open, outermost
    // first. Reused across paragraphs to avoid reallocating.
    let mut unclosed: Vec<usize> = Vec::new();
    // Per tag: closes seen with no open before them. Each one cancels
    // a later open of the same tag (count-based balance).
    let mut stray_closes = [0_usize; INLINE_TAGS.len()];

    while let Some(p_start) = next_paragraph_open(rest) {
        let Some(p_end_rel) = rest[p_start..].find(P_CLOSE) else {
            break;
        };
        let p_end = p_start + p_end_rel;

        // Everything up to (not including) `</p>` is emitted unchanged.
        out.push_str(&rest[..p_end]);

        unclosed.clear();
        stray_closes.fill(0);
        let body = &rest[p_start..p_end];
        for (lt, _) in body.match_indices('<') {
            let tail = &body[lt..];
            // `Some((tag, true))` for an open tag, `Some((tag, false))`
            // for a close tag, `None` for anything we do not balance.
            let hit = INLINE_TAGS
                .iter()
                .enumerate()
                .find_map(|(tag, (open_exact, open_attr, close))| {
                    if tail.starts_with(*open_exact) || tail.starts_with(*open_attr) {
                        Some((tag, true))
                    } else if tail.starts_with(*close) {
                        Some((tag, false))
                    } else {
                        None
                    }
                });
            match hit {
                Some((tag, true)) => {
                    if stray_closes[tag] > 0 {
                        stray_closes[tag] -= 1;
                    } else {
                        unclosed.push(tag);
                    }
                }
                Some((tag, false)) => match unclosed.iter().rposition(|&open| open == tag) {
                    // Pair the close with the nearest open of its kind.
                    Some(at) => {
                        unclosed.remove(at);
                    }
                    None => stray_closes[tag] += 1,
                },
                None => {}
            }
        }

        // Close what is still open, innermost first.
        for &tag in unclosed.iter().rev() {
            out.push_str(INLINE_TAGS[tag].2);
        }

        out.push_str(P_CLOSE);
        rest = &rest[p_end + P_CLOSE.len()..];
    }

    out.push_str(rest);
    out
}
154
/// Swap the `aozora-` prefix for `afm-` on every token inside a
/// `class="…"` attribute value.
///
/// Only text between `class="` and the next `"` is inspected, so the
/// brand on other attributes or in text bodies is left alone. Tokens
/// are split on ASCII whitespace and re-joined with single spaces. An
/// unterminated `class="` causes the remainder to be copied verbatim.
fn rebrand_aozora_classes_to_afm(html: &str) -> String {
    const ATTR_OPEN: &str = "class=\"";
    const OLD_BRAND: &str = "aozora-";
    const NEW_BRAND: &str = "afm-";

    // Fast path: nothing to rebrand anywhere in the document.
    if !html.contains(OLD_BRAND) {
        return html.to_owned();
    }

    let mut rebranded = String::with_capacity(html.len());
    let mut remaining = html;
    while let Some((before, after)) = remaining.split_once(ATTR_OPEN) {
        rebranded.push_str(before);
        rebranded.push_str(ATTR_OPEN);

        let Some((value, tail)) = after.split_once('"') else {
            // No closing quote: give up and emit the rest untouched.
            rebranded.push_str(after);
            return rebranded;
        };

        let mut needs_separator = false;
        for token in value.split_ascii_whitespace() {
            if needs_separator {
                rebranded.push(' ');
            }
            needs_separator = true;
            match token.strip_prefix(OLD_BRAND) {
                Some(suffix) => {
                    rebranded.push_str(NEW_BRAND);
                    rebranded.push_str(suffix);
                }
                None => rebranded.push_str(token),
            }
        }
        rebranded.push('"');
        remaining = tail;
    }
    rebranded.push_str(remaining);
    rebranded
}
191
192/// Find every `[#…]` in `html` that lives outside an HTML tag and
193/// outside an existing `afm-annotation` wrapper, and wrap it in a
194/// hidden `<span class="afm-annotation" hidden>…</span>`. The class
195/// name matches `aozora-render`'s annotation wrapper so
196/// `test_support::strip_annotation_wrappers` continues to recognise
197/// it, and the pass is idempotent: a second invocation finds the
198/// `afm-annotation` substring in the prefix and skips re-wrapping.
199fn wrap_orphan_brackets_in_place(html: &str) -> String {
200    let needle = "[#";
201    let close = ']';
202    let wrapper_class = "afm-annotation";
203    let wrapper_open = "<span class=\"afm-annotation\" hidden>";
204    let wrapper_close = "</span>";
205
206    if !html.contains(needle) {
207        return html.to_owned();
208    }
209
210    let mut out = String::with_capacity(html.len());
211    let mut cursor = 0;
212    while let Some(rel) = html[cursor..].find(needle) {
213        let abs = cursor + rel;
214        // Decide skip vs wrap by inspecting the *already-emitted* prefix
215        // (`out` + literal bytes from `cursor..abs`). This avoids the
216        // false-skip you'd get from looking back into `html` after we've
217        // started rewriting it.
218        let mut prefix = String::with_capacity(out.len() + (abs - cursor));
219        prefix.push_str(&out);
220        prefix.push_str(&html[cursor..abs]);
221        let last_open_tag = prefix.rfind('<').unwrap_or(0);
222        let last_close_tag = prefix.rfind('>').unwrap_or(0);
223        let inside_tag = last_open_tag > last_close_tag && !prefix.is_empty();
224        // `already_wrapped` checks only the *current* unfinished span:
225        // if a previous wrapper has already closed (`</span>` after the
226        // last `wrapper_class` mention), we are no longer inside it.
227        let last_wrapper_class = prefix.rfind(wrapper_class);
228        let last_wrapper_close = prefix.rfind(wrapper_close);
229        let already_wrapped = match (last_wrapper_class, last_wrapper_close) {
230            (Some(c), Some(z)) => c > z,
231            (Some(_), None) => true,
232            _ => false,
233        };
234        if inside_tag || already_wrapped {
235            out.push_str(&html[cursor..abs + needle.len()]);
236            cursor = abs + needle.len();
237            continue;
238        }
239        // Find a matching `]` after the marker. If none, wrap up to
240        // the next `<` (start of next tag) or EOF — never leave a bare
241        // bracket behind.
242        let after_open = abs + needle.len();
243        let bracket_run_end = html[after_open..]
244            .find(close)
245            .map(|r| after_open + r + close.len_utf8())
246            .or_else(|| html[after_open..].find('<').map(|r| after_open + r))
247            .unwrap_or(html.len());
248        out.push_str(&html[cursor..abs]);
249        out.push_str(wrapper_open);
250        push_html_escaped(&mut out, &html[abs..bracket_run_end]);
251        out.push_str(wrapper_close);
252        cursor = bracket_run_end;
253    }
254    out.push_str(&html[cursor..]);
255    out
256}
257
258struct SpliceState<'a, 'src> {
259    cursor: SentinelCursor<'a, 'src>,
260    /// `ContainerKind` of every still-open paired container, in LIFO
261    /// order. Push on `BlockOpen`, pop on `BlockClose`. Tracking the
262    /// kind (rather than just a depth counter) lets us synthesise a
263    /// matching close node when the source ends without one.
264    container_stack: Vec<ContainerKind>,
265}
266
267impl<'src> SpliceState<'_, 'src> {
268    fn peek(&self, offset: usize) -> Option<NodeRef<'src>> {
269        self.cursor.peek(offset)
270    }
271    fn next(&mut self) -> Option<NodeRef<'src>> {
272        self.cursor.next()
273    }
274    fn advance(&mut self, n: usize) {
275        self.cursor.advance(n);
276    }
277}
278
279fn splice_into(html: &str, state: &mut SpliceState<'_, '_>, out: &mut String) {
280    let mut cursor = 0;
281    let len = html.len();
282    while cursor < len {
283        // Process every `<p>...</p>` as a unit so we can handle
284        // single-block-sentinel paragraphs and heading-hint
285        // promotions structurally. Any inline sentinels living in
286        // *other* block contexts (`<h1>`, `<li>`, `<blockquote>`,
287        // table cells) flow through `splice_inline_pass`, which
288        // substitutes them in place without touching the surrounding
289        // tags.
290        let Some(p_open_rel) = html[cursor..].find("<p>") else {
291            // No more `<p>` anchors. The remainder may still contain
292            // inline sentinels embedded in headings / list items /
293            // tables, so finish with one inline pass.
294            splice_inline_pass(&html[cursor..], state, out);
295            break;
296        };
297        let p_open_abs = cursor + p_open_rel;
298
299        // Region between the cursor and the next `<p>` may carry
300        // inline sentinels (e.g. inside an `<h1>` body). Run an
301        // inline pass instead of a verbatim copy.
302        if p_open_abs > cursor {
303            splice_inline_pass(&html[cursor..p_open_abs], state, out);
304        }
305
306        let after_open = p_open_abs + 3;
307        let Some(close_rel) = html[after_open..].find("</p>") else {
308            // Malformed markup; treat the rest as inline and bail.
309            splice_inline_pass(&html[p_open_abs..], state, out);
310            break;
311        };
312        let p_close_abs = after_open + close_rel;
313        let inner = &html[after_open..p_close_abs];
314        let after_close = p_close_abs + 4; // skip "</p>"
315
316        process_paragraph(inner, state, out);
317        cursor = after_close;
318    }
319}
320
321fn process_paragraph(inner: &str, state: &mut SpliceState<'_, '_>, out: &mut String) {
322    // Case 1: a paragraph whose body is exactly one block-sentinel
323    // character. comrak isolates these because lex pads them with
324    // `\n\n` (Phase 4). We replace the whole `<p>...</p>` with
325    // standalone block / container HTML.
326    if let Some(kind) = sole_block_sentinel(inner) {
327        let Some(node_ref) = state.next() else {
328            return;
329        };
330        match (kind, node_ref) {
331            (BlockSentinelKind::Leaf, NodeRef::BlockLeaf(node)) => {
332                render_node_into(node, true, out);
333            }
334            (BlockSentinelKind::Open, NodeRef::BlockOpen(ck)) => {
335                state.container_stack.push(ck);
336                render_node_into(AozoraNode::Container(Container { kind: ck }), true, out);
337            }
338            (BlockSentinelKind::Close, NodeRef::BlockClose(ck))
339                if state.container_stack.pop().is_some() =>
340            {
341                // Matched open: emit the close tag.
342                render_node_into(AozoraNode::Container(Container { kind: ck }), false, out);
343            }
344            _ => {
345                // Registry/HTML drift; drop the entry.
346            }
347        }
348        return;
349    }
350
351    // Case 2: paragraph carries a `HeadingHint` inline sentinel —
352    // promote the host paragraph to `<h{level}>...</h{level}>` and
353    // discard the rest of the paragraph's sentinel HTML (the heading
354    // body is the hint's `target`, not the surrounding text).
355    if let Some(hint) = heading_hint_in_paragraph(inner, state) {
356        consume_inline_sentinels(inner, state);
357        let level = hint.level.clamp(1, 6);
358        write!(out, "<h{level}>").expect("writing to a String never fails");
359        push_html_escaped(out, &hint.target);
360        write!(out, "</h{level}>").expect("writing to a String never fails");
361        out.push('\n');
362        return;
363    }
364
365    // Case 3: ordinary paragraph — re-emit the wrapper and substitute
366    // any inline sentinels in place.
367    out.push_str("<p>");
368    splice_inline_pass(inner, state, out);
369    out.push_str("</p>");
370}
371
372/// Peek the inline sentinels in this paragraph against the registry.
373/// If the first inline sentinel is a `HeadingHint`, return it.
374fn heading_hint_in_paragraph<'src>(
375    inner: &str,
376    state: &SpliceState<'_, 'src>,
377) -> Option<&'src HeadingHint<'src>> {
378    let mut peek_offset = 0;
379    for ch in inner.chars() {
380        if !is_sentinel_char(ch) {
381            continue;
382        }
383        let node = state.peek(peek_offset)?;
384        peek_offset += 1;
385        if let NodeRef::Inline(AozoraNode::HeadingHint(h)) = node {
386            return Some(h);
387        }
388    }
389    None
390}
391
392/// Consume every inline-sentinel registry entry that the paragraph
393/// covers. Used after a heading-hint rewrite to keep the dispatcher
394/// in lockstep without emitting any of the in-paragraph nodes.
395fn consume_inline_sentinels(inner: &str, state: &mut SpliceState<'_, '_>) {
396    let count = inner.chars().filter(|&c| is_sentinel_char(c)).count();
397    state.advance(count);
398}
399
400fn splice_inline_pass(slice: &str, state: &mut SpliceState<'_, '_>, out: &mut String) {
401    let mut cursor = 0;
402    for (idx, ch) in slice.char_indices() {
403        if !is_sentinel_char(ch) {
404            continue;
405        }
406        out.push_str(&slice[cursor..idx]);
407        cursor = idx + ch.len_utf8();
408        let Some(node_ref) = state.next() else {
409            continue;
410        };
411        if ch == INLINE_SENTINEL {
412            if let NodeRef::Inline(node) = node_ref {
413                render_node_into(node, true, out);
414            }
415            // Mismatch (block payload at an inline position) → drop.
416        } else {
417            // Block sentinel inside an inline pass (e.g. inside a
418            // fenced code block, where comrak emits the sentinel as
419            // raw text). Drop the registry entry; emit nothing.
420        }
421    }
422    out.push_str(&slice[cursor..]);
423}
424
425fn render_node_into(node: AozoraNode<'_>, entering: bool, out: &mut String) {
426    render_node::render(node, entering, &mut StringSink(out))
427        .expect("writing AozoraNode HTML to a String cannot fail");
428}
429
/// Append `s` to `out`, replacing the five HTML-special characters
/// `&`, `<`, `>`, `"` and `'` with `&amp;`, `&lt;`, `&gt;`, `&quot;`
/// and `&#39;`. Every other character is copied unchanged.
fn push_html_escaped(out: &mut String, s: &str) {
    // Copy maximal runs of ordinary text in one go; only the special
    // characters interrupt a run.
    let mut run_start = 0;
    for (at, ch) in s.char_indices() {
        let entity = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' => "&quot;",
            '\'' => "&#39;",
            _ => continue,
        };
        out.push_str(&s[run_start..at]);
        out.push_str(entity);
        run_start = at + ch.len_utf8();
    }
    out.push_str(&s[run_start..]);
}
442
/// Thin `fmt::Write` adapter that appends to a borrowed `String`.
struct StringSink<'s>(&'s mut String);

impl fmt::Write for StringSink<'_> {
    /// Appends `s` to the wrapped string; never reports an error.
    fn write_str(&mut self, s: &str) -> fmt::Result {
        self.0.push_str(s);
        Ok(())
    }
}
451
452// `write!` macro brings `fmt::Write` into scope.
453use core::fmt::Write as _;
454
#[cfg(test)]
mod tests {
    use super::*;
    use aozora_pipeline::BLOCK_LEAF_SENTINEL;
    use aozora_syntax::borrowed::Arena;

    /// Full pipeline: lex `input`, render the normalized text with
    /// comrak's default options, then splice.
    fn render(input: &str) -> String {
        let arena = Arena::new();
        let lex_out = aozora_pipeline::lex_into_arena(input, &arena);
        let options = comrak::Options::default();
        let comrak_arena = comrak::Arena::new();
        let document = comrak::parse_document(&comrak_arena, lex_out.normalized, &options);
        let mut rendered = String::new();
        comrak::format_html(document, &options, &mut rendered).unwrap();
        splice_aozora_html(&rendered, &lex_out)
    }

    /// Lex `source` only for its registry, then splice the hand-written
    /// `html` against it (bypassing comrak).
    fn splice_raw(source: &str, html: &str) -> String {
        let arena = Arena::new();
        let lex_out = aozora_pipeline::lex_into_arena(source, &arena);
        splice_aozora_html(html, &lex_out)
    }

    #[test]
    fn plain_text_passes_through() {
        let html = render("hello");
        assert!(html.contains("hello"));
    }

    #[test]
    fn ruby_inline_sentinel_is_replaced() {
        let html = render("|青梅《おうめ》");
        assert!(html.contains("<ruby>"), "html: {html}");
        assert!(html.contains("青梅"));
        assert!(html.contains("おうめ"));
        assert!(!html.contains(INLINE_SENTINEL));
    }

    #[test]
    fn page_break_block_leaf_replaces_paragraph() {
        let html = render("前\n\n[#改ページ]\n\n後");
        assert!(!html.contains(BLOCK_LEAF_SENTINEL));
        assert!(!html.contains("<p>\u{E002}</p>"));
    }

    #[test]
    fn heading_hint_promotes_paragraph_to_heading() {
        let html = render("第一篇[#「第一篇」は大見出し]");
        assert!(
            html.contains("<h1>第一篇</h1>"),
            "expected <h1>第一篇</h1>, got {html}"
        );
    }

    #[test]
    fn orphan_close_does_not_emit_div() {
        let html = render("[#ここで字下げ終わり]");
        assert_eq!(
            html.matches("<div").count(),
            html.matches("</div>").count(),
            "tag-balance broken: {html}"
        );
    }

    #[test]
    fn malformed_unclosed_paragraph_does_not_panic() {
        // Exercises the `</p>`-not-found fallback in `splice_into` with
        // a payload comrak would never emit: an unclosed `<p>` tag.
        let out = splice_raw("hello", "<p>unclosed paragraph");
        assert!(out.contains("unclosed paragraph"), "got: {out}");
    }

    #[test]
    fn block_sentinel_paragraph_with_exhausted_registry_does_not_panic() {
        // Exercises the early return in `process_paragraph` when the
        // registry has nothing left: the lexed source is plain text,
        // yet the HTML pretends to hold a block-leaf sentinel. The
        // paragraph must vanish without a panic.
        let payload = format!("<p>{BLOCK_LEAF_SENTINEL}</p>\n");
        let out = splice_raw("plain", &payload);
        assert!(
            !out.contains(BLOCK_LEAF_SENTINEL),
            "sentinel survived: {out}"
        );
    }

    #[test]
    fn block_sentinel_inside_inline_pass_drops_silently() {
        // Exercises the block-sentinel branch of `splice_inline_pass`
        // via fenced-code-block content. Whether the marker is dropped
        // or ends up as markup inside `<pre>` is not pinned here; what
        // matters is no panic and no raw sentinel character.
        let html = render("```\n[#改ページ]\n```");
        assert!(
            !html.contains(BLOCK_LEAF_SENTINEL),
            "sentinel leaked: {html}"
        );
    }

    #[test]
    fn heading_hint_target_html_special_chars_are_escaped() {
        // A heading hint whose target is made of the five characters
        // `push_html_escaped` rewrites, so each escape must show up.
        let html = render("<&\"'><&\"'>[#「<&\"'>」は大見出し]");
        assert!(html.contains("&lt;"), "missing < escape: {html}");
        assert!(html.contains("&gt;"), "missing > escape: {html}");
        assert!(html.contains("&amp;"), "missing & escape: {html}");
        assert!(html.contains("&quot;"), "missing \" escape: {html}");
        assert!(html.contains("&#39;"), "missing ' escape: {html}");
    }
}
573}