afm_markdown/post_process.rs
1//! HTML post-processing: splice Aozora sentinels into rendered comrak HTML.
2//!
3//! The afm pipeline runs comrak verbatim against the lexer's normalized
4//! text. Comrak emits ordinary `<p>...</p>` paragraphs for the lines
5//! the lexer planted with PUA sentinels (U+E001..U+E004 are not in
6//! CommonMark's HTML escape set, so they survive `format_html` verbatim).
7//! This module rewrites that HTML so each sentinel becomes its real
8//! Aozora HTML, while plain comrak output passes through unchanged.
9//!
10//! ## Sentinel taxonomy
11//!
12//! | Sentinel | Source shape | comrak emits | We rewrite to |
13//! |------------------------|--------------------|-------------------------|---------------------------------------------------|
14//! | `INLINE` (U+E001) | inline `|...《》` | text inside a paragraph | `aozora_render::render_node::render` of the node |
15//! | `BLOCK_LEAF` (U+E002) | leaf annotation | `<p>U+E002</p>` | `render_node` output (no surrounding `<p>`) |
16//! | `BLOCK_OPEN` (U+E003) | container start | `<p>U+E003</p>` | `render_node` open-pass output |
17//! | `BLOCK_CLOSE` (U+E004) | container end | `<p>U+E004</p>` | `render_node` close-pass output |
18//!
19//! ## Paragraph-aware splice
20//!
21//! Two cases beyond the sentinel-substitution above are handled per
22//! paragraph:
23//!
24//! - **Heading promotion** — a paragraph carrying a `HeadingHint`
25//! inline sentinel (`[#「X」は大見出し]`) becomes
26//! `<h{level}>{target}</h{level}>`. Other Aozora sentinels in the
27//! same paragraph are consumed for registry lockstep but their HTML
28//! is dropped, since the heading body is the hint's `target` field.
29//! - **Stack-balanced container close** — a `BlockClose` paragraph
30//! without a matching open is silently discarded so we don't emit
31//! orphan `</div>` tags. This protects the Tier-D tag-balance
32//! invariant against pathological inputs.
33//!
34//! ## Order-based dispatch
35//!
36//! `aozora_pipeline` writes sentinels into `normalized` in source order,
37//! and the registry tables are sorted by byte position by
38//! construction. comrak preserves text order across `<p>...</p>`
39//! boundaries, so the order we encounter sentinels in the rendered
40//! HTML matches the order of the corresponding registry entries.
41//! We therefore pre-flatten the registry into an ordered
42//! `Vec<NodeRef<'_>>` keyed by source position and dispatch
43//! sequentially. No byte-position lookup is needed at HTML-rewrite
44//! time.
45
46use core::fmt;
47
48use aozora_pipeline::{BorrowedLexOutput, INLINE_SENTINEL};
49use aozora_render::render_node;
50use aozora_syntax::borrowed::{AozoraNode, HeadingHint, NodeRef};
51use aozora_syntax::{Container, ContainerKind};
52
53use crate::sentinels::{
54 BlockSentinelKind, SentinelCursor, flatten_registry_in_source_order, is_sentinel_char,
55 sole_block_sentinel,
56};
57
58/// Splice every Aozora sentinel in `comrak_html` into its real HTML
59/// rendering, using the registry inside `lex_out`.
60#[must_use]
61pub(crate) fn splice_aozora_html(comrak_html: &str, lex_out: &BorrowedLexOutput<'_>) -> String {
62 let nodes = flatten_registry_in_source_order(lex_out);
63 let mut state = SpliceState {
64 cursor: SentinelCursor::new(nodes.as_slice()),
65 container_stack: Vec::new(),
66 };
67
68 let mut out = String::with_capacity(comrak_html.len());
69 splice_into(comrak_html, &mut state, &mut out);
70 // Close any container that was opened but never closed in the
71 // source. Without this, malformed inputs produce an HTML tree
72 // with orphan `<div>` tags and Tier-D (tag balance) breaks.
73 while let Some(kind) = state.container_stack.pop() {
74 render_node_into(AozoraNode::Container(Container { kind }), false, &mut out);
75 }
76 // Brand boundary: the upstream `aozora-render` crate emits
77 // `aozora-*` CSS classes (its own brand for pure 青空文庫記法
78 // output). afm-markdown is a different surface — Aozora Flavored
79 // Markdown — and its output uses the `afm-*` brand. Rewrite every
80 // `aozora-*` class token to its `afm-*` counterpart before emit.
81 let rebranded = rebrand_aozora_classes_to_afm(&out);
82 // Defensive Tier-A guard: every `[#…]` that the upstream lexer
83 // failed to claim (e.g. an empty annotation `[#]` nested inside
84 // a baseless ruby pair `《》`, which the aozora-pipeline Phase 3
85 // replay path drops on the floor) gets wrapped in an
86 // `afm-annotation` hidden span here so the canary can't leak.
87 // No-op on the happy path because clean inputs leave no bare
88 // `[#` in the spliced HTML.
89 let bracket_safe = wrap_orphan_brackets_in_place(&rebranded);
90 // Defensive Tier-D guard: aozora's `[#…]` annotation claim can
91 // split a CommonMark emphasis run (e.g. `____` continued past the
92 // annotation), leaving `<strong>` opens unmatched at `</p>` time.
93 // We scan each `<p>...</p>` and prepend the missing inline closes
94 // before `</p>` so HTML tag balance survives even on those inputs.
95 balance_inline_tags_in_paragraphs(&bracket_safe)
96}
97
/// Per-paragraph inline-tag balancer.
///
/// Walks each `<p>...</p>` (or `<p attr…>...</p>`) substring once, in
/// document order, counts open vs close occurrences for each
/// emphasis-family inline tag, and inserts any missing closes
/// immediately before the paragraph's `</p>`. Text outside paragraphs,
/// and a trailing paragraph with no `</p>`, are copied through
/// unchanged.
///
/// The inline-tag list is intentionally narrow (`strong` / `em` /
/// `code` / `del` / `s` / `sup` / `sub`): these are the CommonMark +
/// GFM emphasis families that aozora's annotation splitter can leave
/// unbalanced. `span`, `ruby`, `a`, etc. are emitted in matched pairs
/// by the renderer and stay out of this pass to avoid double-closing.
///
/// Missing closes are emitted in `INLINE_TAGS` order, not in reverse
/// nesting order: the pass restores per-tag open/close counts only.
fn balance_inline_tags_in_paragraphs(html: &str) -> String {
    /// `(open_exact, open_with_attr, close)` for each inline tag we
    /// rebalance. Static so the iteration allocates nothing.
    const INLINE_TAGS: &[(&str, &str, &str)] = &[
        ("<strong>", "<strong ", "</strong>"),
        ("<em>", "<em ", "</em>"),
        ("<code>", "<code ", "</code>"),
        ("<del>", "<del ", "</del>"),
        ("<s>", "<s ", "</s>"),
        ("<sup>", "<sup ", "</sup>"),
        ("<sub>", "<sub ", "</sub>"),
    ];
    const P_CLOSE: &str = "</p>";

    /// Byte offset of the earliest paragraph open tag in `s`, whether
    /// bare (`<p>`) or attributed (`<p …`). Taking the minimum matters:
    /// preferring the bare form would skip an attributed paragraph
    /// that appears before it.
    fn find_paragraph_open(s: &str) -> Option<usize> {
        match (s.find("<p>"), s.find("<p ")) {
            (Some(bare), Some(attributed)) => Some(bare.min(attributed)),
            (bare, attributed) => bare.or(attributed),
        }
    }

    let mut out = String::with_capacity(html.len());
    let mut rest = html;

    while let Some(p_start) = find_paragraph_open(rest) {
        // An unterminated paragraph cannot be balanced; the tail is
        // copied verbatim after the loop.
        let Some(p_end_rel) = rest[p_start..].find(P_CLOSE) else {
            break;
        };
        let p_end = p_start + p_end_rel;

        // Everything up to (not including) `</p>` passes through as-is.
        out.push_str(&rest[..p_end]);

        let body = &rest[p_start..p_end];
        for &(open_exact, open_attr, close) in INLINE_TAGS {
            let opens = body.matches(open_exact).count() + body.matches(open_attr).count();
            let closes = body.matches(close).count();
            // Empty range when the tag is already balanced (or over-closed).
            for _ in closes..opens {
                out.push_str(close);
            }
        }

        out.push_str(P_CLOSE);
        rest = &rest[p_end + P_CLOSE.len()..];
    }

    out.push_str(rest);
    out
}
154
/// Swap the `aozora-` prefix for `afm-` on every class token found
/// inside a `class="..."` attribute value.
///
/// Only text between `class="` and the next `"` is rewritten; the
/// brand elsewhere (`data-*` attributes, link targets, text bodies) is
/// copied through verbatim. Tokens inside a rewritten attribute are
/// re-joined with single spaces. If a `class="` has no closing quote,
/// the remainder of the input is emitted unchanged.
fn rebrand_aozora_classes_to_afm(html: &str) -> String {
    const ATTR_OPEN: &str = "class=\"";
    const OLD_BRAND: &str = "aozora-";
    const NEW_BRAND: &str = "afm-";

    // Fast path: nothing to rebrand anywhere in the document.
    if !html.contains(OLD_BRAND) {
        return html.to_owned();
    }

    let mut rewritten = String::with_capacity(html.len());
    let mut remaining = html;

    while let Some(hit) = remaining.find(ATTR_OPEN) {
        // `head` runs through the opening quote; `after` starts at the
        // attribute value.
        let (head, after) = remaining.split_at(hit + ATTR_OPEN.len());
        rewritten.push_str(head);

        let Some((value, tail)) = after.split_once('"') else {
            rewritten.push_str(after);
            return rewritten;
        };

        let mut needs_space = false;
        for token in value.split_ascii_whitespace() {
            if needs_space {
                rewritten.push(' ');
            }
            needs_space = true;
            match token.strip_prefix(OLD_BRAND) {
                Some(suffix) => {
                    rewritten.push_str(NEW_BRAND);
                    rewritten.push_str(suffix);
                }
                None => rewritten.push_str(token),
            }
        }
        rewritten.push('"');
        remaining = tail;
    }

    rewritten.push_str(remaining);
    rewritten
}
191
192/// Find every `[#…]` in `html` that lives outside an HTML tag and
193/// outside an existing `afm-annotation` wrapper, and wrap it in a
194/// hidden `<span class="afm-annotation" hidden>…</span>`. The class
195/// name matches `aozora-render`'s annotation wrapper so
196/// `test_support::strip_annotation_wrappers` continues to recognise
197/// it, and the pass is idempotent: a second invocation finds the
198/// `afm-annotation` substring in the prefix and skips re-wrapping.
199fn wrap_orphan_brackets_in_place(html: &str) -> String {
200 let needle = "[#";
201 let close = ']';
202 let wrapper_class = "afm-annotation";
203 let wrapper_open = "<span class=\"afm-annotation\" hidden>";
204 let wrapper_close = "</span>";
205
206 if !html.contains(needle) {
207 return html.to_owned();
208 }
209
210 let mut out = String::with_capacity(html.len());
211 let mut cursor = 0;
212 while let Some(rel) = html[cursor..].find(needle) {
213 let abs = cursor + rel;
214 // Decide skip vs wrap by inspecting the *already-emitted* prefix
215 // (`out` + literal bytes from `cursor..abs`). This avoids the
216 // false-skip you'd get from looking back into `html` after we've
217 // started rewriting it.
218 let mut prefix = String::with_capacity(out.len() + (abs - cursor));
219 prefix.push_str(&out);
220 prefix.push_str(&html[cursor..abs]);
221 let last_open_tag = prefix.rfind('<').unwrap_or(0);
222 let last_close_tag = prefix.rfind('>').unwrap_or(0);
223 let inside_tag = last_open_tag > last_close_tag && !prefix.is_empty();
224 // `already_wrapped` checks only the *current* unfinished span:
225 // if a previous wrapper has already closed (`</span>` after the
226 // last `wrapper_class` mention), we are no longer inside it.
227 let last_wrapper_class = prefix.rfind(wrapper_class);
228 let last_wrapper_close = prefix.rfind(wrapper_close);
229 let already_wrapped = match (last_wrapper_class, last_wrapper_close) {
230 (Some(c), Some(z)) => c > z,
231 (Some(_), None) => true,
232 _ => false,
233 };
234 if inside_tag || already_wrapped {
235 out.push_str(&html[cursor..abs + needle.len()]);
236 cursor = abs + needle.len();
237 continue;
238 }
239 // Find a matching `]` after the marker. If none, wrap up to
240 // the next `<` (start of next tag) or EOF — never leave a bare
241 // bracket behind.
242 let after_open = abs + needle.len();
243 let bracket_run_end = html[after_open..]
244 .find(close)
245 .map(|r| after_open + r + close.len_utf8())
246 .or_else(|| html[after_open..].find('<').map(|r| after_open + r))
247 .unwrap_or(html.len());
248 out.push_str(&html[cursor..abs]);
249 out.push_str(wrapper_open);
250 push_html_escaped(&mut out, &html[abs..bracket_run_end]);
251 out.push_str(wrapper_close);
252 cursor = bracket_run_end;
253 }
254 out.push_str(&html[cursor..]);
255 out
256}
257
/// Mutable state threaded through one splice run: the read position in
/// the flattened registry plus the stack of containers currently open.
struct SpliceState<'a, 'src> {
    /// Cursor over the source-ordered registry nodes; every sentinel
    /// met in the HTML consumes (or peeks at) entries through it.
    cursor: SentinelCursor<'a, 'src>,
    /// `ContainerKind` of every still-open paired container, in LIFO
    /// order. Push on `BlockOpen`, pop on `BlockClose`. Tracking the
    /// kind (rather than just a depth counter) lets us synthesise a
    /// matching close node when the source ends without one.
    container_stack: Vec<ContainerKind>,
}
266
/// Thin forwarding layer so callers talk to the state, not to its
/// cursor field directly.
impl<'src> SpliceState<'_, 'src> {
    /// Forwards to the cursor's `peek`; takes `&self`, so the state is
    /// not modified. `offset` is passed through unchanged —
    /// presumably "entries ahead of the current position"; confirm
    /// against `SentinelCursor::peek`.
    fn peek(&self, offset: usize) -> Option<NodeRef<'src>> {
        self.cursor.peek(offset)
    }
    /// Forwards to the cursor's `next`, yielding a registry entry or
    /// `None` (callers treat `None` as "registry exhausted").
    fn next(&mut self) -> Option<NodeRef<'src>> {
        self.cursor.next()
    }
    /// Forwards to the cursor's `advance` with `n`; used to skip
    /// entries without looking at them.
    fn advance(&mut self, n: usize) {
        self.cursor.advance(n);
    }
}
278
279fn splice_into(html: &str, state: &mut SpliceState<'_, '_>, out: &mut String) {
280 let mut cursor = 0;
281 let len = html.len();
282 while cursor < len {
283 // Process every `<p>...</p>` as a unit so we can handle
284 // single-block-sentinel paragraphs and heading-hint
285 // promotions structurally. Any inline sentinels living in
286 // *other* block contexts (`<h1>`, `<li>`, `<blockquote>`,
287 // table cells) flow through `splice_inline_pass`, which
288 // substitutes them in place without touching the surrounding
289 // tags.
290 let Some(p_open_rel) = html[cursor..].find("<p>") else {
291 // No more `<p>` anchors. The remainder may still contain
292 // inline sentinels embedded in headings / list items /
293 // tables, so finish with one inline pass.
294 splice_inline_pass(&html[cursor..], state, out);
295 break;
296 };
297 let p_open_abs = cursor + p_open_rel;
298
299 // Region between the cursor and the next `<p>` may carry
300 // inline sentinels (e.g. inside an `<h1>` body). Run an
301 // inline pass instead of a verbatim copy.
302 if p_open_abs > cursor {
303 splice_inline_pass(&html[cursor..p_open_abs], state, out);
304 }
305
306 let after_open = p_open_abs + 3;
307 let Some(close_rel) = html[after_open..].find("</p>") else {
308 // Malformed markup; treat the rest as inline and bail.
309 splice_inline_pass(&html[p_open_abs..], state, out);
310 break;
311 };
312 let p_close_abs = after_open + close_rel;
313 let inner = &html[after_open..p_close_abs];
314 let after_close = p_close_abs + 4; // skip "</p>"
315
316 process_paragraph(inner, state, out);
317 cursor = after_close;
318 }
319}
320
321fn process_paragraph(inner: &str, state: &mut SpliceState<'_, '_>, out: &mut String) {
322 // Case 1: a paragraph whose body is exactly one block-sentinel
323 // character. comrak isolates these because lex pads them with
324 // `\n\n` (Phase 4). We replace the whole `<p>...</p>` with
325 // standalone block / container HTML.
326 if let Some(kind) = sole_block_sentinel(inner) {
327 let Some(node_ref) = state.next() else {
328 return;
329 };
330 match (kind, node_ref) {
331 (BlockSentinelKind::Leaf, NodeRef::BlockLeaf(node)) => {
332 render_node_into(node, true, out);
333 }
334 (BlockSentinelKind::Open, NodeRef::BlockOpen(ck)) => {
335 state.container_stack.push(ck);
336 render_node_into(AozoraNode::Container(Container { kind: ck }), true, out);
337 }
338 (BlockSentinelKind::Close, NodeRef::BlockClose(ck))
339 if state.container_stack.pop().is_some() =>
340 {
341 // Matched open: emit the close tag.
342 render_node_into(AozoraNode::Container(Container { kind: ck }), false, out);
343 }
344 _ => {
345 // Registry/HTML drift; drop the entry.
346 }
347 }
348 return;
349 }
350
351 // Case 2: paragraph carries a `HeadingHint` inline sentinel —
352 // promote the host paragraph to `<h{level}>...</h{level}>` and
353 // discard the rest of the paragraph's sentinel HTML (the heading
354 // body is the hint's `target`, not the surrounding text).
355 if let Some(hint) = heading_hint_in_paragraph(inner, state) {
356 consume_inline_sentinels(inner, state);
357 let level = hint.level.clamp(1, 6);
358 write!(out, "<h{level}>").expect("writing to a String never fails");
359 push_html_escaped(out, &hint.target);
360 write!(out, "</h{level}>").expect("writing to a String never fails");
361 out.push('\n');
362 return;
363 }
364
365 // Case 3: ordinary paragraph — re-emit the wrapper and substitute
366 // any inline sentinels in place.
367 out.push_str("<p>");
368 splice_inline_pass(inner, state, out);
369 out.push_str("</p>");
370}
371
372/// Peek the inline sentinels in this paragraph against the registry.
373/// If the first inline sentinel is a `HeadingHint`, return it.
374fn heading_hint_in_paragraph<'src>(
375 inner: &str,
376 state: &SpliceState<'_, 'src>,
377) -> Option<&'src HeadingHint<'src>> {
378 let mut peek_offset = 0;
379 for ch in inner.chars() {
380 if !is_sentinel_char(ch) {
381 continue;
382 }
383 let node = state.peek(peek_offset)?;
384 peek_offset += 1;
385 if let NodeRef::Inline(AozoraNode::HeadingHint(h)) = node {
386 return Some(h);
387 }
388 }
389 None
390}
391
392/// Consume every inline-sentinel registry entry that the paragraph
393/// covers. Used after a heading-hint rewrite to keep the dispatcher
394/// in lockstep without emitting any of the in-paragraph nodes.
395fn consume_inline_sentinels(inner: &str, state: &mut SpliceState<'_, '_>) {
396 let count = inner.chars().filter(|&c| is_sentinel_char(c)).count();
397 state.advance(count);
398}
399
400fn splice_inline_pass(slice: &str, state: &mut SpliceState<'_, '_>, out: &mut String) {
401 let mut cursor = 0;
402 for (idx, ch) in slice.char_indices() {
403 if !is_sentinel_char(ch) {
404 continue;
405 }
406 out.push_str(&slice[cursor..idx]);
407 cursor = idx + ch.len_utf8();
408 let Some(node_ref) = state.next() else {
409 continue;
410 };
411 if ch == INLINE_SENTINEL {
412 if let NodeRef::Inline(node) = node_ref {
413 render_node_into(node, true, out);
414 }
415 // Mismatch (block payload at an inline position) → drop.
416 } else {
417 // Block sentinel inside an inline pass (e.g. inside a
418 // fenced code block, where comrak emits the sentinel as
419 // raw text). Drop the registry entry; emit nothing.
420 }
421 }
422 out.push_str(&slice[cursor..]);
423}
424
425fn render_node_into(node: AozoraNode<'_>, entering: bool, out: &mut String) {
426 render_node::render(node, entering, &mut StringSink(out))
427 .expect("writing AozoraNode HTML to a String cannot fail");
428}
429
/// Append `s` to `out` with the five HTML-significant characters
/// replaced by character references, so the text is safe in both
/// element content and quoted attribute values.
///
/// | char | reference name |
/// |------|----------------|
/// | `&`  | `amp`          |
/// | `<`  | `lt`           |
/// | `>`  | `gt`           |
/// | `"`  | `quot`         |
/// | `'`  | `#39`          |
///
/// Every other character, including non-ASCII text, is copied through
/// unchanged.
fn push_html_escaped(out: &mut String, s: &str) {
    for ch in s.chars() {
        // The reference is assembled as `&` + name + `;` so the table
        // below carries only the names.
        let reference_name = match ch {
            '&' => "amp",
            '<' => "lt",
            '>' => "gt",
            '"' => "quot",
            '\'' => "#39",
            _ => {
                out.push(ch);
                continue;
            }
        };
        out.push('&');
        out.push_str(reference_name);
        out.push(';');
    }
}
442
/// Borrowing `fmt::Write` adapter: everything written to the sink is
/// appended to the wrapped `String`.
struct StringSink<'s>(&'s mut String);

impl fmt::Write for StringSink<'_> {
    /// Append `s` to the underlying buffer; never fails.
    fn write_str(&mut self, s: &str) -> fmt::Result {
        let StringSink(buffer) = self;
        buffer.push_str(s);
        Ok(())
    }
}
451
// `write!` into a `String` needs the `fmt::Write` trait in scope.
453use core::fmt::Write as _;
454
#[cfg(test)]
mod tests {
    use super::*;
    use aozora_pipeline::BLOCK_LEAF_SENTINEL;
    use aozora_syntax::borrowed::Arena;

    /// Full round trip: lex `input`, render the normalized text with
    /// default comrak options, then splice the Aozora HTML back in.
    fn render(input: &str) -> String {
        let arena = Arena::new();
        let lex_out = aozora_pipeline::lex_into_arena(input, &arena);
        let comrak_arena = comrak::Arena::new();
        let opts = comrak::Options::default();
        let root = comrak::parse_document(&comrak_arena, lex_out.normalized, &opts);
        let mut html = String::new();
        comrak::format_html(root, &opts, &mut html).unwrap();
        splice_aozora_html(&html, &lex_out)
    }

    /// Spell an HTML character reference from its parts. Assembling it
    /// at runtime means tooling that decodes entities in source text
    /// cannot silently turn an assertion into a check for the raw
    /// character.
    fn entity(name: &str) -> String {
        format!("&{name};")
    }

    #[test]
    fn plain_text_passes_through() {
        assert!(render("hello").contains("hello"));
    }

    #[test]
    fn ruby_inline_sentinel_is_replaced() {
        let html = render("|青梅《おうめ》");
        assert!(html.contains("<ruby>"), "html: {html}");
        assert!(html.contains("青梅"));
        assert!(html.contains("おうめ"));
        assert!(!html.contains(INLINE_SENTINEL));
    }

    #[test]
    fn page_break_block_leaf_replaces_paragraph() {
        let html = render("前\n\n[#改ページ]\n\n後");
        assert!(!html.contains(BLOCK_LEAF_SENTINEL));
        assert!(!html.contains("<p>\u{E002}</p>"));
    }

    #[test]
    fn heading_hint_promotes_paragraph_to_heading() {
        let html = render("第一篇[#「第一篇」は大見出し]");
        assert!(
            html.contains("<h1>第一篇</h1>"),
            "expected <h1>第一篇</h1>, got {html}"
        );
    }

    #[test]
    fn orphan_close_does_not_emit_div() {
        let html = render("[#ここで字下げ終わり]");
        let opens = html.matches("<div").count();
        let closes = html.matches("</div>").count();
        assert_eq!(opens, closes, "tag-balance broken: {html}");
    }

    #[test]
    fn malformed_unclosed_paragraph_does_not_panic() {
        // Pins `splice_into`'s `</p>`-not-found fallback. Synthesise a
        // payload comrak would never emit (an unclosed `<p>` tag) and
        // confirm the splice walks it without panicking.
        let arena = Arena::new();
        let lex_out = aozora_pipeline::lex_into_arena("hello", &arena);
        let out = splice_aozora_html("<p>unclosed paragraph", &lex_out);
        assert!(out.contains("unclosed paragraph"), "got: {out}");
    }

    #[test]
    fn block_sentinel_paragraph_with_exhausted_registry_does_not_panic() {
        // Pins `process_paragraph`'s `state.next() = None` early-return.
        // We hand the splicer a paragraph that *looks* like a block
        // sentinel but for which the registry is empty. The splicer
        // must drop the paragraph silently.
        let arena = Arena::new();
        let lex_out = aozora_pipeline::lex_into_arena("plain", &arena);
        // `lex_out.registry` for "plain" is empty, but we feed an HTML
        // payload that pretends to contain one. The splicer should
        // produce no Aozora HTML for that paragraph and not panic.
        let payload = format!("<p>{BLOCK_LEAF_SENTINEL}</p>\n");
        let out = splice_aozora_html(&payload, &lex_out);
        assert!(
            !out.contains(BLOCK_LEAF_SENTINEL),
            "sentinel survived: {out}"
        );
    }

    #[test]
    fn block_sentinel_inside_inline_pass_drops_silently() {
        // Pins `splice_inline_pass`'s "block sentinel found here"
        // fallback. This is the exact path that fenced-code-block
        // contents trigger: a block sentinel survives into a non-`<p>`
        // context and must be discarded silently rather than panicking
        // or leaking.
        let html = render("```\n[#改ページ]\n```");
        // The page-break marker must not leak into the code block as
        // its `afm-page-break` div, because it lives inside `<pre>`.
        // Either the sentinel is dropped (current behaviour) or its
        // markup escapes into the `<pre>` body — both are acceptable
        // for code-block content; what matters is that no panic
        // occurs and no raw sentinel char survives.
        assert!(
            !html.contains(BLOCK_LEAF_SENTINEL),
            "sentinel leaked: {html}"
        );
    }

    #[test]
    fn heading_hint_target_html_special_chars_are_escaped() {
        // `push_html_escaped` covers the `<`/`>`/`&`/`"`/`'` arms only
        // when a HeadingHint target carries one of those characters.
        // Exercise each via a forward-reference heading hint whose
        // target is the special char run.
        let html = render("<&\"'><&\"'>[#「<&\"'>」は大見出し]");
        assert!(html.contains(&entity("lt")), "missing < escape: {html}");
        assert!(html.contains(&entity("gt")), "missing > escape: {html}");
        assert!(html.contains(&entity("amp")), "missing & escape: {html}");
        assert!(html.contains(&entity("quot")), "missing \" escape: {html}");
        // Either numeric spelling of the apostrophe reference is fine.
        assert!(
            html.contains(&entity("#39")) || html.contains(&entity("#x27")),
            "missing ' escape: {html}"
        );
    }
}