Skip to main content

afm_markdown/
lib.rs

1//! Aozora Flavored Markdown — CommonMark + GFM + 青空文庫記法.
2//!
3//! Layers `aozora-pipeline` (青空文庫記法 borrowed-AST lexer) onto a
4//! vendored verbatim comrak so a single [`render_to_string`] call
5//! turns afm source into HTML. Public entry points:
6//!
//! - [`render_to_string`] — render afm source straight to HTML.
//! - [`render_to_ir`] — HTML plus a typed [`ir::IrDocument`] projection.
//! - [`render_blocks_to_ir`] — one [`RenderedBlock`] per top-level block.
//! - [`serialize`] — afm-source round-trip (delegates to
//!   [`aozora_render::serialize::serialize`]).
//! - [`Options`] — configuration; [`Options::afm_default`] enables
//!   the GFM extensions afm uses on top of CommonMark.
12//!
13//! ## Pipeline
14//!
15//! ```text
16//! source                                   ── UTF-8 input
17//!   │
18//!   ▼ aozora_pipeline::lex_into_arena      ── normalized text + Registry
19//!   │
20//!   ▼ comrak::parse_document               ── vanilla CommonMark + GFM
21//!   │   (PUA sentinels U+E001..U+E004 flow through as plain text)
22//!   │
//!   ▼ comrak::format_html                  ── HTML with sentinels
24//!   │
25//!   ▼ post_process::splice_aozora_html     ── sentinel → aozora-render
26//!   │   · INLINE_SENTINEL → render_node::render output
27//!   │   · BLOCK_LEAF paragraphs → leaf node HTML
28//!   │   · BLOCK_OPEN/CLOSE paragraphs → container open/close
29//!   │
30//!   ▼
31//! HTML
32//! ```
33//!
34//! Comrak is unmodified: the v0.52.0 verbatim tree carries no
35//! Aozora-aware code (ADR-0001 budget = 0).
36
37#![forbid(unsafe_code)]
38
39mod code_block_mask;
40pub mod html;
41pub mod ir;
42mod post_process;
43mod sentinels;
44mod source_line_anchors;
45
46#[doc(hidden)]
47pub mod test_support;
48
49pub use aozora_pipeline::{
50    BLOCK_CLOSE_SENTINEL, BLOCK_LEAF_SENTINEL, BLOCK_OPEN_SENTINEL, INLINE_SENTINEL,
51};
52pub use aozora_spec::{Diagnostic, DiagnosticSource, Severity};
53
54use aozora_render::serialize as aozora_serialize;
55use aozora_syntax::borrowed::Arena;
56use comrak::nodes::AstNode;
57
58/// Parse-time configuration for [`render_to_string`] and friends.
59#[derive(Debug, Clone, Default)]
60pub struct Options<'c> {
61    pub comrak: comrak::Options<'c>,
62    /// When `true`, run the aozora lex pre-pass and HTML
63    /// post-processing. When `false`, the input flows straight into
64    /// vanilla `comrak::parse_document` + `format_html` — used by the
65    /// CommonMark / GFM spec conformance runners to verify the wrapper
66    /// does not perturb upstream behaviour.
67    pub aozora_enabled: bool,
68    /// When `true`, the HTML renderer adds `data-afm-source-line="N"`
69    /// (1-based) to every top-level block element it emits. The
70    /// afm-obsidian document-mode adapter (Pillar 6 of the plan)
71    /// uses these anchors to map per-block post-processor calls back
72    /// to slices of the rendered fragment without re-parsing.
73    ///
74    /// Defaults to `false`. Cost when enabled: one extra walk over
75    /// comrak's top-level AST children + a streaming insert pass on
76    /// the produced HTML. Both are O(blocks).
77    pub source_line_anchors: bool,
78}
79
80impl Options<'_> {
81    /// Default afm configuration: GFM extensions on (strikethrough,
82    /// table, autolink, tasklist), hardbreaks on so each Aozora source
83    /// newline becomes a `<br>` (verse / dialogue boundaries are
84    /// load-bearing in 青空文庫 source).
85    #[must_use]
86    pub fn afm_default() -> Self {
87        let mut comrak = comrak::Options::default();
88        comrak.extension.strikethrough = true;
89        comrak.extension.table = true;
90        comrak.extension.autolink = true;
91        comrak.extension.tasklist = true;
92        comrak.render.hardbreaks = true;
93        Self {
94            comrak,
95            aozora_enabled: true,
96            source_line_anchors: false,
97        }
98    }
99
100    /// Plain CommonMark (no GFM, no Aozora, raw HTML enabled). Used by
101    /// the CommonMark 0.31.2 spec-conformance test to verify the
102    /// wrapper does not perturb comrak's CommonMark behaviour.
103    #[must_use]
104    pub fn commonmark_only() -> Self {
105        let mut comrak = comrak::Options::default();
106        comrak.render.r#unsafe = true;
107        Self {
108            comrak,
109            aozora_enabled: false,
110            source_line_anchors: false,
111        }
112    }
113
114    /// Pure-GFM extension set (no Aozora, raw HTML enabled). Used by
115    /// the GFM 0.29 spec-conformance test.
116    #[must_use]
117    pub fn gfm_only() -> Self {
118        let mut comrak = comrak::Options::default();
119        comrak.extension.strikethrough = true;
120        comrak.extension.table = true;
121        comrak.extension.autolink = true;
122        comrak.extension.tasklist = true;
123        comrak.extension.tagfilter = true;
124        comrak.render.r#unsafe = true;
125        Self {
126            comrak,
127            aozora_enabled: false,
128            source_line_anchors: false,
129        }
130    }
131
132    /// Builder-style toggle for source-line anchors. Returns a new
133    /// `Options` with `source_line_anchors = on`.
134    ///
135    /// ```
136    /// use afm_markdown::Options;
137    /// let opts = Options::afm_default().with_source_line_anchors(true);
138    /// assert!(opts.source_line_anchors);
139    /// ```
140    #[must_use]
141    pub fn with_source_line_anchors(mut self, on: bool) -> Self {
142        self.source_line_anchors = on;
143        self
144    }
145}
146
147/// Output of [`render_to_string`].
148#[derive(Debug)]
149pub struct Rendered {
150    /// HTML output, with every Aozora sentinel substituted.
151    pub html: String,
152    /// Non-fatal lexer observations (unclosed pairs, PUA collisions,
153    /// stray triggers, …). Empty on the happy path.
154    pub diagnostics: Vec<Diagnostic>,
155}
156
157/// Output of [`render_to_ir`].
158///
159/// The IR projection alongside the HTML and diagnostics. Used by the
160/// `afm-wasm` bridge so the JS-side renderer can pick its own output
161/// target (DOM fragment, `CodeMirror` `RangeSet`, semantic tokens, …)
162/// from a single source.
163#[derive(Debug)]
164pub struct RenderedIr {
165    pub ir: ir::IrDocument,
166    pub html: String,
167    pub diagnostics: Vec<Diagnostic>,
168}
169
170/// Render afm source text to HTML.
171///
172/// One-stop entry point for the typical caller (afm CLI, afm-epub).
173/// Internally:
174///
175/// 1. [`aozora_pipeline::lex_into_arena`] turns the source into a normalized
176///    text (with PUA sentinels at every Aozora construct) plus a
177///    borrowed `Registry`.
178/// 2. `comrak::parse_document` + `comrak::format_html` runs against
179///    the normalized text — sentinels flow through as plain text since
180///    they are not in CommonMark's escape set (`<`/`>`/`&`/`"`).
181/// 3. The internal `post_process` module sweeps the produced HTML,
182///    substituting each sentinel with the matching
183///    `aozora_render::render_node` output.
184///
185/// # Panics
186///
187/// Panics if `comrak::format_html` fails to write into the internal
188/// `String` sink — `String` cannot fail as a `fmt::Write`, so this
189/// branch is unreachable in normal use.
190#[must_use]
191pub fn render_to_string(input: &str, options: &Options<'_>) -> Rendered {
192    if !options.aozora_enabled {
193        let comrak_arena = comrak::Arena::new();
194        let root = comrak::parse_document(&comrak_arena, input, &options.comrak);
195        let anchors = if options.source_line_anchors {
196            source_line_anchors::collect_top_level_lines(root)
197        } else {
198            Vec::new()
199        };
200        let mut html = String::new();
201        comrak::format_html(root, &options.comrak, &mut html)
202            .expect("formatting to a String never fails");
203        let final_html = if options.source_line_anchors {
204            source_line_anchors::inject_anchors(&html, &anchors)
205        } else {
206            html
207        };
208        return Rendered {
209            html: final_html,
210            diagnostics: Vec::new(),
211        };
212    }
213
214    // Pre-process: hide aozora trigger characters that live inside a
215    // CommonMark fenced code block from the lexer. `aozora_pipeline` is
216    // CommonMark-blind by design (ADR-0010), so this lives here. See
217    // `code_block_mask` module docs for the masking scheme.
218    let (masked_source, mask_originals) = code_block_mask::mask_code_block_triggers(input);
219
220    let arena = Arena::new();
221    let lex_out = aozora_pipeline::lex_into_arena(&masked_source, &arena);
222
223    let comrak_arena = comrak::Arena::new();
224    let root = comrak::parse_document(&comrak_arena, lex_out.normalized, &options.comrak);
225    let anchors = if options.source_line_anchors {
226        source_line_anchors::collect_top_level_lines(root)
227    } else {
228        Vec::new()
229    };
230    let mut comrak_html = String::new();
231    comrak::format_html(root, &options.comrak, &mut comrak_html)
232        .expect("formatting to a String never fails");
233
234    let spliced = post_process::splice_aozora_html(&comrak_html, &lex_out);
235    let unmasked = code_block_mask::unmask_html(&spliced, &mask_originals);
236    let html = if options.source_line_anchors {
237        source_line_anchors::inject_anchors(&unmasked, &anchors)
238    } else {
239        unmasked
240    };
241
242    Rendered {
243        html,
244        diagnostics: lex_out.diagnostics,
245    }
246}
247
248/// Render afm source to a structured IR + HTML + diagnostics.
249///
250/// Mirrors [`render_to_string`] but additionally walks comrak's AST
251/// to emit a typed [`ir::IrDocument`]. The IR is the canonical
252/// contract between afm-wasm and afm-obsidian's TS renderers.
253///
254/// The IR covers the full Markdown side (paragraph, heading,
255/// blockquote, list, code, thematic break, table, image) and the
256/// full Aozora side (`Ruby` / `DoubleRuby` / `Bouten` / `Tcy` /
257/// `Gaiji` / `Annotation` / `PageBreak` / `SectionBreak` /
258/// `Container`); heading hints (`[#「X」は大見出し]`) promote
259/// their host paragraph to `IrBlock::Heading` so the IR shape
260/// matches the rendered HTML one-for-one.
261///
262/// # Panics
263///
264/// Panics if `comrak::format_html` fails to write into the internal
265/// `String` sink — `String` cannot fail as a `fmt::Write`, so this
266/// branch is unreachable in normal use.
267#[must_use]
268pub fn render_to_ir(input: &str, options: &Options<'_>) -> RenderedIr {
269    if !options.aozora_enabled {
270        let comrak_arena = comrak::Arena::new();
271        let root = comrak::parse_document(&comrak_arena, input, &options.comrak);
272        let ir_doc = ir::build_ir(root, None);
273        let anchors = if options.source_line_anchors {
274            source_line_anchors::collect_top_level_lines(root)
275        } else {
276            Vec::new()
277        };
278        let mut html = String::new();
279        comrak::format_html(root, &options.comrak, &mut html)
280            .expect("formatting to a String never fails");
281        let final_html = if options.source_line_anchors {
282            source_line_anchors::inject_anchors(&html, &anchors)
283        } else {
284            html
285        };
286        return RenderedIr {
287            ir: ir_doc,
288            html: final_html,
289            diagnostics: Vec::new(),
290        };
291    }
292
293    let (masked_source, mask_originals) = code_block_mask::mask_code_block_triggers(input);
294
295    let arena = Arena::new();
296    let lex_out = aozora_pipeline::lex_into_arena(&masked_source, &arena);
297
298    let comrak_arena = comrak::Arena::new();
299    let root = comrak::parse_document(&comrak_arena, lex_out.normalized, &options.comrak);
300    let ir_doc = ir::build_ir(root, Some(&lex_out));
301    let anchors = if options.source_line_anchors {
302        source_line_anchors::collect_top_level_lines(root)
303    } else {
304        Vec::new()
305    };
306    let mut comrak_html = String::new();
307    comrak::format_html(root, &options.comrak, &mut comrak_html)
308        .expect("formatting to a String never fails");
309
310    let spliced = post_process::splice_aozora_html(&comrak_html, &lex_out);
311    let unmasked = code_block_mask::unmask_html(&spliced, &mask_originals);
312    let html = if options.source_line_anchors {
313        source_line_anchors::inject_anchors(&unmasked, &anchors)
314    } else {
315        unmasked
316    };
317
318    RenderedIr {
319        ir: ir_doc,
320        html,
321        diagnostics: lex_out.diagnostics,
322    }
323}
324
325/// One block of [`render_blocks_to_ir`]'s output.
326///
327/// Each entry corresponds to one top-level comrak child. `html` is the
328/// rendered HTML for that child (with Aozora sentinels spliced).
329/// `ir` is the IR projection — typically a single block, but may be
330/// empty for comrak constructs without a v0.2 IR mapping (definition
331/// lists, footnote refs, raw HTML, etc.) and may carry more than one
332/// block when an Aozora paired-container drains at the call boundary.
333#[derive(Debug, Clone)]
334pub struct RenderedBlock {
335    pub ir: Vec<ir::IrBlock>,
336    pub html: String,
337    /// 1-based line where this block began in the source.
338    pub source_line: u32,
339}
340
341/// Per-block streaming render.
342///
343/// Produces one [`RenderedBlock`] per top-level comrak child, in
344/// document order. Used by afm-obsidian's chunked-cancellation path
345/// (ADR-0009): the JS bridge can iterate the returned vector and
346/// check its `AbortSignal` between blocks.
347///
348/// The current implementation parses the document once (a single
349/// comrak pass) and renders each top-level block's HTML separately
350/// using `comrak::format_html`. Diagnostics from the lexer are
351/// returned alongside the blocks, attached to the document as a
352/// whole rather than per-block (the lexer pass is non-block-scoped).
353///
354/// Limitation: container constructs that span multiple top-level
355/// blocks (e.g., `[#ここから2字下げ]`...`[#ここで字下げ終わり]`)
356/// are emitted as separate blocks; the consumer is responsible for
357/// re-assembling them. The whole-document `render_to_ir` path
358/// preserves cross-block structure if you need it.
359#[must_use]
360pub fn render_blocks_to_ir(
361    input: &str,
362    options: &Options<'_>,
363) -> (Vec<RenderedBlock>, Vec<Diagnostic>) {
364    if !options.aozora_enabled {
365        let comrak_arena = comrak::Arena::new();
366        let root = comrak::parse_document(&comrak_arena, input, &options.comrak);
367        let blocks = collect_rendered_blocks(root, options, /* lex_out */ None);
368        return (blocks, Vec::new());
369    }
370
371    let (masked_source, _mask_originals) = code_block_mask::mask_code_block_triggers(input);
372    let arena = Arena::new();
373    let lex_out = aozora_pipeline::lex_into_arena(&masked_source, &arena);
374    let comrak_arena = comrak::Arena::new();
375    let root = comrak::parse_document(&comrak_arena, lex_out.normalized, &options.comrak);
376    let blocks = collect_rendered_blocks(root, options, Some(&lex_out));
377    (blocks, lex_out.diagnostics)
378}
379
380fn collect_rendered_blocks<'a>(
381    root: &'a AstNode<'a>,
382    options: &Options<'_>,
383    lex_out: Option<&aozora_pipeline::BorrowedLexOutput<'a>>,
384) -> Vec<RenderedBlock> {
385    // Single builder threads its cursor across blocks so the
386    // sentinel stream stays in lockstep with comrak's depth-first
387    // walk over `root.children()`. A per-call walker would restart
388    // the cursor at 0 for every block and misalign Aozora projection
389    // against the registry.
390    let mut ir_builder = ir::StreamingIrBuilder::new(lex_out);
391    let mut blocks = Vec::new();
392    for child in root.children() {
393        let data = child.data.borrow();
394        let line = u32::try_from(data.sourcepos.start.line)
395            .unwrap_or(u32::MAX)
396            .max(1);
397        drop(data);
398        let ir_blocks = ir_builder.walk_block(child);
399        let mut block_html = String::new();
400        comrak::format_html(child, &options.comrak, &mut block_html)
401            .expect("formatting a String never fails");
402        let html_final = if let Some(lo) = lex_out {
403            post_process::splice_aozora_html(&block_html, lo)
404        } else {
405            block_html
406        };
407        blocks.push(RenderedBlock {
408            ir: ir_blocks,
409            html: html_final,
410            source_line: line,
411        });
412    }
413    blocks
414}
415
416/// Round-trip an afm source through the lexer and back to canonical
417/// afm-source text.
418///
419/// Delegates to [`aozora_render::serialize::serialize`] — the
420/// borrowed-AST inverse of `lex_into_arena`. Plain CommonMark portions
421/// of the input pass through verbatim because the lexer leaves them
422/// untouched.
423#[must_use]
424pub fn serialize(input: &str) -> String {
425    let arena = Arena::new();
426    let lex_out = aozora_pipeline::lex_into_arena(input, &arena);
427    aozora_serialize::serialize(&lex_out)
428}
429
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain text reaches the HTML unchanged and raises no diagnostics.
    #[test]
    fn plain_text_round_trips_through_html() {
        let r = render_to_string("hello, world", &Options::afm_default());
        assert!(r.html.contains("hello, world"), "html: {}", r.html);
        assert!(r.diagnostics.is_empty());
    }

    /// `serialize` is the identity on input without Aozora notation.
    #[test]
    fn plain_text_serialize_returns_input_unchanged() {
        assert_eq!(serialize("plain text"), "plain text");
    }

    /// Ruby notation becomes a `<ruby>` element carrying both the base
    /// text and the reading.
    #[test]
    fn ruby_renders_as_html_ruby_element() {
        let r = render_to_string("|青梅《おうめ》へ", &Options::afm_default());
        assert!(r.html.contains("<ruby>"), "html: {}", r.html);
        assert!(r.html.contains("青梅"));
        assert!(r.html.contains("おうめ"));
        // No bare [# leak (Tier-A canary).
        assert!(!r.html.contains("[#"));
    }

    /// A page-break annotation must not leave its `[#` marker in the HTML.
    #[test]
    fn page_break_promotes_and_does_not_leak_brackets() {
        let r = render_to_string("前[#改ページ]後", &Options::afm_default());
        assert!(!r.html.contains("[#"), "html: {}", r.html);
    }

    #[test]
    fn unknown_annotation_keeps_brackets_inside_wrapper() {
        let r = render_to_string("前[#ほげふが]後", &Options::afm_default());
        // The annotation HTML carries the original text inside an
        // `afm-annotation` wrapper, so the bracket character may
        // appear, but never bare in body text.
        assert!(
            !contains_bare_bracket(&r.html),
            "bare bracket leaked in: {}",
            r.html
        );
    }

    /// CommonMark constructs still render under the afm defaults.
    #[test]
    fn commonmark_passes_through_with_heading_intact() {
        let r = render_to_string("# Hello\n\nworld", &Options::afm_default());
        assert!(r.html.contains("<h1>Hello</h1>"), "html: {}", r.html);
        assert!(r.html.contains("world"));
    }

    #[test]
    fn gfm_only_options_have_aozora_disabled_and_gfm_extensions_enabled() {
        let opts = Options::gfm_only();
        assert!(!opts.aozora_enabled, "gfm_only must skip the aozora pass");
        assert!(opts.comrak.extension.strikethrough);
        assert!(opts.comrak.extension.table);
        assert!(opts.comrak.extension.autolink);
        assert!(opts.comrak.extension.tasklist);
        assert!(opts.comrak.extension.tagfilter);
        assert!(opts.comrak.render.r#unsafe);
    }

    #[test]
    fn gfm_only_renders_strikethrough_and_does_not_recognise_ruby() {
        // gfm_only's contract: GFM extensions on, Aozora pre-pass off.
        // The strikethrough must produce `<del>`; the ruby-shaped
        // `|...《》` source must survive verbatim because the lexer
        // never ran.
        let opts = Options::gfm_only();
        let html = render_to_string("~~strike~~ |青梅《おうめ》", &opts).html;
        assert!(html.contains("<del>strike</del>"), "html: {html}");
        assert!(
            html.contains("|青梅"),
            "ruby trigger must survive raw: {html}"
        );
        assert!(
            !html.contains("<ruby>"),
            "ruby must NOT render in gfm-only: {html}"
        );
    }

    #[test]
    fn contains_bare_bracket_helper_detects_leaked_marker() {
        // Pins the "bare bracket leaked" branch of the helper itself.
        // The needle appears outside any tag and outside an
        // `afm-annotation` wrapper.
        assert!(contains_bare_bracket("plain [# leak"));
        assert!(!contains_bare_bracket(
            "<span class=\"afm-annotation\" hidden>[#</span>"
        ));
        assert!(!contains_bare_bracket("no marker at all"));
        // Regression: the marker sits inside a tag that opens at byte
        // 0, with no `>` before it. This used to be misreported as a
        // bare leak.
        assert!(!contains_bare_bracket("<a title=\"[#\">x</a>"));
    }

    /// Tier-A canary: reports whether any `[#` in `html` is "bare",
    /// i.e. neither inside a tag nor preceded by an `afm-annotation`
    /// wrapper.
    ///
    /// NOTE(review): the wrapper check only asks whether the string
    /// `afm-annotation` occurs anywhere before the marker, so a marker
    /// that follows an already *closed* wrapper is still accepted.
    fn contains_bare_bracket(html: &str) -> bool {
        let needle = "[#";
        let wrapper_open = "afm-annotation";
        html.match_indices(needle).any(|(at, _)| {
            let prefix = &html[..at];
            // `Option` orders `None` below every `Some(_)`, so this is
            // true exactly when a `<` precedes the marker and no `>`
            // follows that `<` — including a `<` at byte 0.
            let inside_tag = prefix.rfind('<') > prefix.rfind('>');
            let in_wrapper = prefix.contains(wrapper_open);
            !inside_tag && !in_wrapper
        })
    }
}
545}