Skip to main content

afm_markdown/
ir.rs

1//! Intermediate representation produced by [`crate::render_to_ir`].
2//!
3//! The shape mirrors the TypeScript `IRDocument` consumed by
4//! `afm-obsidian/src/ir/types.ts` (and validated in
5//! `afm-obsidian/src/ir/from-wasm.ts`). Keeping the names and field
6//! ordering aligned across the FFI boundary makes the
7//! `serde-wasm-bindgen` round-trip a pass-through, no shape adapters
8//! needed.
9//!
10//! # Coverage
11//!
//! - **Markdown side**: paragraphs, headings, lists, blockquotes,
//!   fenced code, tables, thematic breaks. Inline runs preserve
//!   `Strong`, `Emphasis`, `Link`, `Image`, `Code`, `LineBreak`, and
//!   verbatim `Text`.
16//! - **Aozora side**: `Ruby` / `DoubleRuby` / `Bouten` / `Tcy` /
17//!   `Gaiji` / `Annotation` (inline) and `Container` / `PageBreak` /
18//!   `SectionBreak` (block). Heading hints
19//!   (`[#「X」は大見出し]`) promote their host paragraph to
20//!   `IrBlock::Heading` directly, mirroring [`crate::post_process`].
21//!
22//! # Architecture
23//!
24//! The walker is built from three small primitives:
25//!
26//! 1. [`crate::sentinels::SentinelCursor`] — the shared registry-stream
27//!    cursor. The HTML splicer ([`crate::post_process`]) and this
28//!    builder both consume the same source-order sequence of
29//!    `NodeRef` entries; the cursor abstraction keeps them in
30//!    lockstep.
//! 2. [`ParaScan`] — single-descent paragraph profile. One walk per
//!    paragraph computes both the sentinel count and the heading-hint
//!    lookahead at once, so the heading path needs no second scan.
//!    The sole-block-sentinel test is answered separately by
//!    `paragraph_sole_block_sentinel` before the scan runs.
36//! 3. [`OpenContainer`] — the per-walker container stack. Where the
37//!    HTML splicer can stream open/close tags into a string buffer,
38//!    the IR demands a tree, so each open container collects
39//!    `IrBlock`s into its own `Vec` until the matching close arrives.
40//!    Move semantics (no `clone`) carry the children into the closed
41//!    `IrBlock::Container`.
42
43use aozora_encoding::gaiji::Resolved;
44use aozora_pipeline::BorrowedLexOutput;
45use aozora_syntax::borrowed::{
46    Annotation as AozoraAnnotation, AozoraNode, Bouten as AozoraBouten, Content,
47    DoubleRuby as AozoraDoubleRuby, Gaiji as AozoraGaiji, HeadingHint, NodeRef, Ruby as AozoraRuby,
48    Segment, TateChuYoko,
49};
50use aozora_syntax::{AnnotationKind, BoutenKind, BoutenPosition, ContainerKind, SectionKind};
51use comrak::nodes::{
52    AstNode, ListType, NodeHeading, NodeList, NodeValue, Sourcepos, TableAlignment,
53};
54use serde::Serialize;
55
56use crate::sentinels::{
57    BlockSentinelKind, SentinelCursor, flatten_registry_in_source_order, for_each_text_descendant,
58    is_sentinel_char, paragraph_sole_block_sentinel,
59};
60
/// Narrow a `usize` to `u32`, clamping at `u32::MAX` instead of
/// wrapping. Only source positions pass through here, and overflowing
/// would take a `~4G`-line file, so the clamp is harmless in practice.
fn to_u32(n: usize) -> u32 {
    match u32::try_from(n) {
        Ok(narrowed) => narrowed,
        Err(_) => u32::MAX,
    }
}
66
/// Root of the IR: the projected block tree plus its diagnostics.
#[derive(Debug, Default, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct IrDocument {
    /// Top-level blocks, in the order the walker visited them.
    pub blocks: Vec<IrBlock>,
    /// Diagnostics attached to the document. [`build_ir`] currently
    /// always leaves this empty.
    pub diagnostics: Vec<IrDiagnostic>,
}
73
/// Block-level IR node. Serialized as an internally tagged object
/// whose `kind` field carries the camelCase variant name.
///
/// `source_line` is filled in by the walker only for top-level blocks
/// (it is `None` for blocks nested inside blockquotes or list items)
/// and is clamped to a minimum of 1.
///
/// NOTE(review): serde's container-level `rename_all` on an enum
/// renames the *variants* only; fields inside struct variants
/// (`source_line`, `indent_level`) keep their snake_case names unless
/// `rename_all_fields` or a per-variant `rename_all` is added. Confirm
/// this matches what the TypeScript consumer expects.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind", rename_all = "camelCase")]
pub enum IrBlock {
    /// Plain paragraph of inline content.
    Paragraph {
        children: Vec<IrInline>,
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Heading. The walker clamps `level` to `1..=6`.
    Heading {
        level: u8,
        children: Vec<IrInline>,
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Blockquote wrapping nested blocks.
    Blockquote {
        children: Vec<Self>,
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Ordered or bullet list. `start` is omitted when the list uses
    /// the default numbering.
    List {
        ordered: bool,
        #[serde(skip_serializing_if = "Option::is_none")]
        start: Option<u32>,
        items: Vec<IrListItem>,
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Code block. `lang` is the info string, omitted when empty;
    /// `value` is the literal body.
    CodeBlock {
        #[serde(skip_serializing_if = "Option::is_none")]
        lang: Option<String>,
        value: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Thematic break (horizontal rule).
    ThematicBreak {
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Table. `header` is the first row, `rows` the remaining ones,
    /// and `align` the per-column alignment.
    Table {
        header: IrTableRow,
        rows: Vec<IrTableRow>,
        align: Vec<IrTableAlign>,
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    // ----- Aozora-specific block variants -----
    /// Paired-container wrapper. `subtype` is one of `"indent"`,
    /// `"alignEnd"`, `"keigakomi"`, `"warichu"`. `indent_level` is set
    /// to `Some(n)` for `"indent"` (字下げ amount) and `"alignEnd"`
    /// (地上げ offset); `None` otherwise.
    Container {
        subtype: String,
        children: Vec<Self>,
        #[serde(skip_serializing_if = "Option::is_none")]
        indent_level: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// `[#改ページ]` page break.
    PageBreak {
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// `[#改丁/改段/改見開き]`. `subtype` is one of `"choho"`,
    /// `"dan"`, `"spread"` (camelCase tags matching upstream
    /// [`SectionKind`]). `[#改ページ]` is its own block — see
    /// [`IrBlock::PageBreak`].
    SectionBreak {
        subtype: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        source_line: Option<u32>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
}
166
/// One table row: a list of cells, each holding its inline content.
#[derive(Debug, Clone, Serialize)]
pub struct IrTableRow {
    /// Cells in column order; each cell is a run of inlines.
    pub cells: Vec<Vec<IrInline>>,
    /// Source range of the row, omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub range: Option<Range>,
}
173
/// One list item and the blocks nested inside it.
#[derive(Debug, Clone, Serialize)]
pub struct IrListItem {
    /// Blocks contained in the item, in document order.
    pub children: Vec<IrBlock>,
    /// Source range of the item, omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub range: Option<Range>,
}
180
/// Per-column table alignment. Serialized as the camelCase variant
/// name (`"left"`, `"center"`, `"right"`, `"default"`).
#[derive(Debug, Clone, Copy, Serialize)]
#[serde(rename_all = "camelCase")]
pub enum IrTableAlign {
    Left,
    Center,
    Right,
    // presumably the "no explicit alignment" case — confirm against
    // the `table_align` mapping.
    Default,
}
189
/// Inline-level IR node. Serialized as an internally tagged object
/// whose `kind` field carries the camelCase variant name.
///
/// NOTE(review): serde's container-level `rename_all` on an enum
/// renames the *variants* only; struct-variant fields such as
/// `fallback_text` keep their snake_case names unless
/// `rename_all_fields` or a per-variant `rename_all` is added. Confirm
/// this matches what the TypeScript consumer expects.
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind", rename_all = "camelCase")]
pub enum IrInline {
    /// Verbatim text run.
    Text {
        value: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Inline code span; `value` is the literal content.
    Code {
        value: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Strong emphasis wrapping nested inlines.
    Strong {
        children: Vec<Self>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Regular emphasis wrapping nested inlines.
    Emphasis {
        children: Vec<Self>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Hyperlink. `title` is omitted when the source title is empty.
    Link {
        href: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        title: Option<String>,
        children: Vec<Self>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// CommonMark image. `alt` carries the alt-text inlines exactly
    /// as comrak parses them (typically a single `Text`). `url` is
    /// the image source; `title` is the optional `"…"` argument.
    Image {
        url: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        title: Option<String>,
        alt: Vec<Self>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Line break. `hard` is `false` for a soft break and `true` for
    /// a hard one.
    LineBreak {
        hard: bool,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    // ----- Aozora-specific variants (mirror TS IRInline) -----
    /// Furigana. `reading` is the flattened reading text;
    /// `explicit` is `true` when the source used the explicit
    /// `|base《reading》` opener.
    Ruby {
        base: Vec<Self>,
        reading: String,
        explicit: bool,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// `《《…》》` double-bracket bouten. Upstream's `DoubleRuby`
    /// carries a single `content` payload — that payload becomes
    /// `base` here. The shape is intentionally minimal: any future
    /// upstream addition (e.g., explicit ring-style metadata) lands
    /// as a new optional field rather than re-using empty strings as
    /// placeholders.
    DoubleRuby {
        base: Vec<Self>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Emphasis dots / sidelines. `style` is one of `"goma"`,
    /// `"whiteSesame"`, `"circle"`, `"whiteCircle"`, `"doubleCircle"`,
    /// `"janome"`, `"cross"`, `"whiteTriangle"`, `"wavyLine"`,
    /// `"underLine"`, `"doubleUnderLine"`. `position` is `"right"` or
    /// `"left"`.
    Bouten {
        children: Vec<Self>,
        style: String,
        position: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Gaiji (character outside the base character set). Every field
    /// is optional and omitted from the output when `None`.
    Gaiji {
        #[serde(skip_serializing_if = "Option::is_none")]
        codepoint: Option<String>,
        #[serde(skip_serializing_if = "Option::is_none")]
        description: Option<String>,
        #[serde(skip_serializing_if = "Option::is_none")]
        fallback_text: Option<String>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Tate-chu-yoko run; `text` is the affected text.
    Tcy {
        text: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
    /// Generic annotation. `payload` is the raw bytes between
    /// `[#` and `]`. `resolved` carries the [`AnnotationKind`]
    /// camelCase tag (`"asIs"`, `"textualNote"`, `"invalidRubySpan"`,
    /// `"warichuOpen"`, `"warichuClose"`) when the upstream lexer
    /// classified the annotation; `None` for `Unknown`.
    Annotation {
        payload: String,
        #[serde(skip_serializing_if = "Option::is_none")]
        resolved: Option<String>,
        #[serde(skip_serializing_if = "Option::is_none")]
        range: Option<Range>,
    },
}
299
/// A diagnostic attached to an [`IrDocument`].
#[derive(Debug, Clone, Serialize)]
pub struct IrDiagnostic {
    /// Severity label. NOTE(review): the set of accepted values is
    /// not visible in this file — confirm against the TS consumer.
    pub level: String,
    /// Human-readable description.
    pub message: String,
    /// Optional diagnostic code, omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub code: Option<String>,
    /// Optional source range, omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub range: Option<Range>,
}
309
/// Source span attached to IR nodes.
///
/// NOTE(review): the unit (byte offset vs. line/column encoding) and
/// whether `to` is inclusive are decided by `sourcepos_to_range`,
/// which is not documented here — confirm before relying on either.
#[derive(Debug, Clone, Copy, Serialize)]
pub struct Range {
    /// Start of the span.
    pub from: u32,
    /// End of the span.
    pub to: u32,
}
315
316// ===================================================================
317// Walker entry points
318// ===================================================================
319
320/// Walk a comrak AST root and project it to [`IrDocument`].
321///
322/// `lex_out` carries the borrowed-AST registry. When `Some`, every
323/// PUA sentinel in the comrak text is projected to its matching
324/// [`IrBlock`] / [`IrInline`] variant; when `None`, the walker
325/// degrades to markdown-only behaviour (used by
326/// `Options::aozora_enabled = false`).
327pub(crate) fn build_ir<'a>(
328    root: &'a AstNode<'a>,
329    lex_out: Option<&BorrowedLexOutput<'a>>,
330) -> IrDocument {
331    let nodes = lex_out
332        .map(flatten_registry_in_source_order)
333        .unwrap_or_default();
334    let mut walker = IrWalker::new(nodes.as_slice());
335    walker.walk_root(root);
336    IrDocument {
337        blocks: walker.finish(),
338        diagnostics: Vec::new(),
339    }
340}
341
342/// Stateful per-block IR builder for streaming mode.
343///
344/// Materialises the registry once at construction time and threads a
345/// shared cursor across successive `walk_block` calls so multi-block
346/// inputs preserve the registry's source order. The cursor lives in
347/// this struct (not in the walker) so individual `walk_block` calls
348/// can be issued lazily — afm-obsidian's chunked-cancellation path
349/// (ADR-0009) uses this to checkpoint between blocks.
350///
351/// Container open/close paragraphs that span multiple top-level
352/// blocks emit fragmented `IrBlock::Container` blocks: the open
353/// pushes onto a stack that drains at the next `walk_block` boundary
354/// (so each block is internally consistent in nesting). Whole-doc
355/// `build_ir` remains the canonical path for cross-block nesting.
#[derive(Debug)]
pub struct StreamingIrBuilder<'src> {
    /// Registry entries, flattened in source order once at
    /// construction time.
    nodes: Vec<NodeRef<'src>>,
    /// Cursor position in `nodes` at which the next `walk_block` call
    /// resumes.
    cursor_idx: usize,
}
361
362impl<'src> StreamingIrBuilder<'src> {
363    /// Materialise the registry once. `None` produces an empty
364    /// builder that degrades to markdown-only projection.
365    #[must_use]
366    pub fn new(lex_out: Option<&BorrowedLexOutput<'src>>) -> Self {
367        Self {
368            nodes: lex_out
369                .map(flatten_registry_in_source_order)
370                .unwrap_or_default(),
371            cursor_idx: 0,
372        }
373    }
374
375    /// Walk a single comrak block, advancing the shared cursor.
376    /// Streaming-mode containers fragment per-block; for whole-doc
377    /// nesting use [`build_ir`].
378    pub fn walk_block<'a>(&mut self, node: &'a AstNode<'a>) -> Vec<IrBlock> {
379        let mut walker = IrWalker::with_cursor_idx(self.nodes.as_slice(), self.cursor_idx);
380        walker.walk_top(node);
381        let next_idx = walker.cursor.position();
382        let blocks = walker.finish();
383        self.cursor_idx = next_idx;
384        blocks
385    }
386}
387
388// ===================================================================
389// Walker
390// ===================================================================
391
392/// Tree builder that consumes comrak nodes plus a sentinel cursor and
393/// emits `IrBlock`s into a stack-balanced container hierarchy.
394///
395/// The state mirrors [`crate::post_process`]'s `SpliceState` for the
396/// HTML side: same cursor, same balanced-container model, same
397/// orphan-close drain at end-of-document. They differ only in the
398/// emit target (string buffer vs. tree of `Vec<IrBlock>`).
399///
400/// Lifetimes:
401///
402/// - `'c` is the lifetime of the registry slice the walker borrows
403///   (typically a local `Vec<NodeRef<'src>>` materialised at the call
404///   site).
405/// - `'src` is the arena/source lifetime that every borrowed
406///   [`AozoraNode`] payload references.
struct IrWalker<'c, 'src> {
    /// Read position in the source-ordered registry; advanced once
    /// for every sentinel the walk consumes.
    cursor: SentinelCursor<'c, 'src>,
    /// Document-level blocks gathered so far. When a container is
    /// open, new blocks go onto its top-of-stack `children` instead.
    top: Vec<IrBlock>,
    /// Stack of currently-open paired containers. Each frame owns the
    /// blocks gathered between its open and (eventual) close marker.
    open: Vec<OpenContainer>,
}
416
/// A paired container whose close marker has not been seen yet.
struct OpenContainer {
    /// Container kind taken from the registry's open entry.
    kind: ContainerKind,
    /// Source line of the paragraph that opened the container.
    source_line: Option<u32>,
    /// Blocks collected since the container was opened.
    children: Vec<IrBlock>,
}
422
423impl<'c, 'src> IrWalker<'c, 'src> {
424    fn new(nodes: &'c [NodeRef<'src>]) -> Self {
425        Self {
426            cursor: SentinelCursor::new(nodes),
427            top: Vec::new(),
428            open: Vec::new(),
429        }
430    }
431
432    /// Construct a walker that resumes from a given cursor index in
433    /// `nodes`. Used by [`StreamingIrBuilder`] to thread cursor state
434    /// across per-block walks.
435    fn with_cursor_idx(nodes: &'c [NodeRef<'src>], idx: usize) -> Self {
436        Self {
437            cursor: SentinelCursor::with_position(nodes, idx),
438            top: Vec::new(),
439            open: Vec::new(),
440        }
441    }
442
443    /// Consume the walker, draining any unclosed containers (mirror of
444    /// the HTML splicer's end-of-document orphan-close pass).
445    fn finish(mut self) -> Vec<IrBlock> {
446        while let Some(open) = self.open.pop() {
447            let block = open.into_block();
448            place_in(&mut self.open, &mut self.top, block);
449        }
450        self.top
451    }
452
453    fn walk_root<'a>(&mut self, root: &'a AstNode<'a>) {
454        for child in root.children() {
455            self.walk_top(child);
456        }
457    }
458
459    fn walk_top<'a>(&mut self, node: &'a AstNode<'a>) {
460        let (source_line, is_paragraph) = top_metadata(node);
461        if is_paragraph && let Some(action) = self.classify_paragraph(node) {
462            self.dispatch_paragraph(action, source_line);
463            return;
464        }
465        if let Some(block) = self.walk_block(node, true) {
466            place_in(&mut self.open, &mut self.top, block);
467        }
468    }
469
470    /// Run a single descent over `node`'s text descendants, returning
471    /// the most specific paragraph action (sole block sentinel or
472    /// heading hint promotion) supported by the registry lookahead.
473    fn classify_paragraph<'a>(&self, node: &'a AstNode<'a>) -> Option<ParagraphAction<'src>> {
474        if let Some(kind) = paragraph_sole_block_sentinel(node) {
475            return Some(ParagraphAction::BlockSentinel(kind));
476        }
477        let scan = ParaScan::run(node, &self.cursor);
478        if let Some(hint) = scan.first_heading_hint {
479            return Some(ParagraphAction::HeadingHint {
480                hint,
481                sentinels_to_consume: scan.total_sentinels,
482            });
483        }
484        None
485    }
486
487    fn dispatch_paragraph(&mut self, action: ParagraphAction<'src>, source_line: u32) {
488        match action {
489            ParagraphAction::BlockSentinel(kind) => self.handle_block_sentinel(kind, source_line),
490            ParagraphAction::HeadingHint {
491                hint,
492                sentinels_to_consume,
493            } => self.handle_heading_hint(hint, sentinels_to_consume, source_line),
494        }
495    }
496
497    fn handle_block_sentinel(&mut self, kind: BlockSentinelKind, source_line: u32) {
498        let Some(node_ref) = self.cursor.next() else {
499            return;
500        };
501        match (kind, node_ref) {
502            (BlockSentinelKind::Leaf, NodeRef::BlockLeaf(leaf)) => {
503                if let Some(block) = project_block_leaf(leaf, source_line) {
504                    place_in(&mut self.open, &mut self.top, block);
505                }
506            }
507            (BlockSentinelKind::Open, NodeRef::BlockOpen(ck)) => {
508                self.open.push(OpenContainer {
509                    kind: ck,
510                    source_line: Some(source_line),
511                    children: Vec::new(),
512                });
513            }
514            (BlockSentinelKind::Close, NodeRef::BlockClose(_)) => {
515                if let Some(open) = self.open.pop() {
516                    let block = open.into_block();
517                    place_in(&mut self.open, &mut self.top, block);
518                }
519                // Orphan close: silently dropped, in lockstep with
520                // `splice_aozora_html`'s defensive guard.
521            }
522            _ => {}
523        }
524    }
525
526    fn handle_heading_hint(
527        &mut self,
528        hint: &'src HeadingHint<'src>,
529        sentinels_to_consume: usize,
530        source_line: u32,
531    ) {
532        self.cursor.advance(sentinels_to_consume);
533        let block = IrBlock::Heading {
534            level: hint.level.clamp(1, 6),
535            children: vec![IrInline::Text {
536                value: hint.target.as_str().to_owned(),
537                range: None,
538            }],
539            source_line: Some(source_line),
540            range: None,
541        };
542        place_in(&mut self.open, &mut self.top, block);
543    }
544
545    fn walk_block<'a>(&mut self, node: &'a AstNode<'a>, top_level: bool) -> Option<IrBlock> {
546        let data = node.data.borrow();
547        let source_line = top_level.then(|| to_u32(data.sourcepos.start.line).max(1));
548        let range = sourcepos_to_range(&data.sourcepos);
549        match &data.value {
550            NodeValue::Paragraph => {
551                drop(data);
552                Some(IrBlock::Paragraph {
553                    children: self.collect_inlines(node),
554                    source_line,
555                    range,
556                })
557            }
558            NodeValue::Heading(NodeHeading { level, .. }) => {
559                let level = (*level).clamp(1, 6);
560                drop(data);
561                Some(IrBlock::Heading {
562                    level,
563                    children: self.collect_inlines(node),
564                    source_line,
565                    range,
566                })
567            }
568            NodeValue::BlockQuote => {
569                drop(data);
570                Some(IrBlock::Blockquote {
571                    children: self.collect_blocks(node),
572                    source_line,
573                    range,
574                })
575            }
576            NodeValue::List(NodeList {
577                list_type, start, ..
578            }) => {
579                let ordered = matches!(list_type, ListType::Ordered);
580                let start = (*start > 1).then(|| to_u32(*start));
581                drop(data);
582                Some(IrBlock::List {
583                    ordered,
584                    start,
585                    items: self.collect_list_items(node),
586                    source_line,
587                    range,
588                })
589            }
590            NodeValue::CodeBlock(code) => {
591                let lang = (!code.info.is_empty()).then(|| code.info.clone());
592                let value = code.literal.clone();
593                drop(data);
594                Some(IrBlock::CodeBlock {
595                    lang,
596                    value,
597                    source_line,
598                    range,
599                })
600            }
601            NodeValue::ThematicBreak => {
602                drop(data);
603                Some(IrBlock::ThematicBreak { source_line, range })
604            }
605            NodeValue::Table(table) => {
606                let aligns: Vec<IrTableAlign> =
607                    table.alignments.iter().copied().map(table_align).collect();
608                drop(data);
609                Some(self.walk_table(
610                    node,
611                    TableMeta {
612                        align: aligns,
613                        source_line,
614                        range,
615                    },
616                ))
617            }
618            // List items, table rows, and table cells are handled by
619            // their parents. Other unhandled block kinds (definition
620            // list, footnote refs, etc.) drop from the IR — the HTML
621            // still has them.
622            _ => None,
623        }
624    }
625
626    fn walk_table<'a>(&mut self, node: &'a AstNode<'a>, meta: TableMeta) -> IrBlock {
627        let mut rows: Vec<IrTableRow> = Vec::new();
628        for child in node.children() {
629            rows.push(self.collect_table_row(child));
630        }
631        let header = rows.first().cloned().unwrap_or(IrTableRow {
632            cells: Vec::new(),
633            range: None,
634        });
635        let body = if rows.is_empty() {
636            Vec::new()
637        } else {
638            rows[1..].to_vec()
639        };
640        IrBlock::Table {
641            header,
642            rows: body,
643            align: meta.align,
644            source_line: meta.source_line,
645            range: meta.range,
646        }
647    }
648
649    fn collect_blocks<'a>(&mut self, node: &'a AstNode<'a>) -> Vec<IrBlock> {
650        let mut out = Vec::new();
651        for child in node.children() {
652            if let Some(block) = self.walk_block(child, false) {
653                out.push(block);
654            }
655        }
656        out
657    }
658
659    fn collect_list_items<'a>(&mut self, node: &'a AstNode<'a>) -> Vec<IrListItem> {
660        let mut out = Vec::new();
661        for child in node.children() {
662            let data = child.data.borrow();
663            let is_item = matches!(data.value, NodeValue::Item(_));
664            let range = sourcepos_to_range(&data.sourcepos);
665            drop(data);
666            if !is_item {
667                continue;
668            }
669            out.push(IrListItem {
670                children: self.collect_blocks(child),
671                range,
672            });
673        }
674        out
675    }
676
677    fn collect_table_row<'a>(&mut self, row: &'a AstNode<'a>) -> IrTableRow {
678        let data = row.data.borrow();
679        let range = sourcepos_to_range(&data.sourcepos);
680        drop(data);
681        let mut cells = Vec::new();
682        for cell in row.children() {
683            cells.push(self.collect_inlines(cell));
684        }
685        IrTableRow { cells, range }
686    }
687
688    fn collect_inlines<'a>(&mut self, node: &'a AstNode<'a>) -> Vec<IrInline> {
689        let mut out = Vec::new();
690        for child in node.children() {
691            self.emit_inline(child, &mut out);
692        }
693        out
694    }
695
696    fn emit_inline<'a>(&mut self, node: &'a AstNode<'a>, out: &mut Vec<IrInline>) {
697        let data = node.data.borrow();
698        let range = sourcepos_to_range(&data.sourcepos);
699        match &data.value {
700            NodeValue::Text(s) => {
701                let s = s.clone();
702                drop(data);
703                self.project_text_with_sentinels(&s, range, out);
704            }
705            NodeValue::Code(c) => {
706                let value = c.literal.clone();
707                drop(data);
708                out.push(IrInline::Code { value, range });
709            }
710            NodeValue::Strong => {
711                drop(data);
712                out.push(IrInline::Strong {
713                    children: self.collect_inlines(node),
714                    range,
715                });
716            }
717            NodeValue::Emph => {
718                drop(data);
719                out.push(IrInline::Emphasis {
720                    children: self.collect_inlines(node),
721                    range,
722                });
723            }
724            NodeValue::Link(link) => {
725                let href = link.url.clone();
726                let title = (!link.title.is_empty()).then(|| link.title.clone());
727                drop(data);
728                out.push(IrInline::Link {
729                    href,
730                    title,
731                    children: self.collect_inlines(node),
732                    range,
733                });
734            }
735            NodeValue::Image(image) => {
736                let url = image.url.clone();
737                let title = (!image.title.is_empty()).then(|| image.title.clone());
738                drop(data);
739                out.push(IrInline::Image {
740                    url,
741                    title,
742                    alt: self.collect_inlines(node),
743                    range,
744                });
745            }
746            NodeValue::SoftBreak => {
747                drop(data);
748                out.push(IrInline::LineBreak { hard: false, range });
749            }
750            NodeValue::LineBreak => {
751                drop(data);
752                out.push(IrInline::LineBreak { hard: true, range });
753            }
754            // Image, footnote refs, raw HTML, etc. drop quietly.
755            _ => {}
756        }
757    }
758
759    fn project_text_with_sentinels(
760        &mut self,
761        text: &str,
762        range: Option<Range>,
763        out: &mut Vec<IrInline>,
764    ) {
765        // Fast path: no sentinels in this text run.
766        if !text.chars().any(is_sentinel_char) {
767            if !text.is_empty() {
768                out.push(IrInline::Text {
769                    value: text.to_owned(),
770                    range,
771                });
772            }
773            return;
774        }
775        let mut cursor = 0;
776        for (idx, ch) in text.char_indices() {
777            if !is_sentinel_char(ch) {
778                continue;
779            }
780            let head = &text[cursor..idx];
781            if !head.is_empty() {
782                out.push(IrInline::Text {
783                    value: head.to_owned(),
784                    range,
785                });
786            }
787            cursor = idx + ch.len_utf8();
788            let Some(node_ref) = self.cursor.next() else {
789                continue;
790            };
791            // Block sentinels surviving into an inline context (e.g.
792            // raw text inside a fenced code block) drop silently —
793            // matches `crate::post_process::splice_inline_pass`.
794            if let NodeRef::Inline(aozora) = node_ref
795                && let Some(inline) = project_inline(aozora)
796            {
797                out.push(inline);
798            }
799        }
800        let tail = &text[cursor..];
801        if !tail.is_empty() {
802            out.push(IrInline::Text {
803                value: tail.to_owned(),
804                range,
805            });
806        }
807    }
808}
809
810/// Push `block` onto the top-of-stack open container's children, or
811/// onto the document's top-level blocks if no container is open.
812fn place_in(open: &mut [OpenContainer], top: &mut Vec<IrBlock>, block: IrBlock) {
813    if let Some(frame) = open.last_mut() {
814        frame.children.push(block);
815    } else {
816        top.push(block);
817    }
818}
819
820impl OpenContainer {
821    fn into_block(self) -> IrBlock {
822        IrBlock::Container {
823            subtype: container_subtype(self.kind).to_owned(),
824            children: self.children,
825            indent_level: container_indent_level(self.kind),
826            source_line: self.source_line,
827            range: None,
828        }
829    }
830}
831
/// Table-level metadata that `walk_block` extracts before handing the
/// node to `walk_table`.
struct TableMeta {
    /// Per-column alignment.
    align: Vec<IrTableAlign>,
    /// Source line; `Some` only for top-level tables.
    source_line: Option<u32>,
    /// Source range of the table node.
    range: Option<Range>,
}
837
/// Special handling selected for a top-level paragraph by
/// `classify_paragraph`.
#[derive(Debug, Clone, Copy)]
enum ParagraphAction<'src> {
    /// The paragraph consists solely of a block sentinel of this kind.
    BlockSentinel(BlockSentinelKind),
    /// The paragraph carries a heading hint and is promoted to a
    /// heading.
    HeadingHint {
        /// The first heading hint found in the paragraph.
        hint: &'src HeadingHint<'src>,
        /// Number of registry entries to skip: the paragraph's total
        /// sentinel count.
        sentinels_to_consume: usize,
    },
}
846
847fn top_metadata<'a>(node: &'a AstNode<'a>) -> (u32, bool) {
848    let data = node.data.borrow();
849    let line = u32::try_from(data.sourcepos.start.line)
850        .unwrap_or(u32::MAX)
851        .max(1);
852    let is_para = matches!(data.value, NodeValue::Paragraph);
853    (line, is_para)
854}
855
856// ===================================================================
857// Single-descent paragraph profile.
858// ===================================================================
859
/// Collected paragraph properties. The walker computes this in one
/// pass over the paragraph's text descendants (see [`ParaScan::run`])
/// and dispatches off the result.
struct ParaScan<'src> {
    /// Total sentinel chars in the paragraph's text descendants.
    /// Equals the number of registry entries the paragraph would
    /// consume during inline projection.
    total_sentinels: usize,
    /// Heading hint attached to the first sentinel that the registry
    /// classifies as an inline heading hint, looked up by peeking the
    /// cursor at that sentinel's position within the paragraph.
    /// `None` if the paragraph carries no inline heading hint.
    first_heading_hint: Option<&'src HeadingHint<'src>>,
}
872
873impl<'src> ParaScan<'src> {
874    fn run<'a>(node: &'a AstNode<'a>, cursor: &SentinelCursor<'_, 'src>) -> Self {
875        let mut total_sentinels = 0usize;
876        let mut first_heading_hint = None;
877        for_each_text_descendant(node, |text| {
878            for ch in text.chars() {
879                if !is_sentinel_char(ch) {
880                    continue;
881                }
882                if first_heading_hint.is_none()
883                    && let Some(NodeRef::Inline(AozoraNode::HeadingHint(h))) =
884                        cursor.peek(total_sentinels)
885                {
886                    first_heading_hint = Some(h);
887                }
888                total_sentinels += 1;
889            }
890        });
891        Self {
892            total_sentinels,
893            first_heading_hint,
894        }
895    }
896}
897
898// ===================================================================
899// AozoraNode → IR projection.
900// ===================================================================
901
902fn project_inline(node: AozoraNode<'_>) -> Option<IrInline> {
903    match node {
904        AozoraNode::Ruby(r) => Some(project_ruby(r)),
905        AozoraNode::DoubleRuby(d) => Some(project_double_ruby(d)),
906        AozoraNode::Bouten(b) => Some(project_bouten(b)),
907        AozoraNode::TateChuYoko(t) => Some(project_tcy(t)),
908        AozoraNode::Gaiji(g) => Some(project_gaiji(g)),
909        AozoraNode::Annotation(a) => Some(project_annotation(a)),
910        // HeadingHint is consumed at the paragraph level, never inline.
911        // Other variants (`Indent` leaf, `AlignEnd` leaf, `Warichu`,
912        // `Sashie`, `Kaeriten`, `AozoraHeading`, `Keigakomi`) exist as
913        // block markers in upstream and don't have a v0.2 inline
914        // projection. They appear in the HTML but drop from the IR.
915        _ => None,
916    }
917}
918
919fn project_block_leaf(node: AozoraNode<'_>, source_line: u32) -> Option<IrBlock> {
920    match node {
921        AozoraNode::PageBreak => Some(IrBlock::PageBreak {
922            source_line: Some(source_line),
923            range: None,
924        }),
925        AozoraNode::SectionBreak(kind) => Some(IrBlock::SectionBreak {
926            subtype: section_kind_subtype(kind).to_owned(),
927            source_line: Some(source_line),
928            range: None,
929        }),
930        // Other block-leaf variants (`Sashie`, `AozoraHeading`, …)
931        // have no v0.2 IR projection. The HTML still carries them.
932        _ => None,
933    }
934}
935
936fn project_ruby(r: &AozoraRuby<'_>) -> IrInline {
937    IrInline::Ruby {
938        base: project_content_inlines(r.base.get()),
939        reading: content_to_string(r.reading.get()),
940        explicit: r.delim_explicit,
941        range: None,
942    }
943}
944
945fn project_double_ruby(d: &AozoraDoubleRuby<'_>) -> IrInline {
946    IrInline::DoubleRuby {
947        base: project_content_inlines(d.content.get()),
948        range: None,
949    }
950}
951
952fn project_bouten(b: &AozoraBouten<'_>) -> IrInline {
953    IrInline::Bouten {
954        children: project_content_inlines(b.target.get()),
955        style: bouten_kind_str(b.kind).to_owned(),
956        position: bouten_position_str(b.position).to_owned(),
957        range: None,
958    }
959}
960
961fn project_tcy(t: &TateChuYoko<'_>) -> IrInline {
962    IrInline::Tcy {
963        text: content_to_string(t.text.get()),
964        range: None,
965    }
966}
967
968fn project_gaiji(g: &AozoraGaiji<'_>) -> IrInline {
969    IrInline::Gaiji {
970        codepoint: g.ucs.map(resolved_to_string),
971        description: (!g.description.is_empty()).then(|| g.description.to_owned()),
972        fallback_text: None,
973        range: None,
974    }
975}
976
977fn project_annotation(a: &AozoraAnnotation<'_>) -> IrInline {
978    IrInline::Annotation {
979        payload: a.raw.as_str().to_owned(),
980        resolved: annotation_kind_resolved(a.kind).map(str::to_owned),
981        range: None,
982    }
983}
984
985fn project_content_inlines(content: Content<'_>) -> Vec<IrInline> {
986    match content {
987        Content::Plain(s) if !s.is_empty() => vec![IrInline::Text {
988            value: s.to_owned(),
989            range: None,
990        }],
991        Content::Segments(segs) => {
992            let mut out = Vec::with_capacity(segs.len());
993            for seg in segs {
994                match *seg {
995                    Segment::Text(t) if !t.is_empty() => out.push(IrInline::Text {
996                        value: t.to_owned(),
997                        range: None,
998                    }),
999                    Segment::Gaiji(g) => out.push(project_gaiji(g)),
1000                    Segment::Annotation(a) => out.push(project_annotation(a)),
1001                    // Empty `Segment::Text` plus any future
1002                    // non-exhaustive variant: drop quietly.
1003                    _ => {}
1004                }
1005            }
1006            out
1007        }
1008        // `Content::Plain("")` plus any future non-exhaustive variant:
1009        // produce no IR.
1010        _ => Vec::new(),
1011    }
1012}
1013
1014fn content_to_string(content: Content<'_>) -> String {
1015    match content {
1016        Content::Plain(s) => s.to_owned(),
1017        Content::Segments(segs) => {
1018            let mut out = String::new();
1019            for seg in segs {
1020                if let Segment::Text(t) = seg {
1021                    out.push_str(t);
1022                }
1023            }
1024            out
1025        }
1026        _ => String::new(),
1027    }
1028}
1029
1030fn resolved_to_string(r: Resolved) -> String {
1031    match r {
1032        Resolved::Char(c) => c.to_string(),
1033        Resolved::Multi(s) => s.to_owned(),
1034    }
1035}
1036
1037// All upstream payload enums are `#[non_exhaustive]`. The trailing
1038// wildcard arm fires only when a future upstream release adds a
1039// variant before afm bumps its dep, so we keep its return value
1040// **distinct** from every named variant: the wildcard returns
1041// `"unknown"` (or `None`), and named variants return their own
1042// semantic mapping. That way a future-variant hit is observable in
1043// the IR rather than silently coinciding with a known variant's
1044// output. Clippy's `match_same_arms` would otherwise flag any
1045// explicit arm that happens to share the wildcard body — but we
1046// don't have to silence the lint because our values are genuinely
1047// distinct everywhere.
1048
1049const fn bouten_kind_str(k: BoutenKind) -> &'static str {
1050    match k {
1051        BoutenKind::Goma => "goma",
1052        BoutenKind::WhiteSesame => "whiteSesame",
1053        BoutenKind::Circle => "circle",
1054        BoutenKind::WhiteCircle => "whiteCircle",
1055        BoutenKind::DoubleCircle => "doubleCircle",
1056        BoutenKind::Janome => "janome",
1057        BoutenKind::Cross => "cross",
1058        BoutenKind::WhiteTriangle => "whiteTriangle",
1059        BoutenKind::WavyLine => "wavyLine",
1060        BoutenKind::UnderLine => "underLine",
1061        BoutenKind::DoubleUnderLine => "doubleUnderLine",
1062        _ => "unknown",
1063    }
1064}
1065
1066const fn bouten_position_str(p: BoutenPosition) -> &'static str {
1067    match p {
1068        BoutenPosition::Right => "right",
1069        BoutenPosition::Left => "left",
1070        _ => "unknown",
1071    }
1072}
1073
1074const fn section_kind_subtype(kind: SectionKind) -> &'static str {
1075    match kind {
1076        SectionKind::Choho => "choho",
1077        SectionKind::Dan => "dan",
1078        SectionKind::Spread => "spread",
1079        _ => "unknown",
1080    }
1081}
1082
1083const fn container_subtype(kind: ContainerKind) -> &'static str {
1084    match kind {
1085        ContainerKind::Indent { .. } => "indent",
1086        ContainerKind::Warichu => "warichu",
1087        ContainerKind::Keigakomi => "keigakomi",
1088        ContainerKind::AlignEnd { .. } => "alignEnd",
1089        _ => "unknown",
1090    }
1091}
1092
1093const fn container_indent_level(kind: ContainerKind) -> Option<u32> {
1094    // Only the size-carrying variants emit a depth. `Warichu` and
1095    // `Keigakomi` (and any future non-exhaustive variant) fall
1096    // through the wildcard with `None`.
1097    match kind {
1098        ContainerKind::Indent { amount } => Some(amount as u32),
1099        ContainerKind::AlignEnd { offset } => Some(offset as u32),
1100        _ => None,
1101    }
1102}
1103
1104const fn annotation_kind_resolved(k: AnnotationKind) -> Option<&'static str> {
1105    // Named annotation kinds project to their camelCase tag.
1106    // `Unknown` deliberately differs from a future-variant hit:
1107    // `Some("unknown")` says the upstream classifier saw the
1108    // annotation but couldn't classify it, whereas `None` says afm
1109    // doesn't know about this variant of `AnnotationKind` yet.
1110    match k {
1111        AnnotationKind::Unknown => Some("unknown"),
1112        AnnotationKind::AsIs => Some("asIs"),
1113        AnnotationKind::TextualNote => Some("textualNote"),
1114        AnnotationKind::InvalidRubySpan => Some("invalidRubySpan"),
1115        AnnotationKind::WarichuOpen => Some("warichuOpen"),
1116        AnnotationKind::WarichuClose => Some("warichuClose"),
1117        _ => None,
1118    }
1119}
1120
1121fn table_align(a: TableAlignment) -> IrTableAlign {
1122    match a {
1123        TableAlignment::Left => IrTableAlign::Left,
1124        TableAlignment::Center => IrTableAlign::Center,
1125        TableAlignment::Right => IrTableAlign::Right,
1126        TableAlignment::None => IrTableAlign::Default,
1127    }
1128}
1129
1130fn sourcepos_to_range(s: &Sourcepos) -> Option<Range> {
1131    // comrak source positions are 1-based line/column. We convert to
1132    // a pseudo-byte range by collapsing line numbers — the HTML
1133    // output doesn't carry true byte offsets, so the range here is
1134    // best-effort.
1135    let from = to_u32(
1136        s.start
1137            .line
1138            .saturating_sub(1)
1139            .saturating_mul(1024)
1140            .saturating_add(s.start.column.saturating_sub(1)),
1141    );
1142    let to = to_u32(
1143        s.end
1144            .line
1145            .saturating_sub(1)
1146            .saturating_mul(1024)
1147            .saturating_add(s.end.column.saturating_sub(1)),
1148    );
1149    (to >= from).then_some(Range { from, to })
1150}
1151
#[cfg(test)]
mod tests {
    //! Direct unit tests for the pure projection helpers.
    //!
    //! Driving every match arm of the `const fn` projectors through
    //! real Aozora input would mean enumerating each grammar pattern
    //! in integration tests — noisy, and fragile whenever the
    //! upstream parser evolves. Instead each projector is called with
    //! hand-built enum values, so every arm is pinned to its expected
    //! output and an upstream rename or variant removal breaks the
    //! build at the call site rather than silently changing the IR.

    use super::*;
    use aozora_syntax::AlignEnd;
    use comrak::nodes::{LineColumn, Sourcepos};

    #[test]
    fn bouten_kind_str_covers_every_upstream_variant() {
        assert_eq!(bouten_kind_str(BoutenKind::Goma), "goma");
        assert_eq!(bouten_kind_str(BoutenKind::WhiteSesame), "whiteSesame");
        assert_eq!(bouten_kind_str(BoutenKind::Circle), "circle");
        assert_eq!(bouten_kind_str(BoutenKind::WhiteCircle), "whiteCircle");
        assert_eq!(bouten_kind_str(BoutenKind::DoubleCircle), "doubleCircle");
        assert_eq!(bouten_kind_str(BoutenKind::Janome), "janome");
        assert_eq!(bouten_kind_str(BoutenKind::Cross), "cross");
        assert_eq!(
            bouten_kind_str(BoutenKind::WhiteTriangle),
            "whiteTriangle"
        );
        assert_eq!(bouten_kind_str(BoutenKind::WavyLine), "wavyLine");
        assert_eq!(bouten_kind_str(BoutenKind::UnderLine), "underLine");
        assert_eq!(
            bouten_kind_str(BoutenKind::DoubleUnderLine),
            "doubleUnderLine"
        );
    }

    #[test]
    fn bouten_position_str_covers_left_and_right() {
        let cases = [
            (BoutenPosition::Right, "right"),
            (BoutenPosition::Left, "left"),
        ];
        for (position, expected) in cases {
            assert_eq!(bouten_position_str(position), expected);
        }
    }

    #[test]
    fn section_kind_subtype_covers_every_upstream_variant() {
        let cases = [
            (SectionKind::Choho, "choho"),
            (SectionKind::Dan, "dan"),
            (SectionKind::Spread, "spread"),
        ];
        for (kind, expected) in cases {
            assert_eq!(section_kind_subtype(kind), expected);
        }
    }

    #[test]
    fn container_subtype_and_indent_level_round_trip_each_variant() {
        let indent = ContainerKind::Indent { amount: 3 };
        let align = ContainerKind::AlignEnd {
            offset: AlignEnd { offset: 1 }.offset,
        };
        // (kind, expected subtype, expected depth)
        let cases = [
            (indent, "indent", Some(3)),
            (align, "alignEnd", Some(1)),
            (ContainerKind::Warichu, "warichu", None),
            (ContainerKind::Keigakomi, "keigakomi", None),
        ];
        for (kind, subtype, depth) in cases {
            assert_eq!(container_subtype(kind), subtype);
            assert_eq!(container_indent_level(kind), depth);
        }
    }

    #[test]
    fn annotation_kind_resolved_covers_every_named_variant() {
        // `Unknown` is upstream's "tried, gave up" outcome. It maps
        // to `Some("unknown")` so consumers can tell it apart from a
        // future-variant hit (`None`).
        let cases = [
            (AnnotationKind::Unknown, "unknown"),
            (AnnotationKind::AsIs, "asIs"),
            (AnnotationKind::TextualNote, "textualNote"),
            (AnnotationKind::InvalidRubySpan, "invalidRubySpan"),
            (AnnotationKind::WarichuOpen, "warichuOpen"),
            (AnnotationKind::WarichuClose, "warichuClose"),
        ];
        for (kind, tag) in cases {
            assert_eq!(annotation_kind_resolved(kind), Some(tag));
        }
    }

    #[test]
    fn resolved_to_string_handles_char_and_multi() {
        let single = resolved_to_string(Resolved::Char('a'));
        assert_eq!(single, "a");
        let multi = resolved_to_string(Resolved::Multi("か゚"));
        assert_eq!(multi, "か゚");
    }

    #[test]
    fn project_content_inlines_covers_plain_segments_and_empty() {
        let empty = project_content_inlines(Content::Plain(""));
        assert!(empty.is_empty());

        let plain = project_content_inlines(Content::Plain("hi"));
        let [IrInline::Text { value, .. }] = plain.as_slice() else {
            panic!("expected exactly one Text inline");
        };
        assert_eq!(value, "hi");

        // Empty Text drops; non-empty survives.
        let segs: &[Segment<'_>] = &[Segment::Text("a"), Segment::Text("")];
        let projected = project_content_inlines(Content::Segments(segs));
        assert_eq!(projected.len(), 1);
    }

    #[test]
    fn content_to_string_concatenates_segment_text_only() {
        let plain = content_to_string(Content::Plain("xyz"));
        assert_eq!(plain, "xyz");

        let segs: &[Segment<'_>] = &[Segment::Text("a"), Segment::Text("b")];
        let joined = content_to_string(Content::Segments(segs));
        assert_eq!(joined, "ab");
    }

    #[test]
    fn table_align_maps_every_alignment() {
        let left = table_align(TableAlignment::Left);
        assert!(matches!(left, IrTableAlign::Left));

        let center = table_align(TableAlignment::Center);
        assert!(matches!(center, IrTableAlign::Center));

        let right = table_align(TableAlignment::Right);
        assert!(matches!(right, IrTableAlign::Right));

        let unset = table_align(TableAlignment::None);
        assert!(matches!(unset, IrTableAlign::Default));
    }

    #[test]
    fn sourcepos_to_range_returns_some_for_well_ordered_positions() {
        let start = LineColumn { line: 1, column: 1 };
        let end = LineColumn { line: 1, column: 5 };
        let range = sourcepos_to_range(&Sourcepos { start, end }).expect("forward range");
        assert!(range.from <= range.to);
    }

    #[test]
    fn sourcepos_to_range_returns_none_for_inverted_positions() {
        // A hand-built (impossible) inverted sourcepos: the start
        // lies after the end. The helper refuses to produce a
        // negative range and returns `None`, which keeps the IR
        // robust under malformed upstream output.
        let start = LineColumn { line: 5, column: 5 };
        let end = LineColumn { line: 1, column: 1 };
        assert!(sourcepos_to_range(&Sourcepos { start, end }).is_none());
    }
}