Skip to main content

afm_markdown/
sentinels.rs

1//! Shared primitives for both [`crate::post_process`] (HTML splicer)
2//! and [`crate::ir`] (IR builder).
3//!
4//! Both downstream consumers walk the same sentinel-position stream
5//! produced by `aozora-pipeline`. They differ only in their emit
6//! target (string buffer vs. typed tree), not in how they sequence
7//! the registry. This module owns the sequencing primitives so the
8//! two walkers stay in lockstep automatically.
9//!
10//! Design notes:
11//!
12//! - The fast `is_sentinel_char` check is a single subtract-and-compare
13//!   on the codepoint (`ch as u32 - 0xE001 < 4`). Hotter than the
14//!   `matches!` chain it replaces because every paragraph-text walk
15//!   touches this predicate per char.
16//! - `flatten_registry_in_source_order` materialises the registry
17//!   into a `Vec<NodeRef>` keyed by source-order traversal, since
18//!   both walkers consume entries linearly. The `EytzingerMap`
19//!   upstream is binary-search-friendly but we never look up by
20//!   position at HTML rewrite time — the order alone is sufficient.
21//! - `sole_block_sentinel` walks a `&str` of paragraph-inner text
22//!   without allocating; `paragraph_sole_block_sentinel` walks a
23//!   comrak paragraph node directly with the same semantics, also
24//!   allocation-free.
25
26use aozora_pipeline::{
27    BLOCK_CLOSE_SENTINEL, BLOCK_LEAF_SENTINEL, BLOCK_OPEN_SENTINEL, BorrowedLexOutput,
28    INLINE_SENTINEL,
29};
30use aozora_spec::NormalizedOffset;
31use aozora_syntax::borrowed::NodeRef;
32use comrak::nodes::{AstNode, NodeValue};
33
34/// Which paired sentinel a block-sentinel paragraph carries.
35#[derive(Debug, Clone, Copy, PartialEq, Eq)]
36pub(crate) enum BlockSentinelKind {
37    Leaf,
38    Open,
39    Close,
40}
41
42impl BlockSentinelKind {
43    /// Map a char codepoint back to its sentinel kind. `None` for
44    /// inline sentinel and non-sentinel chars.
45    #[inline]
46    pub(crate) const fn from_char(ch: char) -> Option<Self> {
47        match ch {
48            BLOCK_LEAF_SENTINEL => Some(Self::Leaf),
49            BLOCK_OPEN_SENTINEL => Some(Self::Open),
50            BLOCK_CLOSE_SENTINEL => Some(Self::Close),
51            _ => None,
52        }
53    }
54}
55
56/// True iff `ch` is one of the four PUA sentinel codepoints
57/// `U+E001..=U+E004`.
58///
59/// Implemented as a single subtract-and-compare. The optimiser would
60/// likely fold the equivalent `matches!` chain into the same code,
61/// but writing it once explicitly keeps the hot path obvious to
62/// readers and lets us const-eval it where needed.
63#[inline]
64pub(crate) const fn is_sentinel_char(ch: char) -> bool {
65    (ch as u32).wrapping_sub(INLINE_SENTINEL as u32) < 4
66}
67
68/// If `inner` consists of exactly one block-sentinel character
69/// (optionally surrounded by ASCII whitespace), return its kind.
70/// Inline sentinels never qualify.
71pub(crate) fn sole_block_sentinel(inner: &str) -> Option<BlockSentinelKind> {
72    let trimmed = inner.trim_matches(|c: char| matches!(c, ' ' | '\t' | '\n' | '\r'));
73    let mut chars = trimmed.chars();
74    let first = chars.next()?;
75    if chars.next().is_some() {
76        return None;
77    }
78    BlockSentinelKind::from_char(first)
79}
80
81/// Allocation-free analogue of [`sole_block_sentinel`] that walks a
82/// comrak paragraph node directly.
83///
84/// Returns `Some(kind)` iff the paragraph's body, taken across all
85/// `Text`-node descendants, contains exactly one block-sentinel
86/// codepoint and otherwise consists only of ASCII whitespace, AND
87/// the paragraph has no non-`Text` descendants (which would imply
88/// embedded inline structure incompatible with a sole-sentinel
89/// paragraph).
90pub(crate) fn paragraph_sole_block_sentinel<'a>(
91    node: &'a AstNode<'a>,
92) -> Option<BlockSentinelKind> {
93    let mut found: Option<BlockSentinelKind> = None;
94    if walk_text_only_descendants(node, &mut |s| {
95        for ch in s.chars() {
96            if matches!(ch, ' ' | '\t' | '\n' | '\r') {
97                continue;
98            }
99            let Some(kind) = BlockSentinelKind::from_char(ch) else {
100                return ControlFlow::Break;
101            };
102            if found.is_some() {
103                return ControlFlow::Break;
104            }
105            found = Some(kind);
106        }
107        ControlFlow::Continue
108    }) {
109        found
110    } else {
111        None
112    }
113}
114
115/// Result of a Text-node walk over a comrak block.
116enum ControlFlow {
117    Continue,
118    Break,
119}
120
121/// Visit every `Text` descendant of `node` left-to-right. Returns
122/// `true` iff the entire subtree consists of `Text` leaves *only*
123/// (no Strong / Emph / Link / Code / etc. nodes), AND the visitor
124/// closure asked to continue at every step. Used to validate the
125/// "sole block sentinel paragraph" invariant cheaply.
126fn walk_text_only_descendants<'a, F>(node: &'a AstNode<'a>, visit: &mut F) -> bool
127where
128    F: FnMut(&str) -> ControlFlow,
129{
130    for child in node.children() {
131        let data = child.data.borrow();
132        match &data.value {
133            NodeValue::Text(s) => {
134                let s = s.clone();
135                drop(data);
136                match visit(&s) {
137                    ControlFlow::Continue => {}
138                    ControlFlow::Break => return false,
139                }
140                if child.first_child().is_some() && !walk_text_only_descendants(child, visit) {
141                    return false;
142                }
143            }
144            _ => return false,
145        }
146    }
147    true
148}
149
150/// Visit every `Text` descendant of `node` left-to-right, calling
151/// `visit` on each `&str` slice. Unlike
152/// [`walk_text_only_descendants`], this descends into emphasis /
153/// strong / link / code subtrees and ignores their wrappers — we
154/// only care about the leaf text. Used to count sentinels and peek
155/// the registry for paragraph-level dispatch.
156pub(crate) fn for_each_text_descendant<'a, F>(node: &'a AstNode<'a>, mut visit: F)
157where
158    F: FnMut(&str),
159{
160    visit_text_inner(node, &mut visit);
161}
162
163fn visit_text_inner<'a, F>(node: &'a AstNode<'a>, visit: &mut F)
164where
165    F: FnMut(&str),
166{
167    for child in node.children() {
168        let data = child.data.borrow();
169        if let NodeValue::Text(s) = &data.value {
170            let s = s.clone();
171            drop(data);
172            visit(&s);
173        } else {
174            let has_descendants = child.first_child().is_some();
175            drop(data);
176            if has_descendants {
177                visit_text_inner(child, visit);
178            }
179        }
180    }
181}
182
183/// Walk `lex_out.normalized` byte-by-byte; for every PUA sentinel,
184/// query the registry and append the resulting [`NodeRef`] to a
185/// freshly-allocated `Vec` in source order.
186///
187/// Returns an empty vec when the registry is empty (the typical
188/// branch when `Options::aozora_enabled` is `false`).
189pub(crate) fn flatten_registry_in_source_order<'a>(
190    lex_out: &BorrowedLexOutput<'a>,
191) -> Vec<NodeRef<'a>> {
192    if lex_out.registry.is_empty() {
193        return Vec::new();
194    }
195    let mut out = Vec::with_capacity(lex_out.registry.len());
196    for (idx, ch) in lex_out.normalized.char_indices() {
197        if !is_sentinel_char(ch) {
198            continue;
199        }
200        let pos = u32::try_from(idx).expect("normalized text fits u32 (Phase 0 cap)");
201        if let Some(node_ref) = lex_out.registry.node_at(NormalizedOffset(pos)) {
202            out.push(node_ref);
203        }
204    }
205    out
206}
207
208/// Cursor over a flat sentinel-ordered slice of [`NodeRef`].
209///
210/// Both [`crate::post_process`] and [`crate::ir`] share this cursor:
211/// they track their own per-walker state (HTML buffer + container
212/// kind stack vs IR tree builder + open-container stack) but agree on
213/// how to consume the stream.
214pub(crate) struct SentinelCursor<'a, 'src> {
215    nodes: &'a [NodeRef<'src>],
216    idx: usize,
217}
218
219impl<'a, 'src> SentinelCursor<'a, 'src> {
220    pub(crate) fn new(nodes: &'a [NodeRef<'src>]) -> Self {
221        Self { nodes, idx: 0 }
222    }
223
224    /// Peek the registry entry at `offset` past the current cursor.
225    /// `peek(0)` returns the next entry that `next` would produce.
226    pub(crate) fn peek(&self, offset: usize) -> Option<NodeRef<'src>> {
227        self.nodes.get(self.idx + offset).copied()
228    }
229
230    /// Consume and return the next entry, advancing the cursor.
231    pub(crate) fn next(&mut self) -> Option<NodeRef<'src>> {
232        let n = self.nodes.get(self.idx).copied();
233        if n.is_some() {
234            self.idx += 1;
235        }
236        n
237    }
238
239    /// Saturating advance by `n` entries.
240    pub(crate) fn advance(&mut self, n: usize) {
241        self.idx = self.idx.saturating_add(n).min(self.nodes.len());
242    }
243
244    /// Number of entries consumed so far. Used by streaming-mode
245    /// callers (`crate::ir`'s `StreamingIrBuilder`) to thread the
246    /// cursor across per-block walks.
247    pub(crate) fn position(&self) -> usize {
248        self.idx
249    }
250
251    /// Construct with an explicit starting cursor position.
252    /// Saturating: positions past the end clamp to `nodes.len()`.
253    pub(crate) fn with_position(nodes: &'a [NodeRef<'src>], pos: usize) -> Self {
254        Self {
255            nodes,
256            idx: pos.min(nodes.len()),
257        }
258    }
259}
260
261#[cfg(test)]
262mod tests {
263    use super::*;
264
265    #[test]
266    fn is_sentinel_char_recognises_all_four() {
267        for ch in [
268            INLINE_SENTINEL,
269            BLOCK_LEAF_SENTINEL,
270            BLOCK_OPEN_SENTINEL,
271            BLOCK_CLOSE_SENTINEL,
272        ] {
273            assert!(is_sentinel_char(ch), "{ch:?} should be a sentinel");
274        }
275    }
276
277    #[test]
278    fn is_sentinel_char_rejects_neighbours() {
279        // Codepoints adjacent to the sentinel range must NOT match.
280        assert!(!is_sentinel_char('\u{E000}'));
281        assert!(!is_sentinel_char('\u{E005}'));
282        assert!(!is_sentinel_char('a'));
283        assert!(!is_sentinel_char('\0'));
284    }
285
286    #[test]
287    fn block_sentinel_kind_from_char_round_trips() {
288        assert_eq!(
289            BlockSentinelKind::from_char(BLOCK_LEAF_SENTINEL),
290            Some(BlockSentinelKind::Leaf)
291        );
292        assert_eq!(
293            BlockSentinelKind::from_char(BLOCK_OPEN_SENTINEL),
294            Some(BlockSentinelKind::Open)
295        );
296        assert_eq!(
297            BlockSentinelKind::from_char(BLOCK_CLOSE_SENTINEL),
298            Some(BlockSentinelKind::Close)
299        );
300        // Inline does NOT count as a block sentinel.
301        assert!(BlockSentinelKind::from_char(INLINE_SENTINEL).is_none());
302        assert!(BlockSentinelKind::from_char('a').is_none());
303    }
304
305    #[test]
306    fn sole_block_sentinel_accepts_block_with_whitespace_around() {
307        assert_eq!(
308            sole_block_sentinel(&format!("\n{BLOCK_LEAF_SENTINEL}\n")),
309            Some(BlockSentinelKind::Leaf)
310        );
311    }
312
313    #[test]
314    fn sole_block_sentinel_rejects_inline() {
315        assert!(sole_block_sentinel(&format!("{INLINE_SENTINEL}")).is_none());
316    }
317
318    #[test]
319    fn sole_block_sentinel_rejects_multiple() {
320        let s = format!("{BLOCK_LEAF_SENTINEL}{BLOCK_OPEN_SENTINEL}");
321        assert!(sole_block_sentinel(&s).is_none());
322    }
323
324    #[test]
325    fn sentinel_cursor_peeks_and_consumes_in_order() {
326        // Synthesise a small slice of NodeRefs for cursor mechanics.
327        use aozora_syntax::ContainerKind;
328        use aozora_syntax::borrowed::AozoraNode;
329        let entries: &[NodeRef<'static>] = &[
330            NodeRef::Inline(AozoraNode::PageBreak),
331            NodeRef::BlockOpen(ContainerKind::Keigakomi),
332            NodeRef::BlockClose(ContainerKind::Keigakomi),
333        ];
334        let mut cursor = SentinelCursor::new(entries);
335        assert!(matches!(
336            cursor.peek(0),
337            Some(NodeRef::Inline(AozoraNode::PageBreak))
338        ));
339        assert!(matches!(
340            cursor.peek(2),
341            Some(NodeRef::BlockClose(ContainerKind::Keigakomi))
342        ));
343        assert!(cursor.peek(3).is_none());
344        let _ = cursor.next();
345        assert!(matches!(
346            cursor.next(),
347            Some(NodeRef::BlockOpen(ContainerKind::Keigakomi))
348        ));
349        cursor.advance(99); // saturating
350        assert!(cursor.next().is_none());
351    }
352}