Skip to main content

aozora_syntax/borrowed/
registry.rs

1//! Sentinel-position → [`AozoraNode`] lookup table.
2//!
3//! The registry pairs every PUA sentinel position written into the
4//! lexer's normalized text with the [`AozoraNode`] (or
5//! [`crate::extension::ContainerKind`]) that originated it.
6//! Downstream renderers walk the normalized text, encounter a
7//! sentinel, and `node_at(pos)` to recover the structured node.
8//!
9//! # Layout decision
10//!
11//! Stored as **one** [`aozora_veb::EytzingerMap`] keyed by normalized
12//! byte position. Every entry's payload is a [`NodeRef`] enum that
13//! discriminates inline / block-leaf / block-open / block-close
14//! hits — pre-Phase-D the four sentinel kinds lived in four
15//! independent tables and `node_at` did a 4-way linear sweep. The
16//! single-table layout means one binary search per lookup; renderers
17//! pattern-match on the `NodeRef` variant inline.
18//!
19//! Entries are inserted in monotonically increasing position order
20//! during the lex pipeline (the classifier emits spans in source
21//! order, every sentinel position is therefore strictly greater than
22//! the previous), so construction can short-circuit the sort step
23//! that a general-purpose builder would need.
24//!
25//! Position-keyed map from `NormalizedOffset` to AST node, backed by
26//! [`aozora_veb::EytzingerMap`] for cache-friendly lookups during
27//! render-time traversal.
28
29use crate::extension::ContainerKind;
30
31use aozora_spec::{NormalizedOffset, Sentinel};
32use aozora_veb::EytzingerMap;
33
34use super::types::AozoraNode;
35
36/// Unified view over a registry hit, returned by [`Registry::node_at`].
37///
38/// Each variant tags the sentinel kind that fired; renderers
39/// pattern-match the variant once, then handle the inline payload
40/// (a borrowed [`AozoraNode`]) or the container payload (a
41/// [`ContainerKind`] enum) accordingly.
42#[derive(Debug, Clone, Copy, PartialEq)]
43#[non_exhaustive]
44pub enum NodeRef<'src> {
45    /// Hit on an inline-sentinel position
46    /// ([`aozora_spec::Sentinel::Inline`]).
47    Inline(AozoraNode<'src>),
48    /// Hit on a block-leaf-sentinel position
49    /// ([`aozora_spec::Sentinel::BlockLeaf`]).
50    BlockLeaf(AozoraNode<'src>),
51    /// Hit on a block-container-open position
52    /// ([`aozora_spec::Sentinel::BlockOpen`]).
53    BlockOpen(ContainerKind),
54    /// Hit on a block-container-close position
55    /// ([`aozora_spec::Sentinel::BlockClose`]).
56    BlockClose(ContainerKind),
57}
58
59impl NodeRef<'_> {
60    /// Sentinel kind that produced this entry.
61    ///
62    /// Useful for tests / tooling that want to bucket registry
63    /// entries by sentinel kind without depending on the variant
64    /// payload shape.
65    #[must_use]
66    pub const fn sentinel_kind(self) -> Sentinel {
67        match self {
68            Self::Inline(_) => Sentinel::Inline,
69            Self::BlockLeaf(_) => Sentinel::BlockLeaf,
70            Self::BlockOpen(_) => Sentinel::BlockOpen,
71            Self::BlockClose(_) => Sentinel::BlockClose,
72        }
73    }
74
75    /// Cross-cutting [`crate::NodeKind`] tag for this entry.
76    ///
77    /// Inline / block-leaf hits project to the underlying
78    /// [`AozoraNode::kind`] tag; container open / close hits flatten
79    /// into [`NodeKind::ContainerOpen`](crate::NodeKind::ContainerOpen)
80    /// / [`ContainerClose`](crate::NodeKind::ContainerClose) because
81    /// the wire format places container kind detail in the inline
82    /// span rather than on the open/close marker.
83    #[must_use]
84    pub const fn kind(self) -> crate::NodeKind {
85        match self {
86            Self::Inline(node) | Self::BlockLeaf(node) => node.kind(),
87            Self::BlockOpen(_) => crate::NodeKind::ContainerOpen,
88            Self::BlockClose(_) => crate::NodeKind::ContainerClose,
89        }
90    }
91}
92
93/// Whole-document registry — single Eytzinger-keyed table.
94///
95/// `node_at` is one binary search, and every entry's sentinel kind is
96/// encoded by the [`NodeRef`] variant — renderers pattern-match the
97/// hit inline rather than dispatching across per-kind tables.
98#[derive(Debug, Clone)]
99pub struct Registry<'src> {
100    /// Single `SoA` lookup table keyed by normalized byte position.
101    /// Built once at pipeline-build time from the classifier's emit
102    /// stream; entries arrive in strictly increasing position order
103    /// because the classifier tiles spans contiguously.
104    table: EytzingerMap<u32, NodeRef<'src>>,
105}
106
107impl<'src> Registry<'src> {
108    /// Construct a registry from a position-sorted slice of
109    /// `(position, NodeRef)` entries.
110    ///
111    /// # Panics
112    ///
113    /// Inherits [`EytzingerMap::from_sorted_slice`]'s precondition:
114    /// the slice must be sorted by key. The lex pipeline always emits
115    /// in source order, so this is satisfied by construction.
116    #[must_use]
117    pub fn from_sorted_slice(entries: &[(u32, NodeRef<'src>)]) -> Self {
118        Self {
119            table: EytzingerMap::from_sorted_slice(entries),
120        }
121    }
122
123    /// Empty registry. Useful as a starting point for incremental
124    /// construction (the lex driver pushes into a builder vec that
125    /// later collapses into the Eytzinger table).
126    #[must_use]
127    pub const fn empty() -> Self {
128        Self {
129            table: EytzingerMap::new(),
130        }
131    }
132
133    /// True iff the registry holds no entries.
134    #[must_use]
135    pub fn is_empty(&self) -> bool {
136        self.table.is_empty()
137    }
138
139    /// Total number of entries across all sentinel kinds. O(1).
140    #[must_use]
141    pub fn len(&self) -> usize {
142        self.table.len()
143    }
144
145    /// Look up the registry entry at the given *normalized-text* byte
146    /// position. Returns `None` if no sentinel landed at that
147    /// position.
148    ///
149    /// The argument is a [`NormalizedOffset`] newtype rather than a
150    /// raw `u32` — editor surfaces that hold a source-coordinate byte
151    /// offset must first translate via
152    /// `BorrowedLexOutput::node_at_source` (which walks a
153    /// source-keyed side-table built during the lex pipeline) instead
154    /// of casting between the two coordinate spaces.
155    #[must_use]
156    pub fn node_at(&self, pos: NormalizedOffset) -> Option<NodeRef<'src>> {
157        self.table.get(&pos.get()).copied()
158    }
159
160    /// Iterate over `(position, NodeRef)` entries in ascending
161    /// position order. Useful for tests and tooling that want to
162    /// enumerate everything the registry holds.
163    pub fn iter_sorted(&self) -> impl Iterator<Item = (u32, NodeRef<'src>)> + '_ {
164        self.table.iter_sorted().map(|(&p, &nr)| (p, nr))
165    }
166
167    /// Iterate over entries whose [`NodeRef::sentinel_kind`] matches
168    /// `kind`. O(n) but the filter is a constant-time variant
169    /// discriminant compare.
170    pub fn iter_kind(&self, kind: Sentinel) -> impl Iterator<Item = (u32, NodeRef<'src>)> + '_ {
171        self.iter_sorted()
172            .filter(move |(_, nr)| nr.sentinel_kind() == kind)
173    }
174
175    /// Count entries whose [`NodeRef::sentinel_kind`] matches `kind`.
176    ///
177    /// O(n) over the table. Cardinality assertions in unit tests
178    /// drive this; production lookups go through [`Self::node_at`].
179    #[must_use]
180    pub fn count_kind(&self, kind: Sentinel) -> usize {
181        self.iter_kind(kind).count()
182    }
183}
184
185impl Default for Registry<'_> {
186    fn default() -> Self {
187        Self::empty()
188    }
189}
190
191/// Resolved (open, close) container-marker pair, in normalized
192/// coordinates.
193///
194/// The pipeline tracks an open-stack while it walks the classifier
195/// output; `ContainerPair` surfaces that pairing explicitly so editor
196/// surfaces and renderers asking "where is the close marker for this
197/// open?" can index this slice directly instead of re-running the
198/// matching logic over the registry's
199/// [`NodeRef::BlockOpen`] / [`NodeRef::BlockClose`] entries.
200///
201/// Coordinates are [`NormalizedOffset`] — they index the
202/// `BorrowedLexOutput::normalized` text, the same coordinate space
203/// the [`Registry`] uses.
204#[derive(Debug, Clone, Copy, PartialEq, Eq)]
205pub struct ContainerPair {
206    /// The container kind. The builder constructs the pair from the
207    /// open-stack pop, so `kind` reflects the open marker
208    /// authoritatively (rather than the close-side payload).
209    pub kind: ContainerKind,
210    /// Normalized byte offset of the open sentinel (`U+E003`).
211    pub open: NormalizedOffset,
212    /// Normalized byte offset of the close sentinel (`U+E004`).
213    pub close: NormalizedOffset,
214}
215
216impl ContainerPair {
217    /// Construct a pair. Helper for builder tests; in production the
218    /// pipeline emits these directly.
219    #[must_use]
220    pub const fn new(kind: ContainerKind, open: NormalizedOffset, close: NormalizedOffset) -> Self {
221        Self { kind, open, close }
222    }
223}
224
225#[cfg(test)]
226mod tests {
227    use super::*;
228    use crate::Indent;
229
230    #[test]
231    fn empty_registry_reports_empty() {
232        let r: Registry<'static> = Registry::empty();
233        assert!(r.is_empty());
234        assert_eq!(r.len(), 0);
235    }
236
237    #[test]
238    fn default_registry_is_empty() {
239        let r: Registry<'static> = Registry::default();
240        assert!(r.is_empty());
241    }
242
243    #[test]
244    fn node_at_returns_inline_payload_for_inline_sentinel_position() {
245        let r: Registry<'static> = Registry::from_sorted_slice(&[
246            (
247                10u32,
248                NodeRef::Inline(AozoraNode::Indent(Indent { amount: 1 })),
249            ),
250            (20u32, NodeRef::Inline(AozoraNode::PageBreak)),
251            (
252                30u32,
253                NodeRef::Inline(AozoraNode::Indent(Indent { amount: 3 })),
254            ),
255        ]);
256        assert!(!r.is_empty());
257        assert_eq!(r.len(), 3);
258        let got = r.node_at(NormalizedOffset::new(20));
259        assert!(matches!(got, Some(NodeRef::Inline(AozoraNode::PageBreak))));
260        assert!(r.node_at(NormalizedOffset::new(15)).is_none());
261    }
262
263    #[test]
264    fn node_at_dispatches_to_correct_variant() {
265        let r: Registry<'static> = Registry::from_sorted_slice(&[
266            (10u32, NodeRef::Inline(AozoraNode::PageBreak)),
267            (20u32, NodeRef::BlockLeaf(AozoraNode::PageBreak)),
268            (30u32, NodeRef::BlockOpen(ContainerKind::Keigakomi)),
269            (40u32, NodeRef::BlockClose(ContainerKind::Keigakomi)),
270        ]);
271        assert!(matches!(
272            r.node_at(NormalizedOffset::new(10)),
273            Some(NodeRef::Inline(AozoraNode::PageBreak))
274        ));
275        assert!(matches!(
276            r.node_at(NormalizedOffset::new(20)),
277            Some(NodeRef::BlockLeaf(AozoraNode::PageBreak))
278        ));
279        assert!(matches!(
280            r.node_at(NormalizedOffset::new(30)),
281            Some(NodeRef::BlockOpen(ContainerKind::Keigakomi))
282        ));
283        assert!(matches!(
284            r.node_at(NormalizedOffset::new(40)),
285            Some(NodeRef::BlockClose(ContainerKind::Keigakomi))
286        ));
287        assert!(r.node_at(NormalizedOffset::new(99)).is_none());
288    }
289
290    #[test]
291    fn count_kind_buckets_entries_by_sentinel() {
292        let r: Registry<'static> = Registry::from_sorted_slice(&[
293            (
294                5u32,
295                NodeRef::BlockOpen(ContainerKind::Indent { amount: 2 }),
296            ),
297            (10u32, NodeRef::BlockOpen(ContainerKind::Keigakomi)),
298            (15u32, NodeRef::Inline(AozoraNode::PageBreak)),
299            (20u32, NodeRef::BlockClose(ContainerKind::Keigakomi)),
300        ]);
301        assert_eq!(r.count_kind(Sentinel::BlockOpen), 2);
302        assert_eq!(r.count_kind(Sentinel::Inline), 1);
303        assert_eq!(r.count_kind(Sentinel::BlockClose), 1);
304        assert_eq!(r.count_kind(Sentinel::BlockLeaf), 0);
305    }
306
307    #[test]
308    fn node_ref_sentinel_kind_round_trips() {
309        let inline = NodeRef::Inline(AozoraNode::PageBreak);
310        let block_leaf = NodeRef::BlockLeaf(AozoraNode::PageBreak);
311        let block_open = NodeRef::BlockOpen(ContainerKind::Keigakomi);
312        let block_close = NodeRef::BlockClose(ContainerKind::Keigakomi);
313        assert_eq!(inline.sentinel_kind(), Sentinel::Inline);
314        assert_eq!(block_leaf.sentinel_kind(), Sentinel::BlockLeaf);
315        assert_eq!(block_open.sentinel_kind(), Sentinel::BlockOpen);
316        assert_eq!(block_close.sentinel_kind(), Sentinel::BlockClose);
317    }
318}