aozora_syntax/borrowed/registry.rs
1//! Sentinel-position → [`AozoraNode`] lookup table.
2//!
3//! The registry pairs every PUA sentinel position written into the
4//! lexer's normalized text with the [`AozoraNode`] (or
5//! [`crate::extension::ContainerKind`]) that originated it.
6//! Downstream renderers walk the normalized text, encounter a
7//! sentinel, and `node_at(pos)` to recover the structured node.
8//!
9//! # Layout decision
10//!
11//! Stored as **one** [`aozora_veb::EytzingerMap`] keyed by normalized
12//! byte position. Every entry's payload is a [`NodeRef`] enum that
13//! discriminates inline / block-leaf / block-open / block-close
14//! hits — pre-Phase-D the four sentinel kinds lived in four
15//! independent tables and `node_at` did a 4-way linear sweep. The
16//! single-table layout means one binary search per lookup; renderers
17//! pattern-match on the `NodeRef` variant inline.
18//!
19//! Entries are inserted in monotonically increasing position order
20//! during the lex pipeline (the classifier emits spans in source
21//! order, every sentinel position is therefore strictly greater than
22//! the previous), so construction can short-circuit the sort step
23//! that a general-purpose builder would need.
24//!
25//! Position-keyed map from `NormalizedOffset` to AST node, backed by
26//! [`aozora_veb::EytzingerMap`] for cache-friendly lookups during
27//! render-time traversal.
28
29use crate::extension::ContainerKind;
30
31use aozora_spec::{NormalizedOffset, Sentinel};
32use aozora_veb::EytzingerMap;
33
34use super::types::AozoraNode;
35
36/// Unified view over a registry hit, returned by [`Registry::node_at`].
37///
38/// Each variant tags the sentinel kind that fired; renderers
39/// pattern-match the variant once, then handle the inline payload
40/// (a borrowed [`AozoraNode`]) or the container payload (a
41/// [`ContainerKind`] enum) accordingly.
42#[derive(Debug, Clone, Copy, PartialEq)]
43#[non_exhaustive]
44pub enum NodeRef<'src> {
45 /// Hit on an inline-sentinel position
46 /// ([`aozora_spec::Sentinel::Inline`]).
47 Inline(AozoraNode<'src>),
48 /// Hit on a block-leaf-sentinel position
49 /// ([`aozora_spec::Sentinel::BlockLeaf`]).
50 BlockLeaf(AozoraNode<'src>),
51 /// Hit on a block-container-open position
52 /// ([`aozora_spec::Sentinel::BlockOpen`]).
53 BlockOpen(ContainerKind),
54 /// Hit on a block-container-close position
55 /// ([`aozora_spec::Sentinel::BlockClose`]).
56 BlockClose(ContainerKind),
57}
58
59impl NodeRef<'_> {
60 /// Sentinel kind that produced this entry.
61 ///
62 /// Useful for tests / tooling that want to bucket registry
63 /// entries by sentinel kind without depending on the variant
64 /// payload shape.
65 #[must_use]
66 pub const fn sentinel_kind(self) -> Sentinel {
67 match self {
68 Self::Inline(_) => Sentinel::Inline,
69 Self::BlockLeaf(_) => Sentinel::BlockLeaf,
70 Self::BlockOpen(_) => Sentinel::BlockOpen,
71 Self::BlockClose(_) => Sentinel::BlockClose,
72 }
73 }
74
75 /// Cross-cutting [`crate::NodeKind`] tag for this entry.
76 ///
77 /// Inline / block-leaf hits project to the underlying
78 /// [`AozoraNode::kind`] tag; container open / close hits flatten
79 /// into [`NodeKind::ContainerOpen`](crate::NodeKind::ContainerOpen)
80 /// / [`ContainerClose`](crate::NodeKind::ContainerClose) because
81 /// the wire format places container kind detail in the inline
82 /// span rather than on the open/close marker.
83 #[must_use]
84 pub const fn kind(self) -> crate::NodeKind {
85 match self {
86 Self::Inline(node) | Self::BlockLeaf(node) => node.kind(),
87 Self::BlockOpen(_) => crate::NodeKind::ContainerOpen,
88 Self::BlockClose(_) => crate::NodeKind::ContainerClose,
89 }
90 }
91}
92
93/// Whole-document registry — single Eytzinger-keyed table.
94///
95/// `node_at` is one binary search, and every entry's sentinel kind is
96/// encoded by the [`NodeRef`] variant — renderers pattern-match the
97/// hit inline rather than dispatching across per-kind tables.
98#[derive(Debug, Clone)]
99pub struct Registry<'src> {
100 /// Single `SoA` lookup table keyed by normalized byte position.
101 /// Built once at pipeline-build time from the classifier's emit
102 /// stream; entries arrive in strictly increasing position order
103 /// because the classifier tiles spans contiguously.
104 table: EytzingerMap<u32, NodeRef<'src>>,
105}
106
107impl<'src> Registry<'src> {
108 /// Construct a registry from a position-sorted slice of
109 /// `(position, NodeRef)` entries.
110 ///
111 /// # Panics
112 ///
113 /// Inherits [`EytzingerMap::from_sorted_slice`]'s precondition:
114 /// the slice must be sorted by key. The lex pipeline always emits
115 /// in source order, so this is satisfied by construction.
116 #[must_use]
117 pub fn from_sorted_slice(entries: &[(u32, NodeRef<'src>)]) -> Self {
118 Self {
119 table: EytzingerMap::from_sorted_slice(entries),
120 }
121 }
122
123 /// Empty registry. Useful as a starting point for incremental
124 /// construction (the lex driver pushes into a builder vec that
125 /// later collapses into the Eytzinger table).
126 #[must_use]
127 pub const fn empty() -> Self {
128 Self {
129 table: EytzingerMap::new(),
130 }
131 }
132
133 /// True iff the registry holds no entries.
134 #[must_use]
135 pub fn is_empty(&self) -> bool {
136 self.table.is_empty()
137 }
138
139 /// Total number of entries across all sentinel kinds. O(1).
140 #[must_use]
141 pub fn len(&self) -> usize {
142 self.table.len()
143 }
144
145 /// Look up the registry entry at the given *normalized-text* byte
146 /// position. Returns `None` if no sentinel landed at that
147 /// position.
148 ///
149 /// The argument is a [`NormalizedOffset`] newtype rather than a
150 /// raw `u32` — editor surfaces that hold a source-coordinate byte
151 /// offset must first translate via
152 /// `BorrowedLexOutput::node_at_source` (which walks a
153 /// source-keyed side-table built during the lex pipeline) instead
154 /// of casting between the two coordinate spaces.
155 #[must_use]
156 pub fn node_at(&self, pos: NormalizedOffset) -> Option<NodeRef<'src>> {
157 self.table.get(&pos.get()).copied()
158 }
159
160 /// Iterate over `(position, NodeRef)` entries in ascending
161 /// position order. Useful for tests and tooling that want to
162 /// enumerate everything the registry holds.
163 pub fn iter_sorted(&self) -> impl Iterator<Item = (u32, NodeRef<'src>)> + '_ {
164 self.table.iter_sorted().map(|(&p, &nr)| (p, nr))
165 }
166
167 /// Iterate over entries whose [`NodeRef::sentinel_kind`] matches
168 /// `kind`. O(n) but the filter is a constant-time variant
169 /// discriminant compare.
170 pub fn iter_kind(&self, kind: Sentinel) -> impl Iterator<Item = (u32, NodeRef<'src>)> + '_ {
171 self.iter_sorted()
172 .filter(move |(_, nr)| nr.sentinel_kind() == kind)
173 }
174
175 /// Count entries whose [`NodeRef::sentinel_kind`] matches `kind`.
176 ///
177 /// O(n) over the table. Cardinality assertions in unit tests
178 /// drive this; production lookups go through [`Self::node_at`].
179 #[must_use]
180 pub fn count_kind(&self, kind: Sentinel) -> usize {
181 self.iter_kind(kind).count()
182 }
183}
184
185impl Default for Registry<'_> {
186 fn default() -> Self {
187 Self::empty()
188 }
189}
190
191/// Resolved (open, close) container-marker pair, in normalized
192/// coordinates.
193///
194/// The pipeline tracks an open-stack while it walks the classifier
195/// output; `ContainerPair` surfaces that pairing explicitly so editor
196/// surfaces and renderers asking "where is the close marker for this
197/// open?" can index this slice directly instead of re-running the
198/// matching logic over the registry's
199/// [`NodeRef::BlockOpen`] / [`NodeRef::BlockClose`] entries.
200///
201/// Coordinates are [`NormalizedOffset`] — they index the
202/// `BorrowedLexOutput::normalized` text, the same coordinate space
203/// the [`Registry`] uses.
204#[derive(Debug, Clone, Copy, PartialEq, Eq)]
205pub struct ContainerPair {
206 /// The container kind. The builder constructs the pair from the
207 /// open-stack pop, so `kind` reflects the open marker
208 /// authoritatively (rather than the close-side payload).
209 pub kind: ContainerKind,
210 /// Normalized byte offset of the open sentinel (`U+E003`).
211 pub open: NormalizedOffset,
212 /// Normalized byte offset of the close sentinel (`U+E004`).
213 pub close: NormalizedOffset,
214}
215
216impl ContainerPair {
217 /// Construct a pair. Helper for builder tests; in production the
218 /// pipeline emits these directly.
219 #[must_use]
220 pub const fn new(kind: ContainerKind, open: NormalizedOffset, close: NormalizedOffset) -> Self {
221 Self { kind, open, close }
222 }
223}
224
225#[cfg(test)]
226mod tests {
227 use super::*;
228 use crate::Indent;
229
230 #[test]
231 fn empty_registry_reports_empty() {
232 let r: Registry<'static> = Registry::empty();
233 assert!(r.is_empty());
234 assert_eq!(r.len(), 0);
235 }
236
237 #[test]
238 fn default_registry_is_empty() {
239 let r: Registry<'static> = Registry::default();
240 assert!(r.is_empty());
241 }
242
243 #[test]
244 fn node_at_returns_inline_payload_for_inline_sentinel_position() {
245 let r: Registry<'static> = Registry::from_sorted_slice(&[
246 (
247 10u32,
248 NodeRef::Inline(AozoraNode::Indent(Indent { amount: 1 })),
249 ),
250 (20u32, NodeRef::Inline(AozoraNode::PageBreak)),
251 (
252 30u32,
253 NodeRef::Inline(AozoraNode::Indent(Indent { amount: 3 })),
254 ),
255 ]);
256 assert!(!r.is_empty());
257 assert_eq!(r.len(), 3);
258 let got = r.node_at(NormalizedOffset::new(20));
259 assert!(matches!(got, Some(NodeRef::Inline(AozoraNode::PageBreak))));
260 assert!(r.node_at(NormalizedOffset::new(15)).is_none());
261 }
262
263 #[test]
264 fn node_at_dispatches_to_correct_variant() {
265 let r: Registry<'static> = Registry::from_sorted_slice(&[
266 (10u32, NodeRef::Inline(AozoraNode::PageBreak)),
267 (20u32, NodeRef::BlockLeaf(AozoraNode::PageBreak)),
268 (30u32, NodeRef::BlockOpen(ContainerKind::Keigakomi)),
269 (40u32, NodeRef::BlockClose(ContainerKind::Keigakomi)),
270 ]);
271 assert!(matches!(
272 r.node_at(NormalizedOffset::new(10)),
273 Some(NodeRef::Inline(AozoraNode::PageBreak))
274 ));
275 assert!(matches!(
276 r.node_at(NormalizedOffset::new(20)),
277 Some(NodeRef::BlockLeaf(AozoraNode::PageBreak))
278 ));
279 assert!(matches!(
280 r.node_at(NormalizedOffset::new(30)),
281 Some(NodeRef::BlockOpen(ContainerKind::Keigakomi))
282 ));
283 assert!(matches!(
284 r.node_at(NormalizedOffset::new(40)),
285 Some(NodeRef::BlockClose(ContainerKind::Keigakomi))
286 ));
287 assert!(r.node_at(NormalizedOffset::new(99)).is_none());
288 }
289
290 #[test]
291 fn count_kind_buckets_entries_by_sentinel() {
292 let r: Registry<'static> = Registry::from_sorted_slice(&[
293 (
294 5u32,
295 NodeRef::BlockOpen(ContainerKind::Indent { amount: 2 }),
296 ),
297 (10u32, NodeRef::BlockOpen(ContainerKind::Keigakomi)),
298 (15u32, NodeRef::Inline(AozoraNode::PageBreak)),
299 (20u32, NodeRef::BlockClose(ContainerKind::Keigakomi)),
300 ]);
301 assert_eq!(r.count_kind(Sentinel::BlockOpen), 2);
302 assert_eq!(r.count_kind(Sentinel::Inline), 1);
303 assert_eq!(r.count_kind(Sentinel::BlockClose), 1);
304 assert_eq!(r.count_kind(Sentinel::BlockLeaf), 0);
305 }
306
307 #[test]
308 fn node_ref_sentinel_kind_round_trips() {
309 let inline = NodeRef::Inline(AozoraNode::PageBreak);
310 let block_leaf = NodeRef::BlockLeaf(AozoraNode::PageBreak);
311 let block_open = NodeRef::BlockOpen(ContainerKind::Keigakomi);
312 let block_close = NodeRef::BlockClose(ContainerKind::Keigakomi);
313 assert_eq!(inline.sentinel_kind(), Sentinel::Inline);
314 assert_eq!(block_leaf.sentinel_kind(), Sentinel::BlockLeaf);
315 assert_eq!(block_open.sentinel_kind(), Sentinel::BlockOpen);
316 assert_eq!(block_close.sentinel_kind(), Sentinel::BlockClose);
317 }
318}