// aozora_syntax/borrowed/intern.rs
1//! Arena-backed string interner.
2//!
3//! Open-addressing hash table over `&'a str` slots whose payloads
4//! point into a paired [`Arena`]. The first call to [`Interner::intern`]
5//! with a given byte content allocates the string in the arena and
6//! records the resulting `&'a str` in the table; every subsequent
7//! call with byte-equal content reuses that pointer in O(1) amortised
8//! time, with no further allocation.
9//!
10//! ## Why an interner here
11//!
12//! Aozora corpora are dense in **short, frequently repeated strings**
13//! — single-mora ruby readings (`の`, `に`, `を`, `で`, `が`), bouten
14//! kind keywords, container kind labels, kaeriten marks. A naive
15//! `arena.alloc_str(s)` for every visit copies the same bytes into
16//! the arena dozens to hundreds of times per document. Interning
17//! collapses those duplicates into a single allocation; the table's
18//! per-string overhead is one `&str` slot (16 bytes) plus a fraction
19//! of probe load, which is more than recovered by the eliminated
20//! duplicate-byte allocations on any document with non-trivial
21//! repetition.
22//!
23//! ## Algorithm
24//!
25//! - Open addressing with linear probing.
26//! - Power-of-two capacity (initial 256) so slot index is a single
27//! bitmask, no `%` modulo.
28//! - Resize at 7/8 load factor; growth doubles capacity and rebuilds
29//! the table via fresh probing.
30//! - **FxHash-style multiply-then-xor** mix on the byte stream:
31//! one `wrapping_mul` + one xor-shift per byte. Fast on short
32//! strings (< 32 bytes — the dominant length class for Aozora ruby
33//! readings); no per-call state allocation that std `SipHash` would
34//! incur.
35//! - **Inline cache** for the most recent intern result. Long runs
36//! of identical interns (e.g. 100 consecutive `《の》` ruby readings
37//! in a poem) skip both the hash and the probe.
38//! - **Length threshold**: strings longer than 64 bytes bypass the
39//! table entirely and go straight to `arena.alloc_str`. The hash +
40//! probe cost is dominated by the byte scan for long strings, and
41//! long strings are very rarely repeated in practice (raw
42//! annotations, gaiji descriptions). Skipping them avoids
43//! polluting the table with unique entries that will never be
44//! queried again.
45//!
46//! ## Memory model
47//!
48//! The interner does not own the underlying arena — it borrows it
49//! immutably. This is sound because `Arena::alloc_str` takes
50//! `&self` (bumpalo allows interior mutation through a shared
51//! reference). A single arena can therefore back multiple interners
52//! (e.g. one per worker in a parallel parse path), each with its own
53//! probe table.
54
55use bumpalo::collections::Vec as BumpVec;
56
57use super::arena::Arena;
58
/// FxHash-style mix constant. The same constant rustc internally uses
/// for `FxHasher`; chosen for fast diffusion on short inputs.
const FX_PRIME: u64 = 0x517c_c1b7_2722_0a95;

/// Length threshold (in bytes) beyond which strings bypass the intern table.
/// Picked from corpus profile: ruby readings, kaeriten marks, and
/// container kind labels are all under 32 bytes; raw annotations and
/// gaiji descriptions can exceed 64 bytes and almost never repeat.
const INTERN_LENGTH_LIMIT: usize = 64;

/// Initial table capacity. Power of two so probe-index = `hash & mask`.
/// 256 covers the typical small-to-medium document without growth;
/// large (multi-MB) documents grow once or twice on the way to ~2048.
const INITIAL_CAPACITY: usize = 256;
73
/// Open-addressing intern table over arena-allocated strings.
#[derive(Debug)]
pub struct Interner<'a> {
    /// Backing arena; all interned string bytes (and the probe table
    /// itself) are allocated here.
    arena: &'a Arena,
    /// Slots: `None` = empty, `Some(&'a str)` = occupied.
    /// `BumpVec` keeps the table itself inside the arena, so the
    /// interner allocates exactly zero bytes outside the arena.
    table: BumpVec<'a, Option<&'a str>>,
    /// `capacity - 1`. `capacity` is always a power of two, so
    /// `hash & mask` is always a valid slot index.
    mask: usize,
    /// Number of occupied slots (unique short strings interned).
    occupied: usize,
    /// Inline cache: last successfully-interned string. Long runs of
    /// identical interns short-circuit on this single pointer compare.
    ///
    /// A 2-slot LRU cache (intended to catch Ruby's alternating
    /// `(base, reading, base, reading, …)` access pattern) was tried
    /// and reverted. Corpus dedup ratio stayed at p50 0.275 /
    /// mean 0.308 (identical to the 1-slot baseline), throughput
    /// moved within noise. The pattern that would benefit —
    /// consecutive rubies sharing a base or reading
    /// — is rarer than the design assumed; distinct rubies on
    /// distinct words dominate.
    last: Option<&'a str>,
    /// Diagnostic counters. Updated on every intern call. Useful for
    /// benchmarks and the corpus-sweep dedup-ratio report.
    pub stats: InternStats,
}
102
/// Diagnostic counters surfaced by [`Interner::stats`].
#[derive(Debug, Clone, Copy, Default)]
pub struct InternStats {
    /// Total `intern` calls (every entry into the API).
    pub calls: u64,
    /// Calls served from the inline cache.
    pub cache_hits: u64,
    /// Calls that landed on an existing table entry (no allocation).
    pub table_hits: u64,
    /// Calls that allocated a new entry into the arena.
    pub allocs: u64,
    /// Calls that bypassed the table because the string exceeded
    /// `INTERN_LENGTH_LIMIT` — counted as an alloc as well.
    pub long_bypass: u64,
    /// Total resize events the table performed.
    pub resizes: u64,
    /// Total probe steps walked across all `intern` calls. Divided by
    /// the number of probed lookups (`calls - cache_hits - long_bypass`
    /// — cache hits and long bypasses never enter the probe loop) this
    /// gives the average probe length, the canonical hash-table health
    /// metric.
    pub probe_steps: u64,
}
124
125impl<'a> Interner<'a> {
126 /// Empty interner backed by `arena`. Initial capacity is
127 /// `INITIAL_CAPACITY`.
128 #[must_use]
129 pub fn new_in(arena: &'a Arena) -> Self {
130 Self::with_capacity_in(INITIAL_CAPACITY, arena)
131 }
132
133 /// Empty interner backed by `arena`, with the table pre-sized to
134 /// at least `capacity_hint` slots (rounded up to a power of two).
135 /// Use when the caller can estimate the unique-string count up
136 /// front (e.g. the lex driver knows the registry size).
137 #[must_use]
138 pub fn with_capacity_in(capacity_hint: usize, arena: &'a Arena) -> Self {
139 let cap = capacity_hint.next_power_of_two().max(8);
140 let mut table = BumpVec::with_capacity_in(cap, arena.bump());
141 table.resize(cap, None);
142 Self {
143 arena,
144 table,
145 mask: cap - 1,
146 occupied: 0,
147 last: None,
148 stats: InternStats::default(),
149 }
150 }
151
152 /// Intern `s` and return a stable `&'a str` pointer to the arena
153 /// copy. Subsequent calls with byte-equal content return the same
154 /// pointer.
155 pub fn intern(&mut self, s: &str) -> &'a str {
156 self.stats.calls += 1;
157
158 // Inline cache: long identical-intern runs short-circuit on
159 // a single pointer-content compare. Equality on `&str`
160 // compares lengths first then bytes, which is fast for the
161 // typical mismatch.
162 if let Some(cached) = self.last
163 && cached == s
164 {
165 self.stats.cache_hits += 1;
166 return cached;
167 }
168
169 let bytes = s.as_bytes();
170
171 // Length threshold bypass — long strings rarely repeat and
172 // hashing them costs more than the alloc they would save.
173 if bytes.len() > INTERN_LENGTH_LIMIT {
174 self.stats.long_bypass += 1;
175 self.stats.allocs += 1;
176 let dst = self.arena.alloc_str(s);
177 self.last = Some(dst);
178 return dst;
179 }
180
181 let hash = fx_hash(bytes);
182
183 // Grow before probe if load factor would exceed 7/8.
184 // Power-of-two table size makes load-factor check a single
185 // multiply + compare; no division.
186 if self.occupied.saturating_mul(8) >= self.table.len().saturating_mul(7) {
187 self.grow();
188 }
189
190 #[allow(
191 clippy::cast_possible_truncation,
192 reason = "low bits of u64 hash extracted as usize on purpose"
193 )]
194 let mut idx = (hash as usize) & self.mask;
195 loop {
196 self.stats.probe_steps += 1;
197 match self.table[idx] {
198 Some(existing) if existing == s => {
199 self.stats.table_hits += 1;
200 self.last = Some(existing);
201 return existing;
202 }
203 None => {
204 let dst = self.arena.alloc_str(s);
205 self.table[idx] = Some(dst);
206 self.occupied += 1;
207 self.stats.allocs += 1;
208 self.last = Some(dst);
209 return dst;
210 }
211 Some(_) => idx = (idx + 1) & self.mask,
212 }
213 }
214 }
215
216 /// Number of unique strings currently held in the table.
217 #[must_use]
218 pub fn unique_strings(&self) -> usize {
219 self.occupied
220 }
221
222 /// Current table capacity.
223 #[must_use]
224 pub fn capacity(&self) -> usize {
225 self.table.len()
226 }
227
228 /// Average probe length per non-cache-hit lookup. Returns `0.0`
229 /// when no probed lookups have occurred. Used by benchmarks to
230 /// confirm the hash function and load-factor policy keep the
231 /// table healthy.
232 #[must_use]
233 pub fn avg_probe_length(&self) -> f64 {
234 let probed = self.stats.calls.saturating_sub(self.stats.cache_hits);
235 if probed == 0 {
236 0.0
237 } else {
238 #[allow(
239 clippy::cast_precision_loss,
240 reason = "probe count fits in f64 mantissa for any plausible workload"
241 )]
242 let avg = self.stats.probe_steps as f64 / probed as f64;
243 avg
244 }
245 }
246
247 /// Doubles capacity and rebuilds the table via fresh probing.
248 fn grow(&mut self) {
249 let new_cap = self.table.len().saturating_mul(2);
250 let new_mask = new_cap - 1;
251 let mut new_table: BumpVec<'a, Option<&'a str>> =
252 BumpVec::with_capacity_in(new_cap, self.arena.bump());
253 new_table.resize(new_cap, None);
254 for s in self.table.iter().copied().flatten() {
255 let h = fx_hash(s.as_bytes());
256 #[allow(
257 clippy::cast_possible_truncation,
258 reason = "low bits of u64 hash extracted as usize on purpose"
259 )]
260 let mut idx = (h as usize) & new_mask;
261 while new_table[idx].is_some() {
262 idx = (idx + 1) & new_mask;
263 }
264 new_table[idx] = Some(s);
265 }
266 self.table = new_table;
267 self.mask = new_mask;
268 self.stats.resizes += 1;
269 }
270}
271
272/// `wrapping_mul`-and-xor mix loop. Fast on short inputs (the dominant
273/// case for Aozora ruby readings); avoids the per-call state setup
274/// cost of std `SipHash`.
275///
276/// An 8-byte-chunk fast path with an xxHash-style avalanche was
277/// tried and reverted. For the typical
278/// 3-byte single-codepoint reading the avalanche's two extra
279/// multiplications cost more than the per-byte loop saves; corpus
280/// throughput moved within noise (-4 % to +2 % depending on band).
281/// The byte loop fits in a few cycles on short inputs and is hard to
282/// beat without a different hash family. Keeping the simple shape.
283#[inline]
284fn fx_hash(bytes: &[u8]) -> u64 {
285 let mut h: u64 = 0;
286 for &b in bytes {
287 h = h.rotate_left(5) ^ u64::from(b);
288 h = h.wrapping_mul(FX_PRIME);
289 }
290 h
291}
292
293// Bump access for structures that need their own arena-backed storage
294// (e.g. the [`Interner`]'s probe table) lives as an inherent method
295// on Arena in `arena.rs`.
296
#[cfg(test)]
mod tests {
    use core::ptr;

    use super::*;

    /// Fresh arena per test so every test gets independent storage.
    fn arena() -> Arena {
        Arena::new()
    }

    #[test]
    fn empty_interner_has_no_unique_strings() {
        let a = arena();
        let i = Interner::new_in(&a);
        assert_eq!(i.unique_strings(), 0);
        assert_eq!(i.capacity(), INITIAL_CAPACITY);
    }

    #[test]
    fn intern_returns_stable_pointer_for_byte_equal_input() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        let s1 = i.intern("hello");
        let s2 = i.intern("hello");
        assert!(
            ptr::eq(s1.as_ptr(), s2.as_ptr()),
            "byte-equal intern must return same arena pointer"
        );
        assert_eq!(i.unique_strings(), 1);
    }

    #[test]
    fn intern_returns_distinct_pointers_for_distinct_inputs() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        let s1 = i.intern("hello");
        let s2 = i.intern("world");
        assert!(!ptr::eq(s1.as_ptr(), s2.as_ptr()));
        assert_eq!(i.unique_strings(), 2);
    }

    #[test]
    fn intern_handles_empty_string() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        let s1 = i.intern("");
        let s2 = i.intern("");
        assert_eq!(s1, "");
        assert!(ptr::eq(s1.as_ptr(), s2.as_ptr()));
    }

    #[test]
    fn inline_cache_serves_repeated_calls_without_probe() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        // First call probes; next 99 hit the inline cache.
        for _ in 0..100 {
            i.intern("repeated");
        }
        assert_eq!(i.unique_strings(), 1);
        assert!(i.stats.cache_hits >= 99);
    }

    #[test]
    fn long_strings_bypass_table_but_still_alloc() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        // String beyond INTERN_LENGTH_LIMIT (64 bytes).
        let long: String = "x".repeat(128);
        let s = i.intern(&long);
        assert_eq!(s.len(), 128);
        // First long-string call took the bypass path (no cache hit
        // because the cache was empty). The bypass also primes the
        // inline cache so the second call short-circuits — cache + bypass
        // together cover both calls.
        let _ = i.intern(&long);
        assert_eq!(
            i.stats.long_bypass + i.stats.cache_hits,
            2,
            "every call must be served by either the cache or the bypass path"
        );
        assert_eq!(i.stats.long_bypass, 1, "only the first long call bypasses");
        assert_eq!(i.stats.cache_hits, 1, "second long call hits the cache");
        // No table slot consumed by long strings.
        assert_eq!(i.unique_strings(), 0);

        // A *different* long string forces a fresh bypass even if the
        // cache is primed (cache compares full content).
        let other: String = "y".repeat(128);
        let _ = i.intern(&other);
        assert_eq!(i.stats.long_bypass, 2);
    }

    #[test]
    fn many_unique_strings_trigger_resize() {
        let a = arena();
        let mut i = Interner::with_capacity_in(8, &a);
        // 8-slot table; resize triggers at 7 occupied (7/8 load).
        // Insert 100 unique strings — capacity must grow to >= 128
        // (128 slots hold 100 entries below the 7/8 threshold of 112).
        for k in 0..100 {
            let s = format!("unique-string-{k}");
            i.intern(&s);
        }
        assert_eq!(i.unique_strings(), 100);
        assert!(i.capacity() >= 128);
        assert!(i.stats.resizes >= 4); // 8 -> 16 -> 32 -> 64 -> 128
    }

    #[test]
    fn average_probe_length_stays_low_at_typical_load() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        // Insert 100 unique short strings (well below 7/8 of 256).
        for k in 0..100 {
            let s = format!("k{k}");
            i.intern(&s);
        }
        // Average probe length must stay small for a healthy hash.
        // 100 entries in 256 slots = 39% load, expect <2 probes/lookup.
        assert!(
            i.avg_probe_length() < 2.0,
            "avg probe {} too high — hash function may be degenerate",
            i.avg_probe_length()
        );
    }

    #[test]
    fn aozora_corpus_short_readings_dedup_aggressively() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        // Simulate Aozora corpus pattern: 5 unique short readings
        // hit 200 times each (1000 total interns) — should land 5
        // unique entries with hundreds of cache+table hits.
        let readings = ["の", "に", "を", "で", "が"];
        for _ in 0..200 {
            for r in readings {
                i.intern(r);
            }
        }
        assert_eq!(i.unique_strings(), 5);
        // The dedup ratio (cache + table hits) / total calls should
        // be very high — 5 misses, 995 reuses.
        let reuses = i.stats.cache_hits + i.stats.table_hits;
        assert!(
            reuses >= 995,
            "expected >=995 reuses, got {reuses} (calls={})",
            i.stats.calls
        );
    }

    #[test]
    fn intern_preserves_utf8_bytes_exactly() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        let inputs = ["青梅", "おうめ", "明治の頃", "※[#ほげ]", "🍣"];
        for s in inputs {
            assert_eq!(i.intern(s), s);
        }
        assert_eq!(i.unique_strings(), inputs.len());
    }
}
457}