Skip to main content

aozora_syntax/borrowed/
intern.rs

//! Arena-backed string interner.
//!
//! Open-addressing hash table over `&'a str` slots whose payloads
//! point into a paired [`Arena`]. The first call to [`Interner::intern`]
//! with a given byte content allocates the string in the arena and
//! records the resulting `&'a str` in the table; every subsequent
//! call with byte-equal content reuses that pointer in O(1) amortised
//! time, with no further allocation.
//!
//! ## Why an interner here
//!
//! Aozora corpora are dense in **short, frequently repeated strings**
//! — single-mora ruby readings (`の`, `に`, `を`, `で`, `が`), bouten
//! kind keywords, container kind labels, kaeriten marks. A naive
//! `arena.alloc_str(s)` for every visit copies the same bytes into
//! the arena dozens to hundreds of times per document. Interning
//! collapses those duplicates into a single allocation; the table's
//! per-string overhead is one `&str` slot (16 bytes) plus a fraction
//! of probe load, which is more than recovered by the eliminated
//! duplicate-byte allocations on any document with non-trivial
//! repetition.
//!
//! ## Algorithm
//!
//! - Open addressing with linear probing.
//! - Power-of-two capacity (initial 256) so slot index is a single
//!   bitmask, no `%` modulo.
//! - Resize at 7/8 load factor; growth doubles capacity and rebuilds
//!   the table via fresh probing.
//! - **FxHash-style rotate-xor-then-multiply** mix on the byte stream:
//!   one rotate + xor + `wrapping_mul` per byte. Fast on short
//!   strings (< 32 bytes — the dominant length class for Aozora ruby
//!   readings); no per-call state allocation that std `SipHash` would
//!   incur.
//! - **Inline cache** for the most recent intern result. Long runs
//!   of identical interns (e.g. 100 consecutive `《の》` ruby readings
//!   in a poem) skip both the hash and the probe.
//! - **Length threshold**: strings longer than 64 bytes bypass the
//!   table entirely and go straight to `arena.alloc_str`. The hash +
//!   probe cost is dominated by the byte scan for long strings, and
//!   long strings are very rarely repeated in practice (raw
//!   annotations, gaiji descriptions). Skipping them avoids
//!   polluting the table with unique entries that will never be
//!   queried again.
//!
//! ## Memory model
//!
//! The interner does not own the underlying arena — it borrows it
//! immutably. This is sound because `Arena::alloc_str` takes
//! `&self` (bumpalo allows interior mutation through a shared
//! reference). A single arena can therefore back multiple interners
//! (e.g. one per worker in a parallel parse path), each with its own
//! probe table.

55use bumpalo::collections::Vec as BumpVec;
56
57use super::arena::Arena;
58
/// FxHash-style mix constant. The same constant rustc uses internally
/// for `FxHasher`; chosen for fast diffusion on short inputs.
const FX_PRIME: u64 = 0x517c_c1b7_2722_0a95;

/// Length threshold beyond which strings bypass the intern table and
/// go straight to the arena.
/// Picked from corpus profile: ruby readings, kaeriten marks, and
/// container kind labels are all under 32 bytes; raw annotations and
/// gaiji descriptions can exceed 64 bytes and almost never repeat.
const INTERN_LENGTH_LIMIT: usize = 64;

/// Initial table capacity. Power of two so probe-index = `hash & mask`.
/// 256 covers the typical small-to-medium document without growth;
/// large (multi-MB) documents grow once or twice on the way to ~2048.
const INITIAL_CAPACITY: usize = 256;
73
/// Open-addressing intern table over arena-allocated strings.
///
/// Borrows its [`Arena`] immutably; all storage (string payloads and
/// the probe table itself) lives inside that arena.
#[derive(Debug)]
pub struct Interner<'a> {
    arena: &'a Arena,
    /// Slots: `None` = empty, `Some(&'a str)` = occupied.
    /// `BumpVec` keeps the table itself inside the arena, so the
    /// interner allocates exactly zero bytes outside the arena.
    table: BumpVec<'a, Option<&'a str>>,
    /// `capacity - 1`. `capacity` is always a power of two, so this
    /// doubles as the probe-index bitmask.
    mask: usize,
    /// Number of occupied slots.
    occupied: usize,
    /// Inline cache: last successfully-interned string. Long runs of
    /// identical interns short-circuit on this single pointer compare.
    ///
    /// A 2-slot LRU cache (intended to catch Ruby's alternating
    /// `(base, reading, base, reading, …)` access pattern) was tried
    /// and reverted. Corpus dedup ratio stayed at p50 0.275 /
    /// mean 0.308 (identical to the 1-slot baseline), throughput
    /// moved within noise. The pattern that would benefit —
    /// consecutive rubies sharing a base or reading
    /// — is rarer than the design assumed; distinct rubies on
    /// distinct words dominate.
    last: Option<&'a str>,
    /// Diagnostic counters. Updated on every intern call. Useful for
    /// benchmarks and the corpus-sweep dedup-ratio report.
    pub stats: InternStats,
}
102
/// Diagnostic counters surfaced by [`Interner::stats`].
#[derive(Debug, Clone, Copy, Default)]
pub struct InternStats {
    /// Total `intern` calls (every entry into the API).
    pub calls: u64,
    /// Calls served from the inline cache (no hash, no probe).
    pub cache_hits: u64,
    /// Calls that landed on an existing table entry (no allocation).
    pub table_hits: u64,
    /// Calls that allocated a new entry into the arena (includes the
    /// long-string bypass path).
    pub allocs: u64,
    /// Calls that bypassed the table because the string exceeded
    /// `INTERN_LENGTH_LIMIT` — counted as an alloc as well.
    pub long_bypass: u64,
    /// Total resize events the table performed.
    pub resizes: u64,
    /// Total probe steps walked across all `intern` calls. See
    /// [`Interner::avg_probe_length`] for the derived average probe
    /// length, the canonical hash-table health metric.
    pub probe_steps: u64,
}
124
125impl<'a> Interner<'a> {
126    /// Empty interner backed by `arena`. Initial capacity is
127    /// `INITIAL_CAPACITY`.
128    #[must_use]
129    pub fn new_in(arena: &'a Arena) -> Self {
130        Self::with_capacity_in(INITIAL_CAPACITY, arena)
131    }
132
133    /// Empty interner backed by `arena`, with the table pre-sized to
134    /// at least `capacity_hint` slots (rounded up to a power of two).
135    /// Use when the caller can estimate the unique-string count up
136    /// front (e.g. the lex driver knows the registry size).
137    #[must_use]
138    pub fn with_capacity_in(capacity_hint: usize, arena: &'a Arena) -> Self {
139        let cap = capacity_hint.next_power_of_two().max(8);
140        let mut table = BumpVec::with_capacity_in(cap, arena.bump());
141        table.resize(cap, None);
142        Self {
143            arena,
144            table,
145            mask: cap - 1,
146            occupied: 0,
147            last: None,
148            stats: InternStats::default(),
149        }
150    }
151
152    /// Intern `s` and return a stable `&'a str` pointer to the arena
153    /// copy. Subsequent calls with byte-equal content return the same
154    /// pointer.
155    pub fn intern(&mut self, s: &str) -> &'a str {
156        self.stats.calls += 1;
157
158        // Inline cache: long identical-intern runs short-circuit on
159        // a single pointer-content compare. Equality on `&str`
160        // compares lengths first then bytes, which is fast for the
161        // typical mismatch.
162        if let Some(cached) = self.last
163            && cached == s
164        {
165            self.stats.cache_hits += 1;
166            return cached;
167        }
168
169        let bytes = s.as_bytes();
170
171        // Length threshold bypass — long strings rarely repeat and
172        // hashing them costs more than the alloc they would save.
173        if bytes.len() > INTERN_LENGTH_LIMIT {
174            self.stats.long_bypass += 1;
175            self.stats.allocs += 1;
176            let dst = self.arena.alloc_str(s);
177            self.last = Some(dst);
178            return dst;
179        }
180
181        let hash = fx_hash(bytes);
182
183        // Grow before probe if load factor would exceed 7/8.
184        // Power-of-two table size makes load-factor check a single
185        // multiply + compare; no division.
186        if self.occupied.saturating_mul(8) >= self.table.len().saturating_mul(7) {
187            self.grow();
188        }
189
190        #[allow(
191            clippy::cast_possible_truncation,
192            reason = "low bits of u64 hash extracted as usize on purpose"
193        )]
194        let mut idx = (hash as usize) & self.mask;
195        loop {
196            self.stats.probe_steps += 1;
197            match self.table[idx] {
198                Some(existing) if existing == s => {
199                    self.stats.table_hits += 1;
200                    self.last = Some(existing);
201                    return existing;
202                }
203                None => {
204                    let dst = self.arena.alloc_str(s);
205                    self.table[idx] = Some(dst);
206                    self.occupied += 1;
207                    self.stats.allocs += 1;
208                    self.last = Some(dst);
209                    return dst;
210                }
211                Some(_) => idx = (idx + 1) & self.mask,
212            }
213        }
214    }
215
216    /// Number of unique strings currently held in the table.
217    #[must_use]
218    pub fn unique_strings(&self) -> usize {
219        self.occupied
220    }
221
222    /// Current table capacity.
223    #[must_use]
224    pub fn capacity(&self) -> usize {
225        self.table.len()
226    }
227
228    /// Average probe length per non-cache-hit lookup. Returns `0.0`
229    /// when no probed lookups have occurred. Used by benchmarks to
230    /// confirm the hash function and load-factor policy keep the
231    /// table healthy.
232    #[must_use]
233    pub fn avg_probe_length(&self) -> f64 {
234        let probed = self.stats.calls.saturating_sub(self.stats.cache_hits);
235        if probed == 0 {
236            0.0
237        } else {
238            #[allow(
239                clippy::cast_precision_loss,
240                reason = "probe count fits in f64 mantissa for any plausible workload"
241            )]
242            let avg = self.stats.probe_steps as f64 / probed as f64;
243            avg
244        }
245    }
246
247    /// Doubles capacity and rebuilds the table via fresh probing.
248    fn grow(&mut self) {
249        let new_cap = self.table.len().saturating_mul(2);
250        let new_mask = new_cap - 1;
251        let mut new_table: BumpVec<'a, Option<&'a str>> =
252            BumpVec::with_capacity_in(new_cap, self.arena.bump());
253        new_table.resize(new_cap, None);
254        for s in self.table.iter().copied().flatten() {
255            let h = fx_hash(s.as_bytes());
256            #[allow(
257                clippy::cast_possible_truncation,
258                reason = "low bits of u64 hash extracted as usize on purpose"
259            )]
260            let mut idx = (h as usize) & new_mask;
261            while new_table[idx].is_some() {
262                idx = (idx + 1) & new_mask;
263            }
264            new_table[idx] = Some(s);
265        }
266        self.table = new_table;
267        self.mask = new_mask;
268        self.stats.resizes += 1;
269    }
270}
271
272/// `wrapping_mul`-and-xor mix loop. Fast on short inputs (the dominant
273/// case for Aozora ruby readings); avoids the per-call state setup
274/// cost of std `SipHash`.
275///
276/// An 8-byte-chunk fast path with an xxHash-style avalanche was
277/// tried and reverted. For the typical
278/// 3-byte single-codepoint reading the avalanche's two extra
279/// multiplications cost more than the per-byte loop saves; corpus
280/// throughput moved within noise (-4 % to +2 % depending on band).
281/// The byte loop fits in a few cycles on short inputs and is hard to
282/// beat without a different hash family. Keeping the simple shape.
283#[inline]
284fn fx_hash(bytes: &[u8]) -> u64 {
285    let mut h: u64 = 0;
286    for &b in bytes {
287        h = h.rotate_left(5) ^ u64::from(b);
288        h = h.wrapping_mul(FX_PRIME);
289    }
290    h
291}
292
// Bump access for structures that need their own arena-backed storage
// (e.g. the [`Interner`]'s probe table) lives as an inherent method
// on Arena in `arena.rs`.
296
#[cfg(test)]
mod tests {
    use core::ptr;

    use super::*;

    // Fresh arena per test; the interner borrows it for the test body.
    fn arena() -> Arena {
        Arena::new()
    }

    #[test]
    fn empty_interner_has_no_unique_strings() {
        let a = arena();
        let i = Interner::new_in(&a);
        assert_eq!(i.unique_strings(), 0);
        assert_eq!(i.capacity(), INITIAL_CAPACITY);
    }

    #[test]
    fn intern_returns_stable_pointer_for_byte_equal_input() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        let s1 = i.intern("hello");
        let s2 = i.intern("hello");
        assert!(
            ptr::eq(s1.as_ptr(), s2.as_ptr()),
            "byte-equal intern must return same arena pointer"
        );
        assert_eq!(i.unique_strings(), 1);
    }

    #[test]
    fn intern_returns_distinct_pointers_for_distinct_inputs() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        let s1 = i.intern("hello");
        let s2 = i.intern("world");
        assert!(!ptr::eq(s1.as_ptr(), s2.as_ptr()));
        assert_eq!(i.unique_strings(), 2);
    }

    #[test]
    fn intern_handles_empty_string() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        let s1 = i.intern("");
        let s2 = i.intern("");
        assert_eq!(s1, "");
        assert!(ptr::eq(s1.as_ptr(), s2.as_ptr()));
    }

    #[test]
    fn inline_cache_serves_repeated_calls_without_probe() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        // First call probes; next 99 hit the inline cache.
        for _ in 0..100 {
            i.intern("repeated");
        }
        assert_eq!(i.unique_strings(), 1);
        assert!(i.stats.cache_hits >= 99);
    }

    #[test]
    fn long_strings_bypass_table_but_still_alloc() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        // String beyond INTERN_LENGTH_LIMIT (64 bytes).
        let long: String = "x".repeat(128);
        let s = i.intern(&long);
        assert_eq!(s.len(), 128);
        // First long-string call took the bypass path (no cache hit
        // because the cache was empty). The bypass also primes the
        // inline cache so the second call short-circuits — cache + bypass
        // together cover both calls.
        let _ = i.intern(&long);
        assert_eq!(
            i.stats.long_bypass + i.stats.cache_hits,
            2,
            "every call must be served by either the cache or the bypass path"
        );
        assert_eq!(i.stats.long_bypass, 1, "only the first long call bypasses");
        assert_eq!(i.stats.cache_hits, 1, "second long call hits the cache");
        // No table slot consumed by long strings.
        assert_eq!(i.unique_strings(), 0);

        // A *different* long string forces a fresh bypass even if the
        // cache is primed (cache compares full content).
        let other: String = "y".repeat(128);
        let _ = i.intern(&other);
        assert_eq!(i.stats.long_bypass, 2);
    }

    #[test]
    fn many_unique_strings_trigger_resize() {
        let a = arena();
        let mut i = Interner::with_capacity_in(8, &a);
        // 8-slot table; a resize fires when occupied*8 >= capacity*7.
        // Inserting 100 unique strings grows 8 -> 16 -> 32 -> 64 -> 128
        // (triggered at 7, 14, 28, 56 occupied), so capacity must reach
        // at least 128 via at least 4 resizes.
        for k in 0..100 {
            let s = format!("unique-string-{k}");
            i.intern(&s);
        }
        assert_eq!(i.unique_strings(), 100);
        assert!(i.capacity() >= 128);
        assert!(i.stats.resizes >= 4); // 8 -> 16 -> 32 -> 64 -> 128
    }

    #[test]
    fn average_probe_length_stays_low_at_typical_load() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        // Insert 100 unique short strings (well below 7/8 of 256).
        for k in 0..100 {
            let s = format!("k{k}");
            i.intern(&s);
        }
        // Average probe length must stay small for a healthy hash.
        // 100 entries in 256 slots = 39% load, expect <2 probes/lookup.
        assert!(
            i.avg_probe_length() < 2.0,
            "avg probe {} too high — hash function may be degenerate",
            i.avg_probe_length()
        );
    }

    #[test]
    fn aozora_corpus_short_readings_dedup_aggressively() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        // Simulate Aozora corpus pattern: 5 unique short readings
        // hit 200 times each (1000 total interns) — should land 5
        // unique entries with hundreds of cache+table hits.
        let readings = ["の", "に", "を", "で", "が"];
        for _ in 0..200 {
            for r in readings {
                i.intern(r);
            }
        }
        assert_eq!(i.unique_strings(), 5);
        // The dedup ratio (cache + table hits) / total calls should
        // be very high — 5 misses, 995 reuses.
        let reuses = i.stats.cache_hits + i.stats.table_hits;
        assert!(
            reuses >= 995,
            "expected >=995 reuses, got {reuses} (calls={})",
            i.stats.calls
        );
    }

    #[test]
    fn intern_preserves_utf8_bytes_exactly() {
        let a = arena();
        let mut i = Interner::new_in(&a);
        let inputs = ["青梅", "おうめ", "明治の頃", "※[#ほげ]", "🍣"];
        for s in inputs {
            assert_eq!(i.intern(s), s);
        }
        assert_eq!(i.unique_strings(), inputs.len());
    }
}
457}