Skip to main content

aozora_encoding/
gaiji.rs

1//! Gaiji (外字) resolution — mapping `※[#…、mencode]` references
2//! to real Unicode characters.
3//!
4//! Two incoming shapes per the Aozora annotation manual:
5//!
6//! ```text
7//!   ※[#「description」、第3水準1-85-54]    ← JIS X 0213 plane-row-cell
8//!   ※[#「description」、U+XXXX、page-line] ← explicit Unicode codepoint
9//! ```
10//!
11//! The lexer's Phase 3 recogniser (`aozora-lexer::phase3_classify::recognize_gaiji`)
12//! captures `description` and `mencode` verbatim and leaves `ucs = None`;
13//! this module turns that reference into a concrete [`Resolved`] by
14//! consulting two `phf::Map`s compiled into the binary
15//! (one for the single-codepoint majority and one for the 25
16//! combining-sequence cells) and, for `U+XXXX` shaped mencodes,
17//! parsing the hex digits directly.
18//!
19//! ## Why a `Resolved` enum
20//!
21//! 25 cells in JIS X 0213:2004 plane 1 (Ainu か゚ family, IPA tone marks,
22//! a handful of accented Latin) decode to a *combining sequence* — two
23//! Unicode scalars that must travel together. A single `char` cannot
24//! carry them, so the resolved value is either a [`char`] (the
25//! ~99.4% common path) or a `&'static str` borrowed from the
26//! generated combo table. Both variants are `Copy`, so embedding
27//! `Option<Resolved>` in the parser's `Gaiji` payload does not
28//! perturb its `Copy`-able tree.
29//!
30//! ## Lookup order
31//!
32//! 1. **`existing`** — the caller-provided codepoint (e.g. extracted
33//!    by an earlier escape recogniser); short-circuit identity.
34//! 2. **Combo table** — checked first for `mencode` because it is the
35//!    only way to honour a 2-codepoint cell.
36//! 3. **Single-char table** — the bulk path; one perfect-hash probe
37//!    in `.rodata`.
38//! 4. **`U+XXXX` prefix** — `U+` followed by 1–6 hex digits. Parsed
39//!    as a hex integer, validated via [`char::from_u32`].
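//! 5. **Description fallback** — secondary table keyed by the
//!    literal description text (外字注記辞書 entries plus placeholder
//!    shapes like 〓, 〻).
//! 6. **Single-character description** — a description that is itself
//!    exactly one Unicode scalar resolves to that scalar.
//! 7. **None** — unresolved. Renderer falls back to the raw
//!    `description` bytes.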
44//!
45//! ## Why two PHF maps rather than one enum-valued map
46//!
47//! The single-char map is 4 329 entries; the combo map is 25.
48//! Storing the common path as `phf::Map<&str, char>` keeps each value
49//! at 4 bytes (vs 16-byte `&str`) and the cache footprint of the hot
//! lookup path tight. The combo map is consulted first; a miss
//! there costs a single probe.
52
53use core::fmt;
54
55use crate::jisx0213_table::{
56    DESCRIPTION_TO_CHAR, JISX0213_MENCODE_TO_CHAR, JISX0213_MENCODE_TO_STR,
57};
58
/// What a gaiji reference resolved to: one Unicode scalar, or a
/// static string that carries a multi-scalar combining sequence.
///
/// The type is `Copy`, which lets it live inside `Gaiji` without
/// interrupting the `Copy` chain of the parser tree.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Resolved {
    /// The usual outcome — a single Unicode scalar. Covers roughly
    /// 99.4% of JIS X 0213:2004 cells, every `U+XXXX` shaped
    /// mencode, and the description fallback.
    Char(char),
    /// A JIS X 0213 combining-sequence cell (25 entries in plane 1:
    /// the Ainu か゚ family, IPA tone marks, accented Latin). The
    /// string is borrowed from a value of a static `phf::Map`.
    Multi(&'static str),
}
75
76impl Resolved {
77    /// Convenience: write the resolved char(s) into any [`fmt::Write`].
78    /// Renderer / hover / inlay-hint paths all take this shape.
79    ///
80    /// # Errors
81    /// Propagates the writer's own errors verbatim.
82    pub fn write_to<W: fmt::Write>(self, w: &mut W) -> fmt::Result {
83        match self {
84            Self::Char(c) => w.write_char(c),
85            Self::Multi(s) => w.write_str(s),
86        }
87    }
88
89    /// Returns the resolved single `char` if and only if this is a
90    /// [`Resolved::Char`]. Combo cells return `None`.
91    #[must_use]
92    pub fn as_char(self) -> Option<char> {
93        match self {
94            Self::Char(c) => Some(c),
95            Self::Multi(_) => None,
96        }
97    }
98
99    /// Total UTF-8 length of the resolved value (1..=8 bytes in
100    /// practice).
101    #[must_use]
102    pub fn utf8_len(self) -> usize {
103        match self {
104            Self::Char(c) => c.len_utf8(),
105            Self::Multi(s) => s.len(),
106        }
107    }
108}
109
110/// Pure-function lookup used by `aozora-lexer`'s Phase 3 classifier
111/// to populate `borrowed::Gaiji::ucs` at construction time.
112///
113/// `existing` is the short-circuit for callers that already extracted
114/// a codepoint from the source. Pass `None` to fall through to the
115/// table layers.
116#[must_use]
117pub fn lookup(
118    existing: Option<char>,
119    mencode: Option<&str>,
120    description: &str,
121) -> Option<Resolved> {
122    if let Some(ch) = existing {
123        return Some(Resolved::Char(ch));
124    }
125    if let Some(m) = mencode {
126        // Combo table first: the 25 multi-codepoint cells live only
127        // here. A miss is a single PHF probe — cheap.
128        if let Some(&s) = JISX0213_MENCODE_TO_STR.get(m) {
129            return Some(Resolved::Multi(s));
130        }
131        if let Some(&ch) = JISX0213_MENCODE_TO_CHAR.get(m) {
132            return Some(Resolved::Char(ch));
133        }
134        if let Some(ch) = parse_u_plus(m) {
135            return Some(Resolved::Char(ch));
136        }
137    }
138    if let Some(&ch) = DESCRIPTION_TO_CHAR.get(description) {
139        return Some(Resolved::Char(ch));
140    }
141    // Smart fallback: a description that is *itself* a single
142    // character resolves to that character. Common in real corpora
143    // when the author CAN type the kanji (e.g. on a modern IME) but
144    // wants the reader to see a `※[#…]` annotation pointing at the
145    // JIS source. Mencode/dictionary tiers above already short-
146    // circuited any case where the table had a more specific answer,
147    // so this only fires when description is a one-glyph payload
148    // and nothing else matched.
149    //
150    // Counts grapheme clusters by Unicode scalars: a base-plus-
151    // combining sequence (e.g. アクセント分解) returns >1 char and
152    // falls through to the final `None`. Surrogate halves can't
153    // appear in `&str` so single-`char` is unambiguous here.
154    let mut chars = description.chars();
155    if let Some(only) = chars.next()
156        && chars.next().is_none()
157    {
158        return Some(Resolved::Char(only));
159    }
160    None
161}
162
/// Parse a `U+XXXX` style mencode — the literal `U+` prefix followed
/// by 1 to 6 ASCII hex digits (upper or lower case) — into a Unicode
/// scalar.
///
/// Returns `None`, rather than panicking, when:
///
/// * the `U+` prefix is missing,
/// * the digit run is empty or longer than six characters,
/// * the digit run contains anything other than ASCII hex digits
///   (including a sign character), or
/// * the value is not a Unicode scalar per [`char::from_u32`], i.e.
///   it is a surrogate or lies above `U+10FFFF`.
///
/// Non-characters such as `U+FFFE` are valid scalars and are
/// accepted. Malformed input therefore falls cleanly through to
/// whatever fallback the caller tries next.
#[must_use]
fn parse_u_plus(mencode: &str) -> Option<char> {
    let hex = mencode.strip_prefix("U+")?;
    // Validate the digits explicitly: `u32::from_str_radix` tolerates
    // a leading `+` sign, so without this check `U++41` would resolve
    // to 'A'. The length bound rejects empty and oversized runs that
    // cannot name a Unicode scalar.
    let well_formed =
        (1..=6).contains(&hex.len()) && hex.bytes().all(|b| b.is_ascii_hexdigit());
    if !well_formed {
        return None;
    }
    let code = u32::from_str_radix(hex, 16).ok()?;
    char::from_u32(code)
}
179
180// Gaiji descriptions (the text inside `「…」`) that resolve to a
181// canonical character without depending on the mencode tail. Sourced
182// from `crates/aozora-encoding/data/aozora-gaiji-chuki.tsv` (the
183// official 8th-edition 外字注記辞書, ~8 800 entries) plus
184// `aozora-gaiji-special.tsv` (hand-curated 〓 / 〻 placeholders).
// Generated by `xtask gaiji-gen` and exported from
// `crate::jisx0213_table::DESCRIPTION_TO_CHAR` (imported at the top
// of this module).
188
189/// Pretty-printer for tests and diagnostics. Returns
190/// `(single_char_count, combo_count, description_count)`.
191#[must_use]
192pub fn table_sizes() -> (usize, usize, usize) {
193    (
194        JISX0213_MENCODE_TO_CHAR.len(),
195        JISX0213_MENCODE_TO_STR.len(),
196        DESCRIPTION_TO_CHAR.len(),
197    )
198}
199
#[cfg(test)]
mod tests {
    //! Unit tests for gaiji resolution: each lookup tier, the
    //! `U+XXXX` parser's rejection cases, the `Resolved` helpers,
    //! and the pinned sizes of the generated tables.

    use super::*;

    #[test]
    fn lookup_prefers_existing_ucs_when_already_set() {
        // The "existing" short-circuit returns the caller-provided
        // codepoint without consulting either table.
        assert_eq!(
            lookup(Some('\u{1234}'), Some("第3水準1-85-54"), "木+吶のつくり"),
            Some(Resolved::Char('\u{1234}'))
        );
    }

    #[test]
    fn lookup_via_mencode_table_when_ucs_missing() {
        // 罪と罰 fixture: `木+吶のつくり` with 第3水準1-85-54.
        // Per JIS X 0213:2004 plane 1, row 85, cell 54 = 枘 (U+6798).
        // ("吶のつくり" = right-side component of 吶 = 内, so 木+内 = 枘.)
        assert_eq!(
            lookup(None, Some("第3水準1-85-54"), "木+吶のつくり"),
            Some(Resolved::Char('\u{6798}'))
        );
    }

    #[test]
    fn lookup_via_combo_table_returns_multi() {
        // 第3水準1-4-87 = か゚ = U+304B U+309A (combining handakuten).
        // The combo path is the *only* way to honour these 25 cells.
        assert_eq!(
            lookup(None, Some("第3水準1-4-87"), ""),
            Some(Resolved::Multi("\u{304B}\u{309A}"))
        );
    }

    #[test]
    fn combo_resolution_writes_both_codepoints() {
        // End-to-end: combo lookup + write_to should yield the full
        // 2-codepoint sequence (6 UTF-8 bytes for か + handakuten).
        let resolved = lookup(None, Some("第3水準1-4-87"), "").expect("combo resolves");
        let mut s = String::new();
        resolved
            .write_to(&mut s)
            .expect("write to String never fails");
        assert_eq!(s, "\u{304B}\u{309A}");
        assert_eq!(s.chars().count(), 2);
    }

    #[test]
    fn lookup_via_u_plus_form() {
        assert_eq!(
            lookup(None, Some("U+01F5"), "Latin Small Letter G With Acute"),
            Some(Resolved::Char('\u{01F5}'))
        );
        // Hex digits are accepted in either case.
        assert_eq!(
            lookup(None, Some("U+01f5"), "Latin Small Letter G With Acute"),
            Some(Resolved::Char('\u{01F5}'))
        );
    }

    #[test]
    fn lookup_via_u_plus_max_six_hex_digits() {
        // U+10FFFF is the Unicode max; any shape past 6 digits is rejected.
        assert_eq!(
            lookup(None, Some("U+10FFFF"), ""),
            Some(Resolved::Char('\u{10FFFF}'))
        );
    }

    #[test]
    fn lookup_rejects_u_plus_beyond_six_hex_digits() {
        // Seven digits: one more than the longest valid form.
        assert_eq!(lookup(None, Some("U+1234567"), ""), None);
    }

    #[test]
    fn lookup_rejects_u_plus_above_unicode_max() {
        // Six digits, but one past U+10FFFF — not a Unicode scalar.
        assert_eq!(lookup(None, Some("U+110000"), ""), None);
    }

    #[test]
    fn lookup_rejects_u_plus_surrogate() {
        assert_eq!(lookup(None, Some("U+D800"), ""), None);
    }

    #[test]
    fn lookup_rejects_u_plus_non_hex() {
        assert_eq!(lookup(None, Some("U+GG12"), ""), None);
    }

    #[test]
    fn lookup_rejects_u_plus_without_digits() {
        assert_eq!(lookup(None, Some("U+"), ""), None);
    }

    #[test]
    fn lookup_via_description_fallback_when_mencode_absent() {
        assert_eq!(lookup(None, None, "〓"), Some(Resolved::Char('\u{3013}')));
    }

    #[test]
    fn lookup_returns_none_when_all_paths_miss() {
        // Mencode that matches no table or `U+` shape AND a
        // multi-char description that is not in the dictionary →
        // no resolution.
        assert_eq!(
            lookup(None, Some("not-in-any-table"), "unresolved gaiji"),
            None
        );
    }

    #[test]
    fn lookup_falls_back_to_description_self_when_single_char() {
        // 丂 is in the JIS X 0213 plane 2 table at row 1 cell 2 — but
        // a real-world author wrote `※[#「丂」、第4水準2-16-1]` with a
        // mencode that doesn't exist in the table. The description IS
        // the kanji itself, so the smart fallback resolves to it.
        assert_eq!(
            lookup(None, Some("第4水準2-16-1"), "丂"),
            Some(Resolved::Char('\u{4E02}'))
        );
        // Same for descriptions with no mencode at all.
        assert_eq!(lookup(None, None, "畺"), Some(Resolved::Char('\u{757A}')));
        assert_eq!(lookup(None, None, "龔"), Some(Resolved::Char('\u{9F94}')));
    }

    #[test]
    fn single_char_fallback_does_not_override_dictionary_hit() {
        // `〓` is in the special-placeholder table mapping to
        // `〓 U+3013`. (Yes, that's a no-op mapping, but it exercises
        // the dictionary path winning over the single-char fallback.)
        // If the fallback fired in spite of the table hit, the
        // dictionary's value would still match here — so the contract
        // is "fallback only fires when nothing else matched".
        assert_eq!(lookup(None, None, "〓"), Some(Resolved::Char('\u{3013}')));
    }

    #[test]
    fn single_char_fallback_does_not_fire_for_multi_char_descriptions() {
        // Multi-char description not in any table → must still be None.
        // Confirms the fallback requires exactly one Unicode scalar.
        assert_eq!(lookup(None, None, "未知の字形"), None);
        assert_eq!(lookup(None, None, "ab"), None);
    }

    #[test]
    fn mencode_table_covers_the_fixture_gaiji() {
        // Pin the corrected 罪と罰 fixture mapping (枘 U+6798, not the
        // pre-regen hand-seed's wrong U+6903 椃).
        assert_eq!(
            JISX0213_MENCODE_TO_CHAR.get("第3水準1-85-54"),
            Some(&'\u{6798}')
        );
    }

    #[test]
    fn table_sizes_match_jisx0213_2004_spec() {
        // Pinned against the JIS X 0213:2004 normative count + the
        // 外字注記辞書 8th edition (8 881 entries) + 2 hand-curated
        // specials (〓 / 〻). Both data sources are checked into
        // `crates/aozora-encoding/data/`.
        use crate::jisx0213_table::{
            DESCRIPTION_COUNT, JISX0213_COMBO_COUNT, JISX0213_PLANE1_COUNT, JISX0213_PLANE2_COUNT,
        };
        let (single, combo, description) = table_sizes();
        assert_eq!(single, JISX0213_PLANE1_COUNT + JISX0213_PLANE2_COUNT);
        assert_eq!(combo, JISX0213_COMBO_COUNT);
        assert_eq!(description, DESCRIPTION_COUNT);
        assert_eq!(
            JISX0213_PLANE1_COUNT, 1893,
            "第3水準 must equal the spec count",
        );
        assert_eq!(
            JISX0213_PLANE2_COUNT, 2436,
            "第4水準 must equal the spec count",
        );
        assert_eq!(
            JISX0213_COMBO_COUNT, 25,
            "combining-sequence cells must equal spec",
        );
        assert!(
            description >= 8_000,
            "description-fallback table looks too small ({description}) — \
             did the gaiji-chuki extraction drop entries?",
        );
    }

    #[test]
    fn description_table_resolves_a_known_dictionary_entry() {
        // 「木+吶のつくり」 is a hallmark fixture description for 枘
        // (U+6798, JIS X 0213 plane 1 row 85 cell 54). The dictionary
        // path resolves the same character as the mencode path, so a
        // test with description-only (no mencode) must hit U+6798.
        assert_eq!(
            lookup(None, None, "木+吶のつくり"),
            Some(Resolved::Char('\u{6798}')),
        );
    }

    #[test]
    fn description_table_preserves_special_placeholders() {
        // 〓 / 〻 are hand-curated specials kept in
        // `aozora-gaiji-special.tsv` and merged into the generated map.
        assert_eq!(lookup(None, None, "〓"), Some(Resolved::Char('\u{3013}')));
        assert_eq!(lookup(None, None, "〻"), Some(Resolved::Char('\u{303B}')));
    }

    #[test]
    fn full_jisx0213_table_covers_a_known_plane1_third_tier_kanji() {
        // 第3水準1-85-9 = 敧 (U+6567) per JIS X 0213:2004.
        assert_eq!(
            JISX0213_MENCODE_TO_CHAR.get("第3水準1-85-9"),
            Some(&'\u{6567}')
        );
    }

    #[test]
    fn full_jisx0213_table_covers_a_known_plane2_fourth_tier_entry() {
        // 第4水準2-1-1 = 𠂉 (U+20089) — first plane-2 cell.
        assert_eq!(
            JISX0213_MENCODE_TO_CHAR.get("第4水準2-1-1"),
            Some(&'\u{20089}')
        );
    }

    #[test]
    fn resolved_utf8_len_matches_actual_encoding() {
        assert_eq!(Resolved::Char('A').utf8_len(), 1);
        assert_eq!(Resolved::Char('あ').utf8_len(), 3);
        assert_eq!(Resolved::Char('𠂉').utf8_len(), 4);
        assert_eq!(Resolved::Multi("\u{304B}\u{309A}").utf8_len(), 6);
    }

    #[test]
    fn resolved_as_char_returns_none_for_combos() {
        assert_eq!(Resolved::Char('A').as_char(), Some('A'));
        assert_eq!(Resolved::Multi("か゚").as_char(), None);
    }

    #[test]
    fn lookup_is_identity_on_the_ucs_input_when_set() {
        // The "existing" short-circuit honours the caller-provided
        // scalar without a wasted table probe.
        assert_eq!(
            lookup(Some('あ'), Some("anything"), "anything"),
            Some(Resolved::Char('あ'))
        );
    }
}