aozora_encoding/gaiji.rs
1//! Gaiji (外字) resolution — mapping `※[#…、mencode]` references
2//! to real Unicode characters.
3//!
4//! Two incoming shapes per the Aozora annotation manual:
5//!
6//! ```text
7//! ※[#「description」、第3水準1-85-54] ← JIS X 0213 plane-row-cell
8//! ※[#「description」、U+XXXX、page-line] ← explicit Unicode codepoint
9//! ```
10//!
11//! The lexer's Phase 3 recogniser (`aozora-lexer::phase3_classify::recognize_gaiji`)
12//! captures `description` and `mencode` verbatim and leaves `ucs = None`;
13//! this module turns that reference into a concrete [`Resolved`] by
14//! consulting two `phf::Map`s compiled into the binary
15//! (one for the single-codepoint majority and one for the 25
16//! combining-sequence cells) and, for `U+XXXX` shaped mencodes,
17//! parsing the hex digits directly.
18//!
19//! ## Why a `Resolved` enum
20//!
21//! 25 cells in JIS X 0213:2004 plane 1 (Ainu か゚ family, IPA tone marks,
22//! a handful of accented Latin) decode to a *combining sequence* — two
23//! Unicode scalars that must travel together. A single `char` cannot
24//! carry them, so the resolved value is either a [`char`] (the
25//! ~99.4% common path) or a `&'static str` borrowed from the
26//! generated combo table. Both variants are `Copy`, so embedding
27//! `Option<Resolved>` in the parser's `Gaiji` payload does not
28//! perturb its `Copy`-able tree.
29//!
30//! ## Lookup order
31//!
32//! 1. **`existing`** — the caller-provided codepoint (e.g. extracted
33//! by an earlier escape recogniser); short-circuit identity.
34//! 2. **Combo table** — checked first for `mencode` because it is the
35//! only way to honour a 2-codepoint cell.
36//! 3. **Single-char table** — the bulk path; one perfect-hash probe
37//! in `.rodata`.
38//! 4. **`U+XXXX` prefix** — `U+` followed by 1–6 hex digits. Parsed
39//! as a hex integer, validated via [`char::from_u32`].
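//! 5. **Description table** — secondary table keyed by the literal
//!    description text (the 外字注記辞書 entries plus hand-curated
//!    placeholders such as 〓, 〻).
//! 6. **Single-character description** — a description that is itself
//!    exactly one Unicode scalar resolves to that scalar.
//! 7. **None** — unresolved. Renderer falls back to the raw
//!    `description` bytes.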
44//!
45//! ## Why two PHF maps rather than one enum-valued map
46//!
47//! The single-char map is 4 329 entries; the combo map is 25.
48//! Storing the common path as `phf::Map<&str, char>` keeps each value
49//! at 4 bytes (vs 16-byte `&str`) and the cache footprint of the hot
//! lookup path tight. The combo map is probed first for every mencode;
//! a miss there costs a single extra probe.
52
53use core::fmt;
54
55use crate::jisx0213_table::{
56 DESCRIPTION_TO_CHAR, JISX0213_MENCODE_TO_CHAR, JISX0213_MENCODE_TO_STR,
57};
58
/// The Unicode form a gaiji reference resolved to: either one scalar
/// or a static string holding a combining sequence.
///
/// The type is `Copy` so that it can be embedded in `Gaiji` without
/// breaking the `Copy` chain of the parser tree.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Resolved {
    /// The usual outcome: a single Unicode scalar. Covers ~99.4% of
    /// JIS X 0213:2004 cells as well as every `U+XXXX` mencode and
    /// every description-based resolution.
    Char(char),
    /// A JIS X 0213 combining-sequence cell — one of the 25 plane-1
    /// entries (Ainu か゚ family, IPA tone marks, accented Latin). The
    /// string is borrowed from a value of a static `phf::Map`.
    Multi(&'static str),
}

impl Resolved {
    /// Emit the resolved text into any [`fmt::Write`] sink. This is
    /// the shape the renderer, hover and inlay-hint paths consume.
    ///
    /// # Errors
    /// Whatever error the writer reports is returned unchanged.
    pub fn write_to<W: fmt::Write>(self, w: &mut W) -> fmt::Result {
        match self {
            Self::Multi(sequence) => w.write_str(sequence),
            Self::Char(scalar) => w.write_char(scalar),
        }
    }

    /// The resolved scalar when this is a [`Resolved::Char`]; `None`
    /// for a combining-sequence ([`Resolved::Multi`]) value.
    #[must_use]
    pub fn as_char(self) -> Option<char> {
        if let Self::Char(scalar) = self {
            Some(scalar)
        } else {
            None
        }
    }

    /// Number of bytes the resolved value occupies in UTF-8
    /// (1..=8 in practice).
    #[must_use]
    pub fn utf8_len(self) -> usize {
        match self {
            Self::Multi(sequence) => sequence.len(),
            Self::Char(scalar) => scalar.len_utf8(),
        }
    }
}
109
110/// Pure-function lookup used by `aozora-lexer`'s Phase 3 classifier
111/// to populate `borrowed::Gaiji::ucs` at construction time.
112///
113/// `existing` is the short-circuit for callers that already extracted
114/// a codepoint from the source. Pass `None` to fall through to the
115/// table layers.
116#[must_use]
117pub fn lookup(
118 existing: Option<char>,
119 mencode: Option<&str>,
120 description: &str,
121) -> Option<Resolved> {
122 if let Some(ch) = existing {
123 return Some(Resolved::Char(ch));
124 }
125 if let Some(m) = mencode {
126 // Combo table first: the 25 multi-codepoint cells live only
127 // here. A miss is a single PHF probe — cheap.
128 if let Some(&s) = JISX0213_MENCODE_TO_STR.get(m) {
129 return Some(Resolved::Multi(s));
130 }
131 if let Some(&ch) = JISX0213_MENCODE_TO_CHAR.get(m) {
132 return Some(Resolved::Char(ch));
133 }
134 if let Some(ch) = parse_u_plus(m) {
135 return Some(Resolved::Char(ch));
136 }
137 }
138 if let Some(&ch) = DESCRIPTION_TO_CHAR.get(description) {
139 return Some(Resolved::Char(ch));
140 }
141 // Smart fallback: a description that is *itself* a single
142 // character resolves to that character. Common in real corpora
143 // when the author CAN type the kanji (e.g. on a modern IME) but
144 // wants the reader to see a `※[#…]` annotation pointing at the
145 // JIS source. Mencode/dictionary tiers above already short-
146 // circuited any case where the table had a more specific answer,
147 // so this only fires when description is a one-glyph payload
148 // and nothing else matched.
149 //
150 // Counts grapheme clusters by Unicode scalars: a base-plus-
151 // combining sequence (e.g. アクセント分解) returns >1 char and
152 // falls through to the final `None`. Surrogate halves can't
153 // appear in `&str` so single-`char` is unambiguous here.
154 let mut chars = description.chars();
155 if let Some(only) = chars.next()
156 && chars.next().is_none()
157 {
158 return Some(Resolved::Char(only));
159 }
160 None
161}
162
/// Parse a `U+XXXX` style mencode — exactly 1 to 6 ASCII hex digits
/// (either case) after the literal `U+` prefix — into a Unicode
/// scalar value.
///
/// Returns `None`, never panicking, when:
///
/// * the `U+` prefix is missing,
/// * the digit run is empty or longer than six,
/// * any byte after the prefix is not an ASCII hex digit (this
///   includes a sign character such as the second `+` in `U++41`),
/// * the value is not a Unicode scalar — a surrogate or anything
///   above `U+10FFFF` — as decided by [`char::from_u32`].
///
/// Malformed input therefore falls cleanly through to the description
/// fallback. Noncharacters such as `U+FFFE` *are* valid scalars and
/// are returned as-is.
#[must_use]
fn parse_u_plus(mencode: &str) -> Option<char> {
    let hex = mencode.strip_prefix("U+")?;
    // Validate the digits ourselves: `u32::from_str_radix` tolerates a
    // leading `+` sign, which would let `U++41` through as U+0041.
    // Requiring all-ASCII input also makes the byte length equal the
    // digit count, so the 1..=6 bound is exact.
    let well_formed =
        (1..=6).contains(&hex.len()) && hex.bytes().all(|byte| byte.is_ascii_hexdigit());
    if !well_formed {
        return None;
    }
    let code = u32::from_str_radix(hex, 16).ok()?;
    char::from_u32(code)
}
179
180// Gaiji descriptions (the text inside `「…」`) that resolve to a
181// canonical character without depending on the mencode tail. Sourced
182// from `crates/aozora-encoding/data/aozora-gaiji-chuki.tsv` (the
183// official 8th-edition 外字注記辞書, ~8 800 entries) plus
184// `aozora-gaiji-special.tsv` (hand-curated 〓 / 〻 placeholders).
// Generated by `xtask gaiji-gen` and exported from
// `crate::jisx0213_table::DESCRIPTION_TO_CHAR` (imported at the top
// of this module).
188
189/// Pretty-printer for tests and diagnostics. Returns
190/// `(single_char_count, combo_count, description_count)`.
191#[must_use]
192pub fn table_sizes() -> (usize, usize, usize) {
193 (
194 JISX0213_MENCODE_TO_CHAR.len(),
195 JISX0213_MENCODE_TO_STR.len(),
196 DESCRIPTION_TO_CHAR.len(),
197 )
198}
199
/// Unit tests for gaiji resolution: each `lookup` tier, the `U+XXXX`
/// parser's rejection cases, the generated-table size pins, and the
/// `Resolved` helper methods.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lookup_prefers_existing_ucs_when_already_set() {
        // The "existing" short-circuit returns the caller-provided
        // codepoint without consulting either table.
        assert_eq!(
            lookup(Some('\u{1234}'), Some("第3水準1-85-54"), "木+吶のつくり"),
            Some(Resolved::Char('\u{1234}'))
        );
    }

    #[test]
    fn lookup_via_mencode_table_when_ucs_missing() {
        // 罪と罰 fixture: `木+吶のつくり` with 第3水準1-85-54.
        // Per JIS X 0213:2004 plane 1, row 85, cell 54 = 枘 (U+6798).
        // ("吶のつくり" = right-side component of 吶 = 内, so 木+内 = 枘.)
        assert_eq!(
            lookup(None, Some("第3水準1-85-54"), "木+吶のつくり"),
            Some(Resolved::Char('\u{6798}'))
        );
    }

    #[test]
    fn lookup_via_combo_table_returns_multi() {
        // 第3水準1-4-87 = か゚ = U+304B U+309A (combining handakuten).
        // The combo path is the *only* way to honour these 25 cells.
        assert_eq!(
            lookup(None, Some("第3水準1-4-87"), ""),
            Some(Resolved::Multi("\u{304B}\u{309A}"))
        );
    }

    #[test]
    fn combo_resolution_writes_both_codepoints() {
        // End-to-end: combo lookup + write_to should yield the full
        // 2-codepoint sequence (6 UTF-8 bytes for か + handakuten).
        let resolved = lookup(None, Some("第3水準1-4-87"), "").expect("combo resolves");
        let mut s = String::new();
        resolved
            .write_to(&mut s)
            .expect("write to String never fails");
        assert_eq!(s, "\u{304B}\u{309A}");
        assert_eq!(s.chars().count(), 2);
    }

    #[test]
    fn lookup_via_u_plus_form() {
        assert_eq!(
            lookup(None, Some("U+01F5"), "Latin Small Letter G With Acute"),
            Some(Resolved::Char('\u{01F5}'))
        );
    }

    #[test]
    fn lookup_via_u_plus_accepts_lowercase_and_leading_zeros() {
        // Hex digits are case-insensitive, and zero padding up to the
        // six-digit limit is a legal spelling of a small codepoint.
        assert_eq!(
            lookup(None, Some("U+01f5"), ""),
            Some(Resolved::Char('\u{01F5}'))
        );
        assert_eq!(
            lookup(None, Some("U+000041"), ""),
            Some(Resolved::Char('A'))
        );
    }

    #[test]
    fn lookup_via_u_plus_max_six_hex_digits() {
        // U+10FFFF is the Unicode max; any shape past 6 digits is rejected.
        assert_eq!(
            lookup(None, Some("U+10FFFF"), ""),
            Some(Resolved::Char('\u{10FFFF}'))
        );
    }

    #[test]
    fn lookup_rejects_u_plus_beyond_six_hex_digits() {
        // Seven digits: one past the six-digit limit.
        assert_eq!(lookup(None, Some("U+1234567"), ""), None);
    }

    #[test]
    fn lookup_rejects_u_plus_surrogate() {
        assert_eq!(lookup(None, Some("U+D800"), ""), None);
    }

    #[test]
    fn lookup_rejects_u_plus_non_hex() {
        assert_eq!(lookup(None, Some("U+GG12"), ""), None);
    }

    #[test]
    fn lookup_rejects_u_plus_without_digits() {
        assert_eq!(lookup(None, Some("U+"), ""), None);
    }

    #[test]
    fn lookup_via_description_fallback_when_mencode_absent() {
        assert_eq!(lookup(None, None, "〓"), Some(Resolved::Char('\u{3013}')));
    }

    #[test]
    fn lookup_returns_none_when_all_paths_miss() {
        // A mencode that no table knows (and that is not `U+XXXX`)
        // AND a multi-char description outside the dictionary → no
        // resolution.
        assert_eq!(
            lookup(None, Some("not-in-any-table"), "unresolved gaiji"),
            None
        );
    }

    #[test]
    fn lookup_falls_back_to_description_self_when_single_char() {
        // 丂 is in the JIS X 0213 plane 2 table at row 1 cell 2 — but
        // a real-world author wrote `※[#「丂」、第4水準2-16-1]` with a
        // mencode that doesn't exist in the table. The description IS
        // the kanji itself, so the smart fallback resolves to it.
        assert_eq!(
            lookup(None, Some("第4水準2-16-1"), "丂"),
            Some(Resolved::Char('\u{4E02}'))
        );
        // Same for descriptions with no mencode at all.
        assert_eq!(lookup(None, None, "畺"), Some(Resolved::Char('\u{757A}')));
        assert_eq!(lookup(None, None, "龔"), Some(Resolved::Char('\u{9F94}')));
    }

    #[test]
    fn single_char_fallback_does_not_override_dictionary_hit() {
        // `〓` is in the special-placeholder table mapping to
        // `〓 U+3013`. (Yes, that's a no-op mapping, but it exercises
        // the dictionary path winning over the single-char fallback.)
        // If the fallback fired in spite of the table hit, the
        // dictionary's value would still match here — so the contract
        // is "fallback only fires when nothing else matched".
        assert_eq!(lookup(None, None, "〓"), Some(Resolved::Char('\u{3013}')));
    }

    #[test]
    fn single_char_fallback_does_not_fire_for_multi_char_descriptions() {
        // Multi-char description not in any table → must still be None.
        // Confirms the "exactly one scalar" condition of the fallback.
        assert_eq!(lookup(None, None, "未知の字形"), None);
        assert_eq!(lookup(None, None, "ab"), None);
    }

    #[test]
    fn mencode_table_covers_the_fixture_gaiji() {
        // Pin the corrected 罪と罰 fixture mapping (枘 U+6798, not the
        // pre-regen hand-seed's wrong U+6903 椃).
        assert_eq!(
            JISX0213_MENCODE_TO_CHAR.get("第3水準1-85-54"),
            Some(&'\u{6798}')
        );
    }

    #[test]
    fn table_sizes_match_jisx0213_2004_spec() {
        // Pinned against the JIS X 0213:2004 normative count + the
        // 外字注記辞書 8th edition (8 881 entries) + 2 hand-curated
        // specials (〓 / 〻). Both data sources are checked into
        // `crates/aozora-encoding/data/`.
        use crate::jisx0213_table::{
            DESCRIPTION_COUNT, JISX0213_COMBO_COUNT, JISX0213_PLANE1_COUNT, JISX0213_PLANE2_COUNT,
        };
        let (single, combo, description) = table_sizes();
        assert_eq!(single, JISX0213_PLANE1_COUNT + JISX0213_PLANE2_COUNT);
        assert_eq!(combo, JISX0213_COMBO_COUNT);
        assert_eq!(description, DESCRIPTION_COUNT);
        assert_eq!(
            JISX0213_PLANE1_COUNT, 1893,
            "第3水準 must equal the spec count",
        );
        assert_eq!(
            JISX0213_PLANE2_COUNT, 2436,
            "第4水準 must equal the spec count",
        );
        assert_eq!(
            JISX0213_COMBO_COUNT, 25,
            "combining-sequence cells must equal spec",
        );
        assert!(
            description >= 8_000,
            "description-fallback table looks too small ({description}) — \
             did the gaiji-chuki extraction drop entries?",
        );
    }

    #[test]
    fn description_table_resolves_a_known_dictionary_entry() {
        // 「木+吶のつくり」 is a hallmark fixture description for 枘
        // (U+6798, JIS X 0213 plane 1 row 85 cell 54). The dictionary
        // path resolves the same character as the mencode path, so a
        // test with description-only (no mencode) must hit U+6798.
        assert_eq!(
            lookup(None, None, "木+吶のつくり"),
            Some(Resolved::Char('\u{6798}')),
        );
    }

    #[test]
    fn description_table_preserves_special_placeholders() {
        // 〓 / 〻 are hand-curated specials kept in
        // `aozora-gaiji-special.tsv` and merged into the generated map.
        assert_eq!(lookup(None, None, "〓"), Some(Resolved::Char('\u{3013}')));
        assert_eq!(lookup(None, None, "〻"), Some(Resolved::Char('\u{303B}')));
    }

    #[test]
    fn full_jisx0213_table_covers_a_known_plane1_third_tier_kanji() {
        // 第3水準1-85-9 = 敧 (U+6567) per JIS X 0213:2004.
        assert_eq!(
            JISX0213_MENCODE_TO_CHAR.get("第3水準1-85-9"),
            Some(&'\u{6567}')
        );
    }

    #[test]
    fn full_jisx0213_table_covers_a_known_plane2_fourth_tier_entry() {
        // 第4水準2-1-1 = 𠂉 (U+20089) — first plane-2 cell.
        assert_eq!(
            JISX0213_MENCODE_TO_CHAR.get("第4水準2-1-1"),
            Some(&'\u{20089}')
        );
    }

    #[test]
    fn resolved_utf8_len_matches_actual_encoding() {
        assert_eq!(Resolved::Char('A').utf8_len(), 1);
        assert_eq!(Resolved::Char('あ').utf8_len(), 3);
        assert_eq!(Resolved::Char('𠂉').utf8_len(), 4);
        assert_eq!(Resolved::Multi("\u{304B}\u{309A}").utf8_len(), 6);
    }

    #[test]
    fn resolved_as_char_returns_none_for_combos() {
        assert_eq!(Resolved::Char('A').as_char(), Some('A'));
        // Spelled with escapes so an editor cannot normalise the
        // base + combining handakuten pair into something else.
        assert_eq!(Resolved::Multi("\u{304B}\u{309A}").as_char(), None);
    }

    #[test]
    fn lookup_is_identity_on_the_ucs_input_when_set() {
        // The "existing" short-circuit honours the caller-provided
        // scalar without a wasted table probe.
        assert_eq!(
            lookup(Some('あ'), Some("anything"), "anything"),
            Some(Resolved::Char('あ'))
        );
    }
}