// aozora_syntax/accent.rs
1//! Aozora Bunko accent decomposition — ASCII digraph → Unicode letter.
2//!
3//! Spec: <https://www.aozora.gr.jp/accent_separation.html>
4//!
5//! The scheme encodes accented Latin letters using a base ASCII letter followed
//! by a one-character marker. The full 114-entry table from the spec is
7//! encoded here as a compile-time slice so the lexer (for pre-parse
8//! rewriting) and downstream tools share the same authoritative lookup.
9//!
10//! ```
11//! use aozora_syntax::accent::decompose_fragment;
12//! assert_eq!(decompose_fragment("fune`bre"), "funèbre");
13//! assert_eq!(decompose_fragment("ae&on"), "æon");
14//! assert_eq!(decompose_fragment("plain"), "plain");
15//! ```
16//!
17//! # Invariants
18//!
19//! - The table is closed: no ASCII digraph maps to more than one Unicode
20//! codepoint. Longest-match on ligatures first (`ae&`, `AE&`, `oe&`, `OE&`)
21//! then single-letter digraphs.
22//! - `decompose_fragment` may **grow** the byte length of some substrings
23//! (`m'` = ḿ, `e~` = ẽ are BMP codepoints ≥ U+1E00 whose UTF-8 forms are
24//! 3 bytes — larger than their 2-byte ASCII digraphs). Callers that back-map
25//! diagnostic spans across the rewrite must record a per-position delta.
26//!
27//! # Scope of use
28//!
29//! The function is **only safe to call on the body of a `〔...〕` span**:
30//! aozora restricts accent decomposition to that convention to avoid
31//! false-matching English text like `text,` (which would otherwise be
//! decomposed to `texţ` via the legitimate-in-Romanian `t,` = ţ entry).
33
34use std::borrow::Cow;
35
/// The full accent decomposition table in spec-page order.
///
/// 114 entries total: the four 3-byte ligatures (`ae&`, `AE&`, `oe&`,
/// `OE&`) followed by the 110 two-byte digraphs — including `s&` = ß,
/// which the spec page groups with the ligatures but which is only
/// 2 bytes long. Iteration order is observable and intentional
/// (spec-page order), so do not reorder entries.
///
/// Public for downstream iteration (tests, doc-builders, corpus
/// tooling). For runtime lookup, `decompose_fragment` uses the
/// perfect-hash split tables (`ACCENT_DIGRAPHS` for the 110 two-byte
/// entries; a 4-arm match for the four three-byte ligatures) — the
/// linear `ACCENT_TABLE` scan is no longer on the hot path.
pub const ACCENT_TABLE: &[(&str, char)] = &[
    // --- Ligatures (checked first: 3-char patterns beat the 2-char group) ---
    ("ae&", 'æ'),
    ("AE&", 'Æ'),
    ("oe&", 'œ'),
    ("OE&", 'Œ'),
    ("s&", 'ß'), // eszett — `&` on `s` is a ligature, not ring-above
    // --- 【a】 ---
    ("a`", 'à'),
    ("a'", 'á'),
    ("a^", 'â'),
    ("a~", 'ã'),
    ("a:", 'ä'),
    ("a&", 'å'),
    ("a_", 'ā'),
    // --- 【c】 ---
    ("c,", 'ç'),
    ("c'", 'ć'),
    ("c^", 'ĉ'),
    // --- 【d】 ---
    ("d/", 'đ'),
    // --- 【e】 ---
    ("e`", 'è'),
    ("e'", 'é'),
    ("e^", 'ê'),
    ("e:", 'ë'),
    ("e_", 'ē'),
    ("e~", 'ẽ'),
    // --- 【g】 ---
    ("g^", 'ĝ'),
    // --- 【h】 ---
    ("h^", 'ĥ'),
    ("h/", 'ħ'),
    // --- 【i】 ---
    ("i`", 'ì'),
    ("i'", 'í'),
    ("i^", 'î'),
    ("i:", 'ï'),
    ("i_", 'ī'),
    ("i/", 'ɨ'),
    ("i~", 'ĩ'),
    // --- 【j】 ---
    ("j^", 'ĵ'),
    // --- 【l】 ---
    ("l/", 'ł'),
    ("l'", 'ĺ'),
    // --- 【m】 ---
    ("m'", 'ḿ'),
    // --- 【n】 ---
    ("n`", 'ǹ'),
    ("n~", 'ñ'),
    ("n'", 'ń'),
    // --- 【o】 ---
    ("o`", 'ò'),
    ("o'", 'ó'),
    ("o^", 'ô'),
    ("o~", 'õ'),
    ("o:", 'ö'),
    ("o/", 'ø'),
    ("o_", 'ō'),
    // --- 【r】 ---
    ("r'", 'ŕ'),
    // --- 【s】 ---
    ("s'", 'ś'),
    ("s,", 'ş'),
    ("s^", 'ŝ'),
    // --- 【t】 ---
    ("t,", 'ţ'),
    // --- 【u】 ---
    ("u`", 'ù'),
    ("u'", 'ú'),
    ("u^", 'û'),
    ("u:", 'ü'),
    ("u_", 'ū'),
    ("u&", 'ů'),
    ("u~", 'ũ'),
    // --- 【y】 ---
    ("y'", 'ý'),
    ("y:", 'ÿ'),
    // --- 【z】 ---
    ("z'", 'ź'),
    // --- 【A】 ---
    ("A`", 'À'),
    ("A'", 'Á'),
    ("A^", 'Â'),
    ("A~", 'Ã'),
    ("A:", 'Ä'),
    ("A&", 'Å'),
    ("A_", 'Ā'),
    // --- 【C】 ---
    ("C,", 'Ç'),
    ("C'", 'Ć'),
    ("C^", 'Ĉ'),
    // --- 【D】 ---
    ("D/", 'Đ'),
    // --- 【E】 ---
    ("E`", 'È'),
    ("E'", 'É'),
    ("E^", 'Ê'),
    ("E:", 'Ë'),
    ("E_", 'Ē'),
    ("E~", 'Ẽ'),
    // --- 【G】 ---
    ("G^", 'Ĝ'),
    // --- 【H】 --- (note: spec has lowercase `h/` = ħ but no uppercase `H/`)
    ("H^", 'Ĥ'),
    // --- 【I】 --- (note: spec has lowercase `i/` = ɨ but no uppercase `I/`)
    ("I`", 'Ì'),
    ("I'", 'Í'),
    ("I^", 'Î'),
    ("I:", 'Ï'),
    ("I_", 'Ī'),
    ("I~", 'Ĩ'),
    // --- 【J】 ---
    ("J^", 'Ĵ'),
    // --- 【L】 ---
    ("L/", 'Ł'),
    ("L'", 'Ĺ'),
    // --- 【M】 ---
    ("M'", 'Ḿ'),
    // --- 【N】 ---
    ("N`", 'Ǹ'),
    ("N~", 'Ñ'),
    ("N'", 'Ń'),
    // --- 【O】 ---
    ("O`", 'Ò'),
    ("O'", 'Ó'),
    ("O^", 'Ô'),
    ("O~", 'Õ'),
    ("O:", 'Ö'),
    ("O/", 'Ø'),
    ("O_", 'Ō'),
    // --- 【R】 ---
    ("R'", 'Ŕ'),
    // --- 【S】 ---
    ("S'", 'Ś'),
    ("S,", 'Ş'),
    ("S^", 'Ŝ'),
    // --- 【T】 ---
    ("T,", 'Ţ'),
    // --- 【U】 ---
    ("U`", 'Ù'),
    ("U'", 'Ú'),
    ("U^", 'Û'),
    ("U:", 'Ü'),
    ("U_", 'Ū'),
    ("U&", 'Ů'),
    ("U~", 'Ũ'),
    // --- 【Y】 --- (note: spec has lowercase `y:` = ÿ but no uppercase `Y:`)
    ("Y'", 'Ý'),
    // --- 【Z】 ---
    ("Z'", 'Ź'),
];
196
/// ASCII characters that act as accent markers in the spec.
///
/// One byte per marker, as used throughout [`ACCENT_TABLE`]:
/// `'` (acute), `` ` `` (grave), `^` (circumflex), `:` (diaeresis),
/// `~` (tilde), `&` (ring-above / ligature / eszett), `,` (cedilla),
/// `/` (stroke), `_` (macron).
///
/// Kept as a `&[u8]` slice for downstream consumers that want to
/// enumerate the marker bytes; runtime membership checks go through
/// the `u128` bitmap `ACCENT_MARKER_MASK` instead, which lowers to a
/// single shift + AND.
pub const ACCENT_MARKERS: &[u8] = b"'`^:~&,/_";
204
205/// 128-bit bitmap of [`ACCENT_MARKERS`] for branchless ASCII membership
206/// testing. Bit `n` is 1 iff byte `n` is an accent marker. Computed at
207/// compile time from [`ACCENT_MARKERS`] so the two stay in lockstep.
208const ACCENT_MARKER_MASK: u128 = {
209 let mut m: u128 = 0;
210 let bs = ACCENT_MARKERS;
211 let mut i = 0;
212 while i < bs.len() {
213 // All marker bytes are < 128 (ASCII). Compile-time-asserted by
214 // the const block below.
215 m |= 1u128 << bs[i];
216 i += 1;
217 }
218 m
219};
220
221const _: () = {
222 // Pin the marker set to ASCII; if a future spec edit adds a non-ASCII
223 // marker the bitmap shape must change (no longer fits in u128).
224 let bs = ACCENT_MARKERS;
225 let mut i = 0;
226 while i < bs.len() {
227 assert!(bs[i] < 128, "ACCENT_MARKERS must stay ASCII-only");
228 i += 1;
229 }
230};
231
/// Branchless membership test against [`ACCENT_MARKERS`].
///
/// Compiles to `(b < 128) & ((MASK >> b) & 1)` — one cmp, one shift,
/// one AND — no memory load, no loop, no branch. Replaces the prior
/// `ACCENT_MARKERS.contains(&b)` linear scan over 9 bytes.
#[inline]
#[must_use]
pub const fn is_accent_marker(b: u8) -> bool {
    // The `b < 128` guard short-circuits for non-ASCII bytes. Without
    // it, `ACCENT_MARKER_MASK >> b` with `b >= 128` would be a
    // shift-overflow (a compile error / panic in const and debug
    // builds) rather than a harmless zero. With the guard, the shift
    // amount is always a valid u128 shift (< 128).
    (b < 128) && ((ACCENT_MARKER_MASK >> b) & 1) != 0
}
245
/// 3-byte ligatures (ASCII keys → Latin char). Only four entries, so a
/// `match` beats `phf::Map` here: the compiler lowers it to a handful
/// of inline immediate comparisons, branch prediction nails the common
/// ASCII miss path, and no static array needs to be touched.
#[inline]
fn match_ligature(head: &[u8]) -> Option<char> {
    debug_assert_eq!(head.len(), 3, "match_ligature requires exactly 3 bytes");
    let ligature = match head {
        b"ae&" => 'æ',
        b"AE&" => 'Æ',
        b"oe&" => 'œ',
        b"OE&" => 'Œ',
        _ => return None,
    };
    Some(ligature)
}
262
/// 2-byte digraphs as a compile-time perfect hash table. 110 entries,
/// `&[u8]` keys (the 2 ASCII bytes), `char` values. `phf::Map::get` is
/// O(1) and constant-comparison-bounded, replacing the 110-entry
/// linear scan that the old `ACCENT_TABLE` lookup used.
///
/// Unlike [`ACCENT_TABLE`], entry order here is irrelevant (hash
/// lookup); the spec-page grouping below is kept only for reviewability
/// against the canonical table.
static ACCENT_DIGRAPHS: phf::Map<&'static [u8], char> = phf::phf_map! {
    // s& is grouped as a "ligature" on the spec page but is 2 bytes;
    // it lives here in the digraph map alongside the rest.
    b"s&" => 'ß',
    // --- 【a】 ---
    b"a`" => 'à', b"a'" => 'á', b"a^" => 'â', b"a~" => 'ã',
    b"a:" => 'ä', b"a&" => 'å', b"a_" => 'ā',
    // --- 【c】 ---
    b"c," => 'ç', b"c'" => 'ć', b"c^" => 'ĉ',
    // --- 【d】 ---
    b"d/" => 'đ',
    // --- 【e】 ---
    b"e`" => 'è', b"e'" => 'é', b"e^" => 'ê', b"e:" => 'ë',
    b"e_" => 'ē', b"e~" => 'ẽ',
    // --- 【g】 ---
    b"g^" => 'ĝ',
    // --- 【h】 ---
    b"h^" => 'ĥ', b"h/" => 'ħ',
    // --- 【i】 ---
    b"i`" => 'ì', b"i'" => 'í', b"i^" => 'î', b"i:" => 'ï',
    b"i_" => 'ī', b"i/" => 'ɨ', b"i~" => 'ĩ',
    // --- 【j】 ---
    b"j^" => 'ĵ',
    // --- 【l】 ---
    b"l/" => 'ł', b"l'" => 'ĺ',
    // --- 【m】 ---
    b"m'" => 'ḿ',
    // --- 【n】 ---
    b"n`" => 'ǹ', b"n~" => 'ñ', b"n'" => 'ń',
    // --- 【o】 ---
    b"o`" => 'ò', b"o'" => 'ó', b"o^" => 'ô', b"o~" => 'õ',
    b"o:" => 'ö', b"o/" => 'ø', b"o_" => 'ō',
    // --- 【r】 ---
    b"r'" => 'ŕ',
    // --- 【s】 ---
    b"s'" => 'ś', b"s," => 'ş', b"s^" => 'ŝ',
    // --- 【t】 ---
    b"t," => 'ţ',
    // --- 【u】 ---
    b"u`" => 'ù', b"u'" => 'ú', b"u^" => 'û', b"u:" => 'ü',
    b"u_" => 'ū', b"u&" => 'ů', b"u~" => 'ũ',
    // --- 【y】 ---
    b"y'" => 'ý', b"y:" => 'ÿ',
    // --- 【z】 ---
    b"z'" => 'ź',
    // --- 【A】 ---
    b"A`" => 'À', b"A'" => 'Á', b"A^" => 'Â', b"A~" => 'Ã',
    b"A:" => 'Ä', b"A&" => 'Å', b"A_" => 'Ā',
    // --- 【C】 ---
    b"C," => 'Ç', b"C'" => 'Ć', b"C^" => 'Ĉ',
    // --- 【D】 ---
    b"D/" => 'Đ',
    // --- 【E】 ---
    b"E`" => 'È', b"E'" => 'É', b"E^" => 'Ê', b"E:" => 'Ë',
    b"E_" => 'Ē', b"E~" => 'Ẽ',
    // --- 【G】 ---
    b"G^" => 'Ĝ',
    // --- 【H】 --- (no uppercase `H/` in the spec, unlike lowercase `h/`)
    b"H^" => 'Ĥ',
    // --- 【I】 --- (no uppercase `I/` in the spec, unlike lowercase `i/`)
    b"I`" => 'Ì', b"I'" => 'Í', b"I^" => 'Î', b"I:" => 'Ï',
    b"I_" => 'Ī', b"I~" => 'Ĩ',
    // --- 【J】 ---
    b"J^" => 'Ĵ',
    // --- 【L】 ---
    b"L/" => 'Ł', b"L'" => 'Ĺ',
    // --- 【M】 ---
    b"M'" => 'Ḿ',
    // --- 【N】 ---
    b"N`" => 'Ǹ', b"N~" => 'Ñ', b"N'" => 'Ń',
    // --- 【O】 ---
    b"O`" => 'Ò', b"O'" => 'Ó', b"O^" => 'Ô', b"O~" => 'Õ',
    b"O:" => 'Ö', b"O/" => 'Ø', b"O_" => 'Ō',
    // --- 【R】 ---
    b"R'" => 'Ŕ',
    // --- 【S】 ---
    b"S'" => 'Ś', b"S," => 'Ş', b"S^" => 'Ŝ',
    // --- 【T】 ---
    b"T," => 'Ţ',
    // --- 【U】 ---
    b"U`" => 'Ù', b"U'" => 'Ú', b"U^" => 'Û', b"U:" => 'Ü',
    b"U_" => 'Ū', b"U&" => 'Ů', b"U~" => 'Ũ',
    // --- 【Y】 --- (no uppercase `Y:` in the spec, unlike lowercase `y:`)
    b"Y'" => 'Ý',
    // --- 【Z】 ---
    b"Z'" => 'Ź',
};
354
355const _: () = {
356 // Pin runtime tables to canonical table size: 4 ligatures (in
357 // `match_ligature`) + 110 digraphs = 114 spec entries. Compile-time
358 // assert so a forgotten entry surfaces during build, not at the
359 // first runtime test.
360 assert!(
361 ACCENT_DIGRAPHS.len() == 110,
362 "ACCENT_DIGRAPHS must contain exactly 110 entries (114 spec − 4 ligatures)"
363 );
364};
365
366/// Decompose Aozora accent digraphs anywhere inside `fragment`.
367///
368/// Call this on the **body of a `〔...〕` span** only; the transform is
369/// restricted to that convention so English text (`isn't`, `text,`, `word's`)
370/// doesn't false-match legitimate spec entries (`n'`=ń, `t,`=ţ, and friends).
371///
372/// Guarantees:
373/// - Returns `Cow::Borrowed(fragment)` when no accent **marker byte** appears
374/// (zero alloc on the common Japanese-only case).
375/// - Greedy longest-match: ligatures (3-byte, e.g. `ae&` = æ) beat the 2-byte
376/// digraphs that share a prefix (`a&` = å would otherwise apply).
377/// - Byte length of the output can be up to 3 bytes per 2-byte digraph for the
378/// few entries that land in U+1Exx (`m'` = ḿ, `e~` = ẽ). Most entries shrink
379/// (3-byte ligature → 2-byte UTF-8). The invariant we do hold: the result
380/// is always a valid UTF-8 string.
381///
382/// The implementation is linear in `fragment.len()`: we walk the byte stream
383/// left-to-right, peek `<= 3` bytes at a time, and commit the longest match
384/// that's in the table.
385#[must_use]
386pub fn decompose_fragment(fragment: &str) -> Cow<'_, str> {
387 let bytes = fragment.as_bytes();
388 // Early-out: if no accent marker byte appears at all, the output equals the
389 // input bit-for-bit. Borrow to avoid allocation.
390 //
391 // The membership test goes through the [`ACCENT_MARKER_MASK`] u128
392 // bitmap, which lowers to one cmp + shift + AND per byte — the
393 // tightest path possible without SIMD. SIMD prefilter wouldn't help
394 // here: aozora text is overwhelmingly Japanese (3-byte UTF-8 with
395 // 0xE3 lead byte), so byte-level memchr-style searches don't reduce
396 // the candidate set.
397 if !bytes.iter().any(|b| is_accent_marker(*b)) {
398 return Cow::Borrowed(fragment);
399 }
400
401 let mut out = String::with_capacity(fragment.len());
402 let mut i = 0;
403 while i < bytes.len() {
404 if let Some((pat_len, ch)) = try_match(bytes, i) {
405 out.push(ch);
406 i += pat_len;
407 } else {
408 // Advance one UTF-8 scalar value. Every index we land on is a
409 // valid char boundary because we only stride by `pat_len` (2 or 3
410 // ASCII bytes) or by `ch.len_utf8()`. `.get(i..)` both avoids
411 // `clippy::string_slice` and defends against the stride
412 // invariant breaking: a misaligned index yields `None`, which
413 // breaks the loop cleanly.
414 let Some(ch) = fragment.get(i..).and_then(|s| s.chars().next()) else {
415 break;
416 };
417 out.push(ch);
418 i += ch.len_utf8();
419 }
420 }
421 Cow::Owned(out)
422}
423
424/// Attempt to match a table entry starting at `bytes[i]`. Longest-first
425/// (the spec rule): try 3-byte ligatures before 2-byte digraphs.
426///
427/// - **3-byte path**: a 4-arm `match` against the four ligatures
428/// (`ae&`, `AE&`, `oe&`, `OE&`). `match_ligature` lowers to a tight
429/// jump-table-or-direct-compares form.
430/// - **2-byte path**: O(1) lookup in `ACCENT_DIGRAPHS`, a `phf::Map`
431/// built at compile time over all 110 spec digraph entries.
432///
433/// Returns `(consumed_bytes, replacement_char)` on match.
434#[inline]
435fn try_match(bytes: &[u8], i: usize) -> Option<(usize, char)> {
436 if i + 3 <= bytes.len()
437 && let Some(ch) = match_ligature(&bytes[i..i + 3])
438 {
439 return Some((3, ch));
440 }
441 if i + 2 <= bytes.len()
442 && let Some(&ch) = ACCENT_DIGRAPHS.get(&bytes[i..i + 2])
443 {
444 return Some((2, ch));
445 }
446 None
447}
448
#[cfg(test)]
mod tests {
    use super::*;

    // Test strategy, three layers:
    //  (1) structural pins on `ACCENT_TABLE` (size, ASCII-ness, uniqueness,
    //      growth bound) that catch drift against the archived spec;
    //  (2) spec checkpoints driven through the public `decompose_fragment`
    //      entry point, sampling every marker family;
    //  (3) allocation (`Cow` variant) and byte-length properties.

    #[test]
    fn table_size_is_pinned_to_spec_count() {
        // Verified 2026-04-23 against <https://www.aozora.gr.jp/accent_separation.html>
        // (archived at docs/specs/aozora/accent_separation.html) by enumerating
        // every ASCII digraph and ligature in the 【a..z】, 【A..Z】, and 【合字】
        // groups. A drop below this number means a merge lost table entries;
        // a rise means the spec added entries and the table needs to grow.
        const EXPECTED: usize = 114;
        assert_eq!(
            ACCENT_TABLE.len(),
            EXPECTED,
            "spec count drift — see docs/specs/aozora/accent_separation.html"
        );
    }

    #[test]
    fn every_table_entry_is_representable_ascii_source() {
        // Patterns must be pure ASCII (they come from plain-text aozora
        // sources) and exactly 2 or 3 bytes (digraph or ligature).
        for (pat, _) in ACCENT_TABLE {
            assert!(
                pat.is_ascii(),
                "digraph {pat:?} must be pure ASCII per spec"
            );
            assert!(
                pat.len() == 2 || pat.len() == 3,
                "digraph {pat:?} must be 2 or 3 bytes"
            );
        }
    }

    #[test]
    fn every_table_entry_has_unique_pattern() {
        // The table must be a function: one pattern → one codepoint.
        use std::collections::HashSet;
        let mut seen: HashSet<&str> = HashSet::new();
        for (pat, _) in ACCENT_TABLE {
            assert!(seen.insert(pat), "duplicate digraph {pat:?}");
        }
    }

    #[test]
    fn digraph_size_growth_stays_within_one_extra_byte() {
        // We don't claim byte-length non-growth (disproved by entries like
        // `m'` = ḿ U+1E3F which grows 2 → 3 bytes), but we DO pin that no entry
        // grows by more than one byte: callers budgeting diagnostic span
        // back-mapping need to allocate at most `input_len + count_of_digraphs`
        // output bytes.
        for (pat, ch) in ACCENT_TABLE {
            let out_len = ch.len_utf8();
            let in_len = pat.len();
            let growth = out_len.saturating_sub(in_len);
            assert!(
                growth <= 1,
                "digraph {pat:?} → {ch} grew by {growth} bytes (cap is 1)"
            );
        }
    }

    // --- Specific spec checkpoints (sample across groups to catch table drift) ---

    #[test]
    fn spec_point_e_grave() {
        assert_eq!(decompose_fragment("fune`bre"), "funèbre");
    }

    #[test]
    fn spec_point_acute_accents() {
        assert_eq!(decompose_fragment("ve'rite'"), "vérité");
    }

    #[test]
    fn spec_point_circumflex_and_cedilla_together() {
        assert_eq!(decompose_fragment("C,a va^"), "Ça vâ");
    }

    #[test]
    fn spec_point_all_vowel_graves() {
        assert_eq!(decompose_fragment("a` e` i` o` u`"), "à è ì ò ù");
    }

    #[test]
    fn spec_point_uppercase_accents() {
        assert_eq!(decompose_fragment("A` E' N~"), "À É Ñ");
    }

    #[test]
    fn spec_point_ligatures_beat_ring_above() {
        // `s&` = ß (eszett), NOT `s` + ring-above — longest-match ordering.
        assert_eq!(decompose_fragment("stras&e"), "straße");
        // Ligature over single-letter: ae& = æ, not a& + e.
        assert_eq!(decompose_fragment("ae&on"), "æon");
        assert_eq!(decompose_fragment("OE&uvre"), "Œuvre");
    }

    #[test]
    fn spec_point_stroke_and_macron() {
        assert_eq!(decompose_fragment("d/o_g"), "đōg");
    }

    #[test]
    fn input_without_any_marker_byte_is_borrowed() {
        // Must avoid every ASCII marker: ' ` ^ : ~ & , / _
        let input = "plain Japanese prose ここはテストです 春夏秋冬";
        let out = decompose_fragment(input);
        assert!(
            matches!(out, Cow::Borrowed(_)),
            "expected zero-alloc path for {input:?}"
        );
        assert_eq!(out, input);
    }

    #[test]
    fn isolated_markers_not_preceded_by_table_base_are_preserved() {
        // A marker that lands without a valid base letter preceding it stays
        // intact. The call site is the inside of a 〔〕 span, where
        // these cases represent author typos or genuine punctuation.
        assert_eq!(decompose_fragment("'tis"), "'tis"); // leading apostrophe
        assert_eq!(decompose_fragment("5^2"), "5^2"); // digit base not in spec
        assert_eq!(decompose_fragment("q^"), "q^"); // q not in spec table
    }

    #[test]
    fn markers_are_greedy_for_any_valid_preceding_base() {
        // Even when the user might have intended punctuation, the spec rule is
        // simple: `<base-letter><marker>` decomposes. Call sites must gate by
        // the 〔〕 wrapper to avoid false-positives on English text.
        assert_eq!(decompose_fragment("`hello`"), "`hellò"); // o` → ò
        assert_eq!(decompose_fragment("text,"), "texţ"); // t, → ţ
    }

    #[test]
    fn unknown_base_letters_stay_unchanged() {
        // f doesn't have entries in the spec; f' must stay.
        assert_eq!(decompose_fragment("f'x"), "f'x");
        // q also absent.
        assert_eq!(decompose_fragment("q^"), "q^");
    }

    #[test]
    fn mixed_japanese_and_accents_round_trip_on_japanese() {
        // Japanese text (no marker bytes) passes through untouched while the
        // embedded Latin digraph still decomposes.
        assert_eq!(
            decompose_fragment("ここは fune`bre です"),
            "ここは funèbre です"
        );
    }

    #[test]
    fn empty_input_is_borrowed() {
        let out = decompose_fragment("");
        assert!(matches!(out, Cow::Borrowed("")));
    }

    #[test]
    fn three_byte_ligatures_shrink_output_byte_length() {
        // 3-byte ASCII ligature → 2-byte UTF-8: strictly shorter.
        // `s&` = ß is NOT a 3-byte ligature; it's a 2-byte digraph → 2 UTF-8
        // bytes, so length is preserved. Covered separately below.
        for (input, expected) in [("ae&on", "æon"), ("OE&uvre", "Œuvre")] {
            let out = decompose_fragment(input);
            assert!(
                out.len() < input.len(),
                "3-byte ligature should shrink: {input:?} → {out:?}"
            );
            assert_eq!(out, expected);
        }
    }

    #[test]
    fn two_byte_eszett_preserves_output_byte_length() {
        // `s&` = ß is a 2-byte source → 2-byte UTF-8 output: neutral length.
        let out = decompose_fragment("stras&e");
        assert_eq!(out, "straße");
        assert_eq!(out.len(), "stras&e".len());
    }

    #[test]
    fn bmp_above_u1e00_digraphs_may_grow_output() {
        // `m'` → ḿ U+1E3F is 3 bytes; documented growth path.
        let out = decompose_fragment("m'a");
        assert_eq!(out, "ḿa");
        assert!(out.len() > "m'a".len());
    }

    #[test]
    fn property_all_table_entries_round_trip() {
        // Every table entry, when wrapped in benign context, decomposes to its
        // target char and only that char. The `_` wrapper is itself a marker
        // byte, so this also exercises the "marker without base" pass-through
        // on both sides of each pattern.
        for (pat, ch) in ACCENT_TABLE {
            let input = format!("_{pat}_");
            let out = decompose_fragment(&input);
            let expected: String = format!("_{ch}_");
            assert_eq!(*out, *expected, "pattern {pat:?} failed");
        }
    }
}
645}