// aozora_syntax/accent.rs
1//! Aozora Bunko accent decomposition — ASCII digraph → Unicode letter.
2//!
3//! Spec: <https://www.aozora.gr.jp/accent_separation.html>
4//!
5//! The scheme encodes accented Latin letters using a base ASCII letter followed
//! by a one-character marker. The full 114-entry table from the spec is
7//! encoded here as a compile-time slice so the lexer (for pre-parse
8//! rewriting) and downstream tools share the same authoritative lookup.
9//!
10//! ```
11//! use aozora_syntax::accent::decompose_fragment;
12//! assert_eq!(decompose_fragment("fune`bre"), "funèbre");
13//! assert_eq!(decompose_fragment("ae&on"), "æon");
14//! assert_eq!(decompose_fragment("plain"), "plain");
15//! ```
16//!
17//! # Invariants
18//!
19//! - The table is closed: no ASCII digraph maps to more than one Unicode
20//! codepoint. Longest-match on ligatures first (`ae&`, `AE&`, `oe&`, `OE&`)
21//! then single-letter digraphs.
22//! - `decompose_fragment` may **grow** the byte length of some substrings
23//! (`m'` = ḿ, `e~` = ẽ are BMP codepoints ≥ U+1E00 whose UTF-8 forms are
24//! 3 bytes — larger than their 2-byte ASCII digraphs). Callers that back-map
25//! diagnostic spans across the rewrite must record a per-position delta.
26//!
27//! # Scope of use
28//!
29//! The function is **only safe to call on the body of a `〔...〕` span**:
30//! aozora restricts accent decomposition to that convention to avoid
31//! false-matching English text like `text,` (which would otherwise be
//! decomposed to `texţ` via the legitimate-in-Romanian `t,` = ţ entry).
33
34use std::borrow::Cow;
35
/// The full accent decomposition table in spec-page order.
///
/// 114 entries total: the four 3-byte ligatures (`ae&`, `AE&`, `oe&`,
/// `OE&`) followed by the 110 two-byte digraphs — including `s&` = ß,
/// which the spec page groups with the ligatures but which is only
/// 2 bytes long. Iteration order is observable and intentional
/// (spec-page order), so do not reorder entries.
///
/// Public for downstream iteration (tests, doc-builders, corpus
/// tooling). For runtime lookup, `decompose_fragment` uses the
/// perfect-hash split tables (`ACCENT_DIGRAPHS` for the 110 two-byte
/// entries; a 4-arm match for the four three-byte ligatures) — the
/// linear `ACCENT_TABLE` scan is no longer on the hot path.
pub const ACCENT_TABLE: &[(&str, char)] = &[
    // --- Ligatures (checked first: 3-char patterns beat the 2-char group) ---
    ("ae&", 'æ'),
    ("AE&", 'Æ'),
    ("oe&", 'œ'),
    ("OE&", 'Œ'),
    ("s&", 'ß'), // eszett — `&` on `s` is a ligature, not ring-above
    // --- 【a】 ---
    ("a`", 'à'),
    ("a'", 'á'),
    ("a^", 'â'),
    ("a~", 'ã'),
    ("a:", 'ä'),
    ("a&", 'å'),
    ("a_", 'ā'),
    // --- 【c】 ---
    ("c,", 'ç'),
    ("c'", 'ć'),
    ("c^", 'ĉ'),
    // --- 【d】 ---
    ("d/", 'đ'),
    // --- 【e】 ---
    ("e`", 'è'),
    ("e'", 'é'),
    ("e^", 'ê'),
    ("e:", 'ë'),
    ("e_", 'ē'),
    ("e~", 'ẽ'),
    // --- 【g】 ---
    ("g^", 'ĝ'),
    // --- 【h】 ---
    ("h^", 'ĥ'),
    ("h/", 'ħ'),
    // --- 【i】 ---
    ("i`", 'ì'),
    ("i'", 'í'),
    ("i^", 'î'),
    ("i:", 'ï'),
    ("i_", 'ī'),
    ("i/", 'ɨ'),
    ("i~", 'ĩ'),
    // --- 【j】 ---
    ("j^", 'ĵ'),
    // --- 【l】 ---
    ("l/", 'ł'),
    ("l'", 'ĺ'),
    // --- 【m】 ---
    ("m'", 'ḿ'),
    // --- 【n】 ---
    ("n`", 'ǹ'),
    ("n~", 'ñ'),
    ("n'", 'ń'),
    // --- 【o】 ---
    ("o`", 'ò'),
    ("o'", 'ó'),
    ("o^", 'ô'),
    ("o~", 'õ'),
    ("o:", 'ö'),
    ("o/", 'ø'),
    ("o_", 'ō'),
    // --- 【r】 ---
    ("r'", 'ŕ'),
    // --- 【s】 ---
    ("s'", 'ś'),
    ("s,", 'ş'),
    ("s^", 'ŝ'),
    // --- 【t】 ---
    ("t,", 'ţ'),
    // --- 【u】 ---
    ("u`", 'ù'),
    ("u'", 'ú'),
    ("u^", 'û'),
    ("u:", 'ü'),
    ("u_", 'ū'),
    ("u&", 'ů'),
    ("u~", 'ũ'),
    // --- 【y】 ---
    ("y'", 'ý'),
    ("y:", 'ÿ'),
    // --- 【z】 ---
    ("z'", 'ź'),
    // --- 【A】 ---
    ("A`", 'À'),
    ("A'", 'Á'),
    ("A^", 'Â'),
    ("A~", 'Ã'),
    ("A:", 'Ä'),
    ("A&", 'Å'),
    ("A_", 'Ā'),
    // --- 【C】 ---
    ("C,", 'Ç'),
    ("C'", 'Ć'),
    ("C^", 'Ĉ'),
    // --- 【D】 ---
    ("D/", 'Đ'),
    // --- 【E】 ---
    ("E`", 'È'),
    ("E'", 'É'),
    ("E^", 'Ê'),
    ("E:", 'Ë'),
    ("E_", 'Ē'),
    ("E~", 'Ẽ'),
    // --- 【G】 ---
    ("G^", 'Ĝ'),
    // --- 【H】 --- (note: spec has lowercase `h/` = ħ but no uppercase `H/`)
    ("H^", 'Ĥ'),
    // --- 【I】 --- (note: spec has lowercase `i/` = ɨ but no uppercase `I/`)
    ("I`", 'Ì'),
    ("I'", 'Í'),
    ("I^", 'Î'),
    ("I:", 'Ï'),
    ("I_", 'Ī'),
    ("I~", 'Ĩ'),
    // --- 【J】 ---
    ("J^", 'Ĵ'),
    // --- 【L】 ---
    ("L/", 'Ł'),
    ("L'", 'Ĺ'),
    // --- 【M】 ---
    ("M'", 'Ḿ'),
    // --- 【N】 ---
    ("N`", 'Ǹ'),
    ("N~", 'Ñ'),
    ("N'", 'Ń'),
    // --- 【O】 ---
    ("O`", 'Ò'),
    ("O'", 'Ó'),
    ("O^", 'Ô'),
    ("O~", 'Õ'),
    ("O:", 'Ö'),
    ("O/", 'Ø'),
    ("O_", 'Ō'),
    // --- 【R】 ---
    ("R'", 'Ŕ'),
    // --- 【S】 ---
    ("S'", 'Ś'),
    ("S,", 'Ş'),
    ("S^", 'Ŝ'),
    // --- 【T】 ---
    ("T,", 'Ţ'),
    // --- 【U】 ---
    ("U`", 'Ù'),
    ("U'", 'Ú'),
    ("U^", 'Û'),
    ("U:", 'Ü'),
    ("U_", 'Ū'),
    ("U&", 'Ů'),
    ("U~", 'Ũ'),
    // --- 【Y】 --- (note: spec has lowercase `y:` = ÿ but no uppercase `Y:`)
    ("Y'", 'Ý'),
    // --- 【Z】 ---
    ("Z'", 'Ź'),
];
196
/// ASCII characters that act as accent markers in the spec.
///
/// One byte per marker, as used throughout [`ACCENT_TABLE`]:
/// `'` (acute), `` ` `` (grave), `^` (circumflex), `:` (diaeresis),
/// `~` (tilde), `&` (ring-above / ligature / eszett), `,` (cedilla),
/// `/` (stroke), `_` (macron).
///
/// Kept as a `&[u8]` slice for downstream consumers that want to
/// enumerate the marker bytes; runtime membership checks go through
/// the `u128` bitmap `ACCENT_MARKER_MASK` instead, which lowers to a
/// single shift + AND.
pub const ACCENT_MARKERS: &[u8] = b"'`^:~&,/_";
204
205/// 128-bit bitmap of [`ACCENT_MARKERS`] for branchless ASCII membership
206/// testing. Bit `n` is 1 iff byte `n` is an accent marker. Computed at
207/// compile time from [`ACCENT_MARKERS`] so the two stay in lockstep.
208const ACCENT_MARKER_MASK: u128 = {
209 let mut m: u128 = 0;
210 let bs = ACCENT_MARKERS;
211 let mut i = 0;
212 while i < bs.len() {
213 // All marker bytes are < 128 (ASCII). Compile-time-asserted by
214 // the const block below.
215 m |= 1u128 << bs[i];
216 i += 1;
217 }
218 m
219};
220
221const _: () = {
222 // Pin the marker set to ASCII; if a future spec edit adds a non-ASCII
223 // marker the bitmap shape must change (no longer fits in u128).
224 let bs = ACCENT_MARKERS;
225 let mut i = 0;
226 while i < bs.len() {
227 assert!(bs[i] < 128, "ACCENT_MARKERS must stay ASCII-only");
228 i += 1;
229 }
230};
231
/// Branchless membership test against [`ACCENT_MARKERS`].
///
/// Compiles to `(b < 128) & ((MASK >> b) & 1)` — one cmp, one shift,
/// one AND — no memory load, no loop, no branch. Replaces the prior
/// `ACCENT_MARKERS.contains(&b)` linear scan over 9 bytes.
#[inline]
#[must_use]
pub const fn is_accent_marker(b: u8) -> bool {
    // The `b < 128` guard short-circuits for non-ASCII bytes. Without
    // it, `ACCENT_MARKER_MASK >> b` with `b >= 128` would be a
    // shift-overflow (a compile error / panic in const and debug
    // builds) rather than a harmless zero. With the guard, the shift
    // amount is always a valid u128 shift (< 128).
    (b < 128) && ((ACCENT_MARKER_MASK >> b) & 1) != 0
}
245
/// 3-byte ligatures (ASCII keys → Latin char). Only four entries, so a
/// `match` beats `phf::Map` here: the compiler lowers it to a handful
/// of inline immediate comparisons, branch prediction nails the common
/// ASCII miss path, and no static array needs to be touched.
#[inline]
fn match_ligature(head: &[u8]) -> Option<char> {
    debug_assert_eq!(head.len(), 3, "match_ligature requires exactly 3 bytes");
    let ligature = match head {
        b"ae&" => 'æ',
        b"AE&" => 'Æ',
        b"oe&" => 'œ',
        b"OE&" => 'Œ',
        _ => return None,
    };
    Some(ligature)
}
262
/// 2-byte digraphs as a compile-time perfect hash table. 110 entries,
/// `&[u8]` keys (the 2 ASCII bytes), `char` values. `phf::Map::get` is
/// O(1) and constant-comparison-bounded, replacing the 110-entry
/// linear scan that the old `ACCENT_TABLE` lookup used.
///
/// Unlike [`ACCENT_TABLE`], entry order here is irrelevant (hash
/// lookup); the spec-page grouping below is kept only for reviewability
/// against the canonical table.
static ACCENT_DIGRAPHS: phf::Map<&'static [u8], char> = phf::phf_map! {
    // s& is grouped as a "ligature" on the spec page but is 2 bytes;
    // it lives here in the digraph map alongside the rest.
    b"s&" => 'ß',
    // --- 【a】 ---
    b"a`" => 'à', b"a'" => 'á', b"a^" => 'â', b"a~" => 'ã',
    b"a:" => 'ä', b"a&" => 'å', b"a_" => 'ā',
    // --- 【c】 ---
    b"c," => 'ç', b"c'" => 'ć', b"c^" => 'ĉ',
    // --- 【d】 ---
    b"d/" => 'đ',
    // --- 【e】 ---
    b"e`" => 'è', b"e'" => 'é', b"e^" => 'ê', b"e:" => 'ë',
    b"e_" => 'ē', b"e~" => 'ẽ',
    // --- 【g】 ---
    b"g^" => 'ĝ',
    // --- 【h】 ---
    b"h^" => 'ĥ', b"h/" => 'ħ',
    // --- 【i】 ---
    b"i`" => 'ì', b"i'" => 'í', b"i^" => 'î', b"i:" => 'ï',
    b"i_" => 'ī', b"i/" => 'ɨ', b"i~" => 'ĩ',
    // --- 【j】 ---
    b"j^" => 'ĵ',
    // --- 【l】 ---
    b"l/" => 'ł', b"l'" => 'ĺ',
    // --- 【m】 ---
    b"m'" => 'ḿ',
    // --- 【n】 ---
    b"n`" => 'ǹ', b"n~" => 'ñ', b"n'" => 'ń',
    // --- 【o】 ---
    b"o`" => 'ò', b"o'" => 'ó', b"o^" => 'ô', b"o~" => 'õ',
    b"o:" => 'ö', b"o/" => 'ø', b"o_" => 'ō',
    // --- 【r】 ---
    b"r'" => 'ŕ',
    // --- 【s】 ---
    b"s'" => 'ś', b"s," => 'ş', b"s^" => 'ŝ',
    // --- 【t】 ---
    b"t," => 'ţ',
    // --- 【u】 ---
    b"u`" => 'ù', b"u'" => 'ú', b"u^" => 'û', b"u:" => 'ü',
    b"u_" => 'ū', b"u&" => 'ů', b"u~" => 'ũ',
    // --- 【y】 ---
    b"y'" => 'ý', b"y:" => 'ÿ',
    // --- 【z】 ---
    b"z'" => 'ź',
    // --- 【A】 ---
    b"A`" => 'À', b"A'" => 'Á', b"A^" => 'Â', b"A~" => 'Ã',
    b"A:" => 'Ä', b"A&" => 'Å', b"A_" => 'Ā',
    // --- 【C】 ---
    b"C," => 'Ç', b"C'" => 'Ć', b"C^" => 'Ĉ',
    // --- 【D】 ---
    b"D/" => 'Đ',
    // --- 【E】 ---
    b"E`" => 'È', b"E'" => 'É', b"E^" => 'Ê', b"E:" => 'Ë',
    b"E_" => 'Ē', b"E~" => 'Ẽ',
    // --- 【G】 ---
    b"G^" => 'Ĝ',
    // --- 【H】 --- (no uppercase `H/` in the spec, unlike lowercase `h/`)
    b"H^" => 'Ĥ',
    // --- 【I】 --- (no uppercase `I/` in the spec, unlike lowercase `i/`)
    b"I`" => 'Ì', b"I'" => 'Í', b"I^" => 'Î', b"I:" => 'Ï',
    b"I_" => 'Ī', b"I~" => 'Ĩ',
    // --- 【J】 ---
    b"J^" => 'Ĵ',
    // --- 【L】 ---
    b"L/" => 'Ł', b"L'" => 'Ĺ',
    // --- 【M】 ---
    b"M'" => 'Ḿ',
    // --- 【N】 ---
    b"N`" => 'Ǹ', b"N~" => 'Ñ', b"N'" => 'Ń',
    // --- 【O】 ---
    b"O`" => 'Ò', b"O'" => 'Ó', b"O^" => 'Ô', b"O~" => 'Õ',
    b"O:" => 'Ö', b"O/" => 'Ø', b"O_" => 'Ō',
    // --- 【R】 ---
    b"R'" => 'Ŕ',
    // --- 【S】 ---
    b"S'" => 'Ś', b"S," => 'Ş', b"S^" => 'Ŝ',
    // --- 【T】 ---
    b"T," => 'Ţ',
    // --- 【U】 ---
    b"U`" => 'Ù', b"U'" => 'Ú', b"U^" => 'Û', b"U:" => 'Ü',
    b"U_" => 'Ū', b"U&" => 'Ů', b"U~" => 'Ũ',
    // --- 【Y】 --- (no uppercase `Y:` in the spec, unlike lowercase `y:`)
    b"Y'" => 'Ý',
    // --- 【Z】 ---
    b"Z'" => 'Ź',
};
354
355const _: () = {
356 // Pin runtime tables to canonical table size: 4 ligatures (in
357 // `match_ligature`) + 110 digraphs = 114 spec entries. Compile-time
358 // assert so a forgotten entry surfaces during build, not at the
359 // first runtime test.
360 assert!(
361 ACCENT_DIGRAPHS.len() == 110,
362 "ACCENT_DIGRAPHS must contain exactly 110 entries (114 spec − 4 ligatures)"
363 );
364};
365
366/// Decompose Aozora accent digraphs anywhere inside `fragment`.
367///
368/// Call this on the **body of a `〔...〕` span** only; the transform is
369/// restricted to that convention so English text (`isn't`, `text,`, `word's`)
370/// doesn't false-match legitimate spec entries (`n'`=ń, `t,`=ţ, and friends).
371///
372/// Guarantees:
373/// - Returns `Cow::Borrowed(fragment)` when no accent **marker byte** appears
374/// (zero alloc on the common Japanese-only case).
375/// - Greedy longest-match: ligatures (3-byte, e.g. `ae&` = æ) beat the 2-byte
376/// digraphs that share a prefix (`a&` = å would otherwise apply).
377/// - Byte length of the output can be up to 3 bytes per 2-byte digraph for the
378/// few entries that land in U+1Exx (`m'` = ḿ, `e~` = ẽ). Most entries shrink
379/// (3-byte ligature → 2-byte UTF-8). The invariant we do hold: the result
380/// is always a valid UTF-8 string.
381///
382/// The implementation is linear in `fragment.len()`: we walk the byte stream
383/// left-to-right, peek `<= 3` bytes at a time, and commit the longest match
384/// that's in the table.
385#[must_use]
386pub fn decompose_fragment(fragment: &str) -> Cow<'_, str> {
387 let bytes = fragment.as_bytes();
388 // Early-out: if no accent marker byte appears at all, the output equals the
389 // input bit-for-bit. Borrow to avoid allocation.
390 //
391 // The membership test goes through the [`ACCENT_MARKER_MASK`] u128
392 // bitmap, which lowers to one cmp + shift + AND per byte — the
393 // tightest path possible without SIMD. SIMD prefilter wouldn't help
394 // here: aozora text is overwhelmingly Japanese (3-byte UTF-8 with
395 // 0xE3 lead byte), so byte-level memchr-style searches don't reduce
396 // the candidate set.
397 if !bytes.iter().any(|b| is_accent_marker(*b)) {
398 return Cow::Borrowed(fragment);
399 }
400
401 let mut out = String::with_capacity(fragment.len());
402 let mut i = 0;
403 while i < bytes.len() {
404 if let Some((pat_len, ch)) = try_match(bytes, i) {
405 out.push(ch);
406 i += pat_len;
407 } else {
408 // Advance one UTF-8 scalar value. Every index we land on is a
409 // valid char boundary because we only stride by `pat_len` (2 or 3
410 // ASCII bytes) or by `ch.len_utf8()`. `.get(i..)` both avoids
411 // `clippy::string_slice` and defends against the stride
412 // invariant breaking: a misaligned index yields `None`, which
413 // breaks the loop cleanly.
414 let Some(ch) = fragment.get(i..).and_then(|s| s.chars().next()) else {
415 break;
416 };
417 out.push(ch);
418 i += ch.len_utf8();
419 }
420 }
421 Cow::Owned(out)
422}
423
424/// Attempt to match a table entry starting at `bytes[i]`. Longest-first
425/// (the spec rule): try 3-byte ligatures before 2-byte digraphs.
426///
427/// - **3-byte path**: a 4-arm `match` against the four ligatures
428/// (`ae&`, `AE&`, `oe&`, `OE&`). `match_ligature` lowers to a tight
429/// jump-table-or-direct-compares form.
430/// - **2-byte path**: O(1) lookup in `ACCENT_DIGRAPHS`, a `phf::Map`
431/// built at compile time over all 110 spec digraph entries.
432///
433/// Returns `(consumed_bytes, replacement_char)` on match.
434#[inline]
435fn try_match(bytes: &[u8], i: usize) -> Option<(usize, char)> {
436 if i + 3 <= bytes.len()
437 && let Some(ch) = match_ligature(&bytes[i..i + 3])
438 {
439 return Some((3, ch));
440 }
441 if i + 2 <= bytes.len()
442 && let Some(&ch) = ACCENT_DIGRAPHS.get(&bytes[i..i + 2])
443 {
444 return Some((2, ch));
445 }
446 None
447}
448
#[cfg(test)]
mod tests {
    use super::*;

    // Test strategy, three layers:
    //  (1) structural pins on `ACCENT_TABLE` (size, ASCII-ness, uniqueness,
    //      growth bound) that catch drift against the archived spec;
    //  (2) spec checkpoints driven through the public `decompose_fragment`
    //      entry point, sampling every marker family;
    //  (3) allocation (`Cow` variant) and byte-length properties.

    #[test]
    fn table_size_is_pinned_to_spec_count() {
        // Verified 2026-04-23 against <https://www.aozora.gr.jp/accent_separation.html>
        // (archived at docs/specs/aozora/accent_separation.html) by enumerating
        // every ASCII digraph and ligature in the 【a..z】, 【A..Z】, and 【合字】
        // groups. A drop below this number means a merge lost table entries;
        // a rise means the spec added entries and the table needs to grow.
        const EXPECTED: usize = 114;
        assert_eq!(
            ACCENT_TABLE.len(),
            EXPECTED,
            "spec count drift — see docs/specs/aozora/accent_separation.html"
        );
    }

    #[test]
    fn every_table_entry_is_representable_ascii_source() {
        // Patterns must be pure ASCII (they come from plain-text aozora
        // sources) and exactly 2 or 3 bytes (digraph or ligature).
        for (pat, _) in ACCENT_TABLE {
            assert!(
                pat.is_ascii(),
                "digraph {pat:?} must be pure ASCII per spec"
            );
            assert!(
                pat.len() == 2 || pat.len() == 3,
                "digraph {pat:?} must be 2 or 3 bytes"
            );
        }
    }

    #[test]
    fn every_table_entry_has_unique_pattern() {
        // The table must be a function: one pattern → one codepoint.
        use std::collections::HashSet;
        let mut seen: HashSet<&str> = HashSet::new();
        for (pat, _) in ACCENT_TABLE {
            assert!(seen.insert(pat), "duplicate digraph {pat:?}");
        }
    }

    #[test]
    fn digraph_size_growth_stays_within_one_extra_byte() {
        // We don't claim byte-length non-growth (disproved by entries like
        // `m'` = ḿ U+1E3F which grows 2 → 3 bytes), but we DO pin that no entry
        // grows by more than one byte: callers budgeting diagnostic span
        // back-mapping need to allocate at most `input_len + count_of_digraphs`
        // output bytes.
        for (pat, ch) in ACCENT_TABLE {
            let out_len = ch.len_utf8();
            let in_len = pat.len();
            let growth = out_len.saturating_sub(in_len);
            assert!(
                growth <= 1,
                "digraph {pat:?} → {ch} grew by {growth} bytes (cap is 1)"
            );
        }
    }

    // --- Specific spec checkpoints (sample across groups to catch table drift) ---

    #[test]
    fn spec_point_e_grave() {
        assert_eq!(decompose_fragment("fune`bre"), "funèbre");
    }

    #[test]
    fn spec_point_acute_accents() {
        assert_eq!(decompose_fragment("ve'rite'"), "vérité");
    }

    #[test]
    fn spec_point_circumflex_and_cedilla_together() {
        assert_eq!(decompose_fragment("C,a va^"), "Ça vâ");
    }

    #[test]
    fn spec_point_all_vowel_graves() {
        assert_eq!(decompose_fragment("a` e` i` o` u`"), "à è ì ò ù");
    }

    #[test]
    fn spec_point_uppercase_accents() {
        assert_eq!(decompose_fragment("A` E' N~"), "À É Ñ");
    }

    #[test]
    fn spec_point_ligatures_beat_ring_above() {
        // `s&` = ß (eszett), NOT `s` + ring-above — longest-match ordering.
        assert_eq!(decompose_fragment("stras&e"), "straße");
        // Ligature over single-letter: ae& = æ, not a& + e.
        assert_eq!(decompose_fragment("ae&on"), "æon");
        assert_eq!(decompose_fragment("OE&uvre"), "Œuvre");
    }

    #[test]
    fn spec_point_stroke_and_macron() {
        assert_eq!(decompose_fragment("d/o_g"), "đōg");
    }

    #[test]
    fn input_without_any_marker_byte_is_borrowed() {
        // Must avoid every ASCII marker: ' ` ^ : ~ & , / _
        let input = "plain Japanese prose ここはテストです 春夏秋冬";
        let out = decompose_fragment(input);
        assert!(
            matches!(out, Cow::Borrowed(_)),
            "expected zero-alloc path for {input:?}"
        );
        assert_eq!(out, input);
    }

    #[test]
    fn isolated_markers_not_preceded_by_table_base_are_preserved() {
        // A marker that lands without a valid base letter preceding it stays
        // intact. The call site is the inside of a 〔〕 span, where
        // these cases represent author typos or genuine punctuation.
        assert_eq!(decompose_fragment("'tis"), "'tis"); // leading apostrophe
        assert_eq!(decompose_fragment("5^2"), "5^2"); // digit base not in spec
        assert_eq!(decompose_fragment("q^"), "q^"); // q not in spec table
    }

    #[test]
    fn markers_are_greedy_for_any_valid_preceding_base() {
        // Even when the user might have intended punctuation, the spec rule is
        // simple: `<base-letter><marker>` decomposes. Call sites must gate by
        // the 〔〕 wrapper to avoid false-positives on English text.
        assert_eq!(decompose_fragment("`hello`"), "`hellò"); // o` → ò
        assert_eq!(decompose_fragment("text,"), "texţ"); // t, → ţ
    }

    #[test]
    fn unknown_base_letters_stay_unchanged() {
        // f doesn't have entries in the spec; f' must stay.
        assert_eq!(decompose_fragment("f'x"), "f'x");
        // q also absent.
        assert_eq!(decompose_fragment("q^"), "q^");
    }

    #[test]
    fn mixed_japanese_and_accents_round_trip_on_japanese() {
        // Japanese text (no marker bytes) passes through untouched while the
        // embedded Latin digraph still decomposes.
        assert_eq!(
            decompose_fragment("ここは fune`bre です"),
            "ここは funèbre です"
        );
    }

    #[test]
    fn empty_input_is_borrowed() {
        let out = decompose_fragment("");
        assert!(matches!(out, Cow::Borrowed("")));
    }

    #[test]
    fn three_byte_ligatures_shrink_output_byte_length() {
        // 3-byte ASCII ligature → 2-byte UTF-8: strictly shorter.
        // `s&` = ß is NOT a 3-byte ligature; it's a 2-byte digraph → 2 UTF-8
        // bytes, so length is preserved. Covered separately below.
        for (input, expected) in [("ae&on", "æon"), ("OE&uvre", "Œuvre")] {
            let out = decompose_fragment(input);
            assert!(
                out.len() < input.len(),
                "3-byte ligature should shrink: {input:?} → {out:?}"
            );
            assert_eq!(out, expected);
        }
    }

    #[test]
    fn two_byte_eszett_preserves_output_byte_length() {
        // `s&` = ß is a 2-byte source → 2-byte UTF-8 output: neutral length.
        let out = decompose_fragment("stras&e");
        assert_eq!(out, "straße");
        assert_eq!(out.len(), "stras&e".len());
    }

    #[test]
    fn bmp_above_u1e00_digraphs_may_grow_output() {
        // `m'` → ḿ U+1E3F is 3 bytes; documented growth path.
        let out = decompose_fragment("m'a");
        assert_eq!(out, "ḿa");
        assert!(out.len() > "m'a".len());
    }

    #[test]
    fn property_all_table_entries_round_trip() {
        // Every table entry, when wrapped in benign context, decomposes to its
        // target char and only that char. The `_` wrapper is itself a marker
        // byte, so this also exercises the "marker without base" pass-through
        // on both sides of each pattern.
        for (pat, ch) in ACCENT_TABLE {
            let input = format!("_{pat}_");
            let out = decompose_fragment(&input);
            let expected: String = format!("_{ch}_");
            assert_eq!(*out, *expected, "pattern {pat:?} failed");
        }
    }
}
645}