1#[inline]
18fn push_code_point(cp: u32, out: &mut Vec<u8>) {
19 if cp < 0x80 {
20 out.push(cp as u8);
21 } else if cp < 0x800 {
22 out.push(0xC0 | (cp >> 6) as u8);
23 out.push(0x80 | (cp & 0x3F) as u8);
24 } else if cp < 0x1_0000 {
25 out.push(0xE0 | (cp >> 12) as u8);
26 out.push(0x80 | ((cp >> 6) & 0x3F) as u8);
27 out.push(0x80 | (cp & 0x3F) as u8);
28 } else {
29 out.push(0xF0 | (cp >> 18) as u8);
30 out.push(0x80 | ((cp >> 12) & 0x3F) as u8);
31 out.push(0x80 | ((cp >> 6) & 0x3F) as u8);
32 out.push(0x80 | (cp & 0x3F) as u8);
33 }
34}
35
36#[inline]
37const fn utf8_len(cp: u32) -> usize {
38 match cp {
39 0..0x80 => 1,
40 0x80..0x800 => 2,
41 0x800..0x1_0000 => 3,
42 _ => 4,
43 }
44}
45
46#[inline]
49fn fold_char(c: char) -> char {
50 if c.is_ascii() {
51 return c.to_ascii_lowercase();
52 }
53 let mut it = c.to_lowercase();
54 match (it.next(), it.next()) {
55 (Some(l), None) if utf8_len(l as u32) == utf8_len(c as u32) => l,
56 _ => c,
57 }
58}
59
60pub fn push_wtf8_pair(units: &[u16], name_out: &mut Vec<u8>, lower_out: &mut Vec<u8>) {
64 let mut i = 0;
65 while i < units.len() {
66 let u = units[i];
67 let cp: u32 = if (0xD800..=0xDBFF).contains(&u)
68 && i + 1 < units.len()
69 && (0xDC00..=0xDFFF).contains(&units[i + 1])
70 {
71 let hi = (u as u32 - 0xD800) << 10;
72 let lo = units[i + 1] as u32 - 0xDC00;
73 i += 2;
74 0x1_0000 + hi + lo
75 } else {
76 i += 1;
77 u as u32
78 };
79
80 push_code_point(cp, name_out);
81 match char::from_u32(cp) {
82 Some(c) => push_code_point(fold_char(c) as u32, lower_out),
83 None => push_code_point(cp, lower_out),
85 }
86 }
87}
88
89pub fn fold_str(s: &str) -> String {
91 s.chars().map(fold_char).collect()
92}
93
94#[must_use]
97pub fn has_uppercase(s: &str) -> bool {
98 s.chars().any(|c| fold_char(c) != c)
99}
100
101pub fn wtf8_to_utf16(bytes: &[u8], out: &mut Vec<u16>) {
104 let mut i = 0;
105 while i < bytes.len() {
106 let b0 = bytes[i] as u32;
107 let (cp, adv) = if b0 < 0x80 {
108 (b0, 1)
109 } else if b0 < 0xE0 {
110 ((b0 & 0x1F) << 6 | (bytes[i + 1] as u32 & 0x3F), 2)
111 } else if b0 < 0xF0 {
112 (
113 (b0 & 0x0F) << 12
114 | (bytes[i + 1] as u32 & 0x3F) << 6
115 | (bytes[i + 2] as u32 & 0x3F),
116 3,
117 )
118 } else {
119 (
120 (b0 & 0x07) << 18
121 | (bytes[i + 1] as u32 & 0x3F) << 12
122 | (bytes[i + 2] as u32 & 0x3F) << 6
123 | (bytes[i + 3] as u32 & 0x3F),
124 4,
125 )
126 };
127 i += adv;
128 if cp >= 0x1_0000 {
129 let v = cp - 0x1_0000;
130 out.push(0xD800 + (v >> 10) as u16);
131 out.push(0xDC00 + (v & 0x3FF) as u16);
132 } else {
133 out.push(cp as u16);
134 }
135 }
136}
137
138#[cfg(test)]
139mod tests {
140 use super::*;
141
142 fn pair(units: &[u16]) -> (Vec<u8>, Vec<u8>) {
143 let (mut a, mut b) = (Vec::new(), Vec::new());
144 push_wtf8_pair(units, &mut a, &mut b);
145 (a, b)
146 }
147
148 #[test]
149 fn ascii_roundtrip_and_fold() {
150 let units: Vec<u16> = "File.TXT".encode_utf16().collect();
151 let (name, lower) = pair(&units);
152 assert_eq!(name, b"File.TXT");
153 assert_eq!(lower, b"file.txt");
154 }
155
156 #[test]
157 fn pools_always_same_length() {
158 for s in ["日本語ファイル.txt", "Straße", "İstanbul", "ΣΟΦΟΣ", "Ⱥb"] {
159 let units: Vec<u16> = s.encode_utf16().collect();
160 let (name, lower) = pair(&units);
161 assert_eq!(name.len(), lower.len(), "length mismatch for {s}");
162 assert_eq!(name, s.as_bytes());
163 }
164 }
165
166 #[test]
167 fn greek_sigma_folds_in_place() {
168 let units: Vec<u16> = "Σ".encode_utf16().collect();
170 let (_, lower) = pair(&units);
171 assert_eq!(lower, "σ".as_bytes());
172 }
173
174 #[test]
175 fn istanbul_dotted_i_kept_unfolded() {
176 let units: Vec<u16> = "İ".encode_utf16().collect();
178 let (name, lower) = pair(&units);
179 assert_eq!(name, lower);
180 }
181
182 #[test]
183 fn supplementary_plane_roundtrip() {
184 let s = "𠮷野家🦀.txt"; let units: Vec<u16> = s.encode_utf16().collect();
186 let (name, _) = pair(&units);
187 assert_eq!(name, s.as_bytes());
188 let mut back = Vec::new();
189 wtf8_to_utf16(&name, &mut back);
190 assert_eq!(back, units);
191 }
192
193 #[test]
194 fn unpaired_surrogate_roundtrip() {
195 let units = vec![0x0041, 0xD800, 0x0042]; let (name, lower) = pair(&units);
198 assert_eq!(name.len(), lower.len());
199 let mut back = Vec::new();
200 wtf8_to_utf16(&name, &mut back);
201 assert_eq!(back, units);
202 assert_eq!(lower[0], b'a');
204 assert_eq!(*lower.last().unwrap(), b'b');
205 }
206
207 #[test]
208 fn fold_str_matches_pool_folding() {
209 for s in ["File.TXT", "Straße", "İstanbul", "ΣΟΦΟΣ", "日本語"] {
210 let units: Vec<u16> = s.encode_utf16().collect();
211 let (_, lower) = pair(&units);
212 assert_eq!(
213 fold_str(s).as_bytes(),
214 &lower[..],
215 "needle/pool fold diverged for {s}"
216 );
217 }
218 }
219
220 #[test]
221 fn has_uppercase_detects_foldable_chars_only() {
222 assert!(has_uppercase("Abc"));
223 assert!(has_uppercase("ΣΟΦΟΣ"));
224 assert!(!has_uppercase("abc.txt"));
225 assert!(!has_uppercase("日本語"));
226 assert!(!has_uppercase("İ")); }
228}
229
230#[cfg(test)]
231mod proptests {
232 use proptest::collection::vec as prop_vec;
233 use proptest::prelude::{Strategy, any};
234 use proptest::sample::select;
235 use proptest::{prop_assert, prop_assert_eq, prop_oneof, proptest};
236
237 use super::{fold_str, has_uppercase, push_wtf8_pair, wtf8_to_utf16};
238
239 const TRICKY_CHARS: &[char] = &[
247 'İ', 'ı', 'I', 'i', 'ß', 'A', 'a', 'Z', 'Σ', 'σ', 'Ⱥ', 'A', 'z', '.',
248 ];
249
250 const TRICKY_UNITS: &[u16] = &[0xD800, 0xDBFF, 0xDC00, 0xDFFF, 0xD83E, 0xDD80];
254
255 fn tricky_unit() -> impl Strategy<Value = u16> {
258 let char_units: Vec<u16> = TRICKY_CHARS
259 .iter()
260 .flat_map(|c| {
261 let mut buf = [0u16; 2];
262 c.encode_utf16(&mut buf).to_vec()
263 })
264 .collect();
265 prop_oneof![
266 select(TRICKY_UNITS.to_vec()),
267 select(char_units),
268 any::<u16>(),
269 ]
270 }
271
272 fn tricky_units() -> impl Strategy<Value = Vec<u16>> {
274 prop_vec(tricky_unit(), 0usize..32)
275 }
276
277 fn tricky_string() -> impl Strategy<Value = String> {
280 prop_vec(
281 prop_oneof![select(TRICKY_CHARS.to_vec()), any::<char>()],
282 0usize..32,
283 )
284 .prop_map(|chars| chars.into_iter().collect())
285 }
286
287 proptest! {
288 #[test]
291 fn pools_same_length_for_any_units(units in prop_vec(any::<u16>(), 0usize..64)) {
292 let (mut name, mut lower) = (Vec::new(), Vec::new());
293 push_wtf8_pair(&units, &mut name, &mut lower);
294 prop_assert_eq!(name.len(), lower.len());
295 }
296
297 #[test]
301 fn pools_same_length_for_tricky_units(units in tricky_units()) {
302 let (mut name, mut lower) = (Vec::new(), Vec::new());
303 push_wtf8_pair(&units, &mut name, &mut lower);
304 prop_assert_eq!(name.len(), lower.len());
305 }
306
307 #[test]
310 fn name_roundtrips_through_utf16(units in prop_vec(any::<u16>(), 0usize..64)) {
311 let (mut name, mut lower) = (Vec::new(), Vec::new());
312 push_wtf8_pair(&units, &mut name, &mut lower);
313 let mut back = Vec::new();
314 wtf8_to_utf16(&name, &mut back);
315 prop_assert_eq!(back, units);
316 }
317
318 #[test]
321 fn tricky_name_roundtrips_through_utf16(units in tricky_units()) {
322 let (mut name, mut lower) = (Vec::new(), Vec::new());
323 push_wtf8_pair(&units, &mut name, &mut lower);
324 let mut back = Vec::new();
325 wtf8_to_utf16(&name, &mut back);
326 prop_assert_eq!(back, units);
327 }
328
329 #[test]
331 fn fold_str_length_preserving_and_idempotent(s in ".*") {
332 let folded = fold_str(&s);
333 prop_assert_eq!(folded.len(), s.len());
334 let twice = fold_str(&folded);
335 prop_assert_eq!(twice, folded);
336 }
337
338 #[test]
342 fn fold_str_length_preserving_and_idempotent_tricky(s in tricky_string()) {
343 let folded = fold_str(&s);
344 prop_assert_eq!(folded.len(), s.len());
345 let twice = fold_str(&folded);
346 prop_assert_eq!(twice, folded);
347 }
348
349 #[test]
353 fn has_uppercase_agrees_with_fold_changing(s in ".*") {
354 prop_assert_eq!(has_uppercase(&s), fold_str(&s) != s);
355 }
356
357 #[test]
361 fn has_uppercase_agrees_with_fold_changing_tricky(s in tricky_string()) {
362 prop_assert_eq!(has_uppercase(&s), fold_str(&s) != s);
363 }
364
365 #[test]
371 fn needle_fold_matches_pool_fold(s in ".*") {
372 let units: Vec<u16> = s.encode_utf16().collect();
373 let (mut name, mut lower) = (Vec::new(), Vec::new());
374 push_wtf8_pair(&units, &mut name, &mut lower);
375 prop_assert_eq!(name, s.as_bytes());
376 let folded = fold_str(&s);
377 prop_assert_eq!(folded.as_bytes(), &lower[..]);
378 }
379
380 #[test]
384 fn folded_string_has_no_uppercase(s in tricky_string()) {
385 prop_assert!(!has_uppercase(&fold_str(&s)));
386 }
387 }
388}