Skip to main content

fmf_core/
wtf8.rs

1//! WTF-8 name encoding and search-oriented case folding.
2//!
3//! NTFS file names are arbitrary u16 sequences and may contain unpaired
4//! surrogates. Storing them as lossy UTF-8 would corrupt such names and make
5//! them impossible to open from the UI (docs/ARCHITECTURE.md). WTF-8 encodes
6//! unpaired surrogates as their 3-byte sequences, is a superset of UTF-8, and
7//! round-trips back to the original UTF-16.
8//!
9//! The index keeps two pools with shared offsets (docs/ARCHITECTURE.md,
10//! ADR-0003), so the folded form of a name must have exactly the same byte
11//! length as the original. Folding therefore lowercases a code point only
12//! when the result is a single code point of identical encoded length;
13//! everything else is kept as-is. The same rule must be applied to query
14//! needles (`fold_str`) or case-insensitive matches would misalign.
15
16/// Append the WTF-8 encoding of a single code point (may be a lone surrogate).
17#[inline]
18fn push_code_point(cp: u32, out: &mut Vec<u8>) {
19    if cp < 0x80 {
20        out.push(cp as u8);
21    } else if cp < 0x800 {
22        out.push(0xC0 | (cp >> 6) as u8);
23        out.push(0x80 | (cp & 0x3F) as u8);
24    } else if cp < 0x1_0000 {
25        out.push(0xE0 | (cp >> 12) as u8);
26        out.push(0x80 | ((cp >> 6) & 0x3F) as u8);
27        out.push(0x80 | (cp & 0x3F) as u8);
28    } else {
29        out.push(0xF0 | (cp >> 18) as u8);
30        out.push(0x80 | ((cp >> 12) & 0x3F) as u8);
31        out.push(0x80 | ((cp >> 6) & 0x3F) as u8);
32        out.push(0x80 | (cp & 0x3F) as u8);
33    }
34}
35
36#[inline]
37const fn utf8_len(cp: u32) -> usize {
38    match cp {
39        0..0x80 => 1,
40        0x80..0x800 => 2,
41        0x800..0x1_0000 => 3,
42        _ => 4,
43    }
44}
45
46/// Lowercase `c` only if the result is a single char with the same encoded
47/// length; otherwise return `c` unchanged.
48#[inline]
49fn fold_char(c: char) -> char {
50    if c.is_ascii() {
51        return c.to_ascii_lowercase();
52    }
53    let mut it = c.to_lowercase();
54    match (it.next(), it.next()) {
55        (Some(l), None) if utf8_len(l as u32) == utf8_len(c as u32) => l,
56        _ => c,
57    }
58}
59
60/// Decode UTF-16 (with possible unpaired surrogates) and append both the
61/// WTF-8 original and its folded form. The two outputs always grow by the
62/// same number of bytes.
63pub fn push_wtf8_pair(units: &[u16], name_out: &mut Vec<u8>, lower_out: &mut Vec<u8>) {
64    let mut i = 0;
65    while i < units.len() {
66        let u = units[i];
67        let cp: u32 = if (0xD800..=0xDBFF).contains(&u)
68            && i + 1 < units.len()
69            && (0xDC00..=0xDFFF).contains(&units[i + 1])
70        {
71            let hi = (u as u32 - 0xD800) << 10;
72            let lo = units[i + 1] as u32 - 0xDC00;
73            i += 2;
74            0x1_0000 + hi + lo
75        } else {
76            i += 1;
77            u as u32
78        };
79
80        push_code_point(cp, name_out);
81        match char::from_u32(cp) {
82            Some(c) => push_code_point(fold_char(c) as u32, lower_out),
83            // Lone surrogate: no case to fold, mirror the original bytes.
84            None => push_code_point(cp, lower_out),
85        }
86    }
87}
88
89/// Fold a valid UTF-8 string (query needle) with the same rule as the pool.
90pub fn fold_str(s: &str) -> String {
91    s.chars().map(fold_char).collect()
92}
93
94/// True if folding would change `s` — i.e. the needle benefits from the
95/// case-insensitive pool at all.
96#[must_use]
97pub fn has_uppercase(s: &str) -> bool {
98    s.chars().any(|c| fold_char(c) != c)
99}
100
101/// Decode WTF-8 back to UTF-16 units (inverse of `push_wtf8_pair`'s name
102/// output). Used when handing names across the FFI boundary.
103pub fn wtf8_to_utf16(bytes: &[u8], out: &mut Vec<u16>) {
104    let mut i = 0;
105    while i < bytes.len() {
106        let b0 = bytes[i] as u32;
107        let (cp, adv) = if b0 < 0x80 {
108            (b0, 1)
109        } else if b0 < 0xE0 {
110            ((b0 & 0x1F) << 6 | (bytes[i + 1] as u32 & 0x3F), 2)
111        } else if b0 < 0xF0 {
112            (
113                (b0 & 0x0F) << 12
114                    | (bytes[i + 1] as u32 & 0x3F) << 6
115                    | (bytes[i + 2] as u32 & 0x3F),
116                3,
117            )
118        } else {
119            (
120                (b0 & 0x07) << 18
121                    | (bytes[i + 1] as u32 & 0x3F) << 12
122                    | (bytes[i + 2] as u32 & 0x3F) << 6
123                    | (bytes[i + 3] as u32 & 0x3F),
124                4,
125            )
126        };
127        i += adv;
128        if cp >= 0x1_0000 {
129            let v = cp - 0x1_0000;
130            out.push(0xD800 + (v >> 10) as u16);
131            out.push(0xDC00 + (v & 0x3FF) as u16);
132        } else {
133            out.push(cp as u16);
134        }
135    }
136}
137
138#[cfg(test)]
139mod tests {
140    use super::*;
141
142    fn pair(units: &[u16]) -> (Vec<u8>, Vec<u8>) {
143        let (mut a, mut b) = (Vec::new(), Vec::new());
144        push_wtf8_pair(units, &mut a, &mut b);
145        (a, b)
146    }
147
148    #[test]
149    fn ascii_roundtrip_and_fold() {
150        let units: Vec<u16> = "File.TXT".encode_utf16().collect();
151        let (name, lower) = pair(&units);
152        assert_eq!(name, b"File.TXT");
153        assert_eq!(lower, b"file.txt");
154    }
155
156    #[test]
157    fn pools_always_same_length() {
158        for s in ["日本語ファイル.txt", "Straße", "İstanbul", "ΣΟΦΟΣ", "Ⱥb"] {
159            let units: Vec<u16> = s.encode_utf16().collect();
160            let (name, lower) = pair(&units);
161            assert_eq!(name.len(), lower.len(), "length mismatch for {s}");
162            assert_eq!(name, s.as_bytes());
163        }
164    }
165
166    #[test]
167    fn greek_sigma_folds_in_place() {
168        // Σ (2 bytes) → σ (2 bytes): same length, folded.
169        let units: Vec<u16> = "Σ".encode_utf16().collect();
170        let (_, lower) = pair(&units);
171        assert_eq!(lower, "σ".as_bytes());
172    }
173
174    #[test]
175    fn istanbul_dotted_i_kept_unfolded() {
176        // İ lowercases to "i\u{307}" (multi-char) → kept as-is.
177        let units: Vec<u16> = "İ".encode_utf16().collect();
178        let (name, lower) = pair(&units);
179        assert_eq!(name, lower);
180    }
181
182    #[test]
183    fn supplementary_plane_roundtrip() {
184        let s = "𠮷野家🦀.txt"; // surrogate pairs in UTF-16
185        let units: Vec<u16> = s.encode_utf16().collect();
186        let (name, _) = pair(&units);
187        assert_eq!(name, s.as_bytes());
188        let mut back = Vec::new();
189        wtf8_to_utf16(&name, &mut back);
190        assert_eq!(back, units);
191    }
192
193    #[test]
194    fn unpaired_surrogate_roundtrip() {
195        // Legal as an NTFS name, impossible as UTF-8: must survive intact.
196        let units = vec![0x0041, 0xD800, 0x0042]; // "A<lone high surrogate>B"
197        let (name, lower) = pair(&units);
198        assert_eq!(name.len(), lower.len());
199        let mut back = Vec::new();
200        wtf8_to_utf16(&name, &mut back);
201        assert_eq!(back, units);
202        // The ASCII letters around the surrogate still fold.
203        assert_eq!(lower[0], b'a');
204        assert_eq!(*lower.last().unwrap(), b'b');
205    }
206
207    #[test]
208    fn fold_str_matches_pool_folding() {
209        for s in ["File.TXT", "Straße", "İstanbul", "ΣΟΦΟΣ", "日本語"] {
210            let units: Vec<u16> = s.encode_utf16().collect();
211            let (_, lower) = pair(&units);
212            assert_eq!(
213                fold_str(s).as_bytes(),
214                &lower[..],
215                "needle/pool fold diverged for {s}"
216            );
217        }
218    }
219
220    #[test]
221    fn has_uppercase_detects_foldable_chars_only() {
222        assert!(has_uppercase("Abc"));
223        assert!(has_uppercase("ΣΟΦΟΣ"));
224        assert!(!has_uppercase("abc.txt"));
225        assert!(!has_uppercase("日本語"));
226        assert!(!has_uppercase("İ")); // unfoldable by our rule → not "uppercase" for smart case
227    }
228}
229
230#[cfg(test)]
231mod proptests {
232    use proptest::collection::vec as prop_vec;
233    use proptest::prelude::{Strategy, any};
234    use proptest::sample::select;
235    use proptest::{prop_assert, prop_assert_eq, prop_oneof, proptest};
236
237    use super::{fold_str, has_uppercase, push_wtf8_pair, wtf8_to_utf16};
238
239    /// Code points whose folding stresses the length-preserving rule
240    /// (ADR-0003): Turkish dotted I (İ lowercases to two chars → must be kept),
241    /// dotless ı / ASCII I, German sharp-s (ß has no single-char lowering),
242    /// full-width Latin (A→a, both 3 bytes), a non-ASCII same-length foldable
243    /// (Σ→σ), and Ⱥ (lowercases to a shorter encoding → must be kept). A `.*`
244    /// strategy reaches these only by luck; pinning them keeps the hard cases
245    /// in the input space on every run.
246    const TRICKY_CHARS: &[char] = &[
247        'İ', 'ı', 'I', 'i', 'ß', 'A', 'a', 'Z', 'Σ', 'σ', 'Ⱥ', 'A', 'z', '.',
248    ];
249
250    /// UTF-16 units that exercise surrogate handling: lone high/low surrogates
251    /// (legal NTFS names, ill-formed UTF-16) plus the halves of an emoji
252    /// surrogate pair, so the paired and unpaired branches both get hit.
253    const TRICKY_UNITS: &[u16] = &[0xD800, 0xDBFF, 0xDC00, 0xDFFF, 0xD83E, 0xDD80];
254
255    /// A single UTF-16 unit drawn from the tricky set, a tricky char's units,
256    /// or the whole `u16` space.
257    fn tricky_unit() -> impl Strategy<Value = u16> {
258        let char_units: Vec<u16> = TRICKY_CHARS
259            .iter()
260            .flat_map(|c| {
261                let mut buf = [0u16; 2];
262                c.encode_utf16(&mut buf).to_vec()
263            })
264            .collect();
265        prop_oneof![
266            select(TRICKY_UNITS.to_vec()),
267            select(char_units),
268            any::<u16>(),
269        ]
270    }
271
272    /// A `Vec<u16>` heavily seeded with surrogates and tricky-fold chars.
273    fn tricky_units() -> impl Strategy<Value = Vec<u16>> {
274        prop_vec(tricky_unit(), 0usize..32)
275    }
276
277    /// A `String` built from the tricky-fold chars interleaved with arbitrary
278    /// `char`s — always valid UTF-8, but rich in the fold edge cases.
279    fn tricky_string() -> impl Strategy<Value = String> {
280        prop_vec(
281            prop_oneof![select(TRICKY_CHARS.to_vec()), any::<char>()],
282            0usize..32,
283        )
284        .prop_map(|chars| chars.into_iter().collect())
285    }
286
287    proptest! {
288        // The two pools grow by the same byte count for ANY UTF-16 name — the
289        // shared-offset invariant (ADR-0003), across the whole input space.
290        #[test]
291        fn pools_same_length_for_any_units(units in prop_vec(any::<u16>(), 0usize..64)) {
292            let (mut name, mut lower) = (Vec::new(), Vec::new());
293            push_wtf8_pair(&units, &mut name, &mut lower);
294            prop_assert_eq!(name.len(), lower.len());
295        }
296
297        // Same shared-offset invariant, now forced onto the hard fold cases
298        // (Turkish İ, ß, full-width Latin, surrogate pairs and lone surrogates)
299        // every run instead of relying on `any::<u16>()` to stumble onto them.
300        #[test]
301        fn pools_same_length_for_tricky_units(units in tricky_units()) {
302            let (mut name, mut lower) = (Vec::new(), Vec::new());
303            push_wtf8_pair(&units, &mut name, &mut lower);
304            prop_assert_eq!(name.len(), lower.len());
305        }
306
307        // The WTF-8 name output round-trips back to the original UTF-16 units
308        // (including unpaired surrogates) for ANY input.
309        #[test]
310        fn name_roundtrips_through_utf16(units in prop_vec(any::<u16>(), 0usize..64)) {
311            let (mut name, mut lower) = (Vec::new(), Vec::new());
312            push_wtf8_pair(&units, &mut name, &mut lower);
313            let mut back = Vec::new();
314            wtf8_to_utf16(&name, &mut back);
315            prop_assert_eq!(back, units);
316        }
317
318        // Round-trip on the surrogate-heavy generator: lone surrogates and
319        // emoji pairs must survive the WTF-8 encode/decode unchanged.
320        #[test]
321        fn tricky_name_roundtrips_through_utf16(units in tricky_units()) {
322            let (mut name, mut lower) = (Vec::new(), Vec::new());
323            push_wtf8_pair(&units, &mut name, &mut lower);
324            let mut back = Vec::new();
325            wtf8_to_utf16(&name, &mut back);
326            prop_assert_eq!(back, units);
327        }
328
329        // `fold_str` preserves byte length and is idempotent for ANY UTF-8 string.
330        #[test]
331        fn fold_str_length_preserving_and_idempotent(s in ".*") {
332            let folded = fold_str(&s);
333            prop_assert_eq!(folded.len(), s.len());
334            let twice = fold_str(&folded);
335            prop_assert_eq!(twice, folded);
336        }
337
338        // Same length + idempotence, forced onto the tricky-fold chars — these
339        // are exactly the code points where a non-length-preserving lowering
340        // (İ→i̇, ß→ss, Ⱥ→its shorter form) would break the shared-offset pool.
341        #[test]
342        fn fold_str_length_preserving_and_idempotent_tricky(s in tricky_string()) {
343            let folded = fold_str(&s);
344            prop_assert_eq!(folded.len(), s.len());
345            let twice = fold_str(&folded);
346            prop_assert_eq!(twice, folded);
347        }
348
349        // `has_uppercase` is exactly the "folding changes something" predicate:
350        // it must agree with `fold_str(s) != s` for ANY UTF-8 string. A
351        // disagreement would desync smart-case from the pool it selects.
352        #[test]
353        fn has_uppercase_agrees_with_fold_changing(s in ".*") {
354            prop_assert_eq!(has_uppercase(&s), fold_str(&s) != s);
355        }
356
357        // The same agreement on the fold edge cases, where a smart-case
358        // mistake is most likely (İ is "uppercase" to Unicode but unfoldable by
359        // our rule, so it must read as has_uppercase == false).
360        #[test]
361        fn has_uppercase_agrees_with_fold_changing_tricky(s in tricky_string()) {
362            prop_assert_eq!(has_uppercase(&s), fold_str(&s) != s);
363        }
364
365        // The needle fold (`fold_str`) and the pool fold (`push_wtf8_pair`'s
366        // lower output) must produce identical bytes for any well-formed UTF-8
367        // name — otherwise a case-insensitive needle would not align with the
368        // folded pool it scans. Restricted to inputs free of lone surrogates,
369        // where `String` and the UTF-16 units agree.
370        #[test]
371        fn needle_fold_matches_pool_fold(s in ".*") {
372            let units: Vec<u16> = s.encode_utf16().collect();
373            let (mut name, mut lower) = (Vec::new(), Vec::new());
374            push_wtf8_pair(&units, &mut name, &mut lower);
375            prop_assert_eq!(name, s.as_bytes());
376            let folded = fold_str(&s);
377            prop_assert_eq!(folded.as_bytes(), &lower[..]);
378        }
379
380        // A folded string never contains a code point our rule would still
381        // fold — fixed-point reached in one pass (a sharper idempotence: not
382        // just `fold(fold(s)) == fold(s)` but "nothing left to fold").
383        #[test]
384        fn folded_string_has_no_uppercase(s in tricky_string()) {
385            prop_assert!(!has_uppercase(&fold_str(&s)));
386        }
387    }
388}