Skip to main content

fmf_core\query/
exec.rs

1//! Query execution. Each AND group is driven by a single SIMD sweep over the
2//! contiguous name pool (pool-scan / prefix / suffix drivers) that yields a
3//! sparse candidate list; residual matchers then verify only those
4//! candidates. Groups without a usable literal fall back to a chunked
5//! full scan, and the empty query walks the permutation directly. Results
6//! materialize as O(1)-pageable, sort-ordered id arrays
7//! (docs/ARCHITECTURE.md "query-time materialization").
8
9use rayon::prelude::*;
10
11use super::QueryOptions;
12use super::compile::{CTerm, CompiledQuery, Driver};
13use super::matchers::{EvalCtx, terms_match, terms_match_iter};
14use super::memo::{DirPathsLower, DirPathsOrig, MtimePerm, OffsetTable, PathMemos, SizePerm};
15use super::sweep::driver_candidates;
16use crate::index::{EntryId, SortKey, VolumeIndex};
17
18/// Build (or incrementally extend) exactly the dir-path memos this query
19/// reads — `None` pools cost nothing, which is the whole point of keeping
20/// folded and original-case memos in separate cache slots.
21fn path_memos(idx: &VolumeIndex, q: &CompiledQuery) -> PathMemos {
22    PathMemos {
23        lower: q.needs_folded_paths.then(|| {
24            idx.cached_derived_or_update(|prev| match prev {
25                Some(p) => DirPathsLower::extend_from(idx, p),
26                None => DirPathsLower::build(idx),
27            })
28        }),
29        orig: q.needs_orig_paths.then(|| {
30            idx.cached_derived_or_update(|prev| match prev {
31                Some(p) => DirPathsOrig::extend_from(idx, p),
32                None => DirPathsOrig::build(idx),
33            })
34        }),
35    }
36}
37
38/// 65536 entries per parallel task for full scans.
39const CHUNK: usize = 1 << 16;
40
41/// One volume's query result: the matching ids plus the index generations
42/// they were computed against (for staleness checks and incremental refine).
43pub struct SearchResult {
44    /// Matching entries in the requested sort order.
45    pub ids: Vec<EntryId>,
46    /// Content generation (name/data edits) of the index when these ids were produced.
47    pub content_generation: u64,
48    /// Structural generation (live-set / tree shape) of the index when these ids were produced.
49    pub structural_generation: u64,
50}
51
52/// Per-volume stage timings for [`crate::metrics::QueryTrace`].
53#[derive(Debug, Default, Clone)]
54pub struct SearchMetrics {
55    /// Human-readable label of the driver that executed this query (e.g. `perm-walk`).
56    pub driver: String,
57    /// Time spent building or extending the cached memo/offset structures (µs).
58    pub memo_us: u64,
59    /// Time spent sweeping pools and evaluating residual matchers (µs).
60    pub scan_us: u64,
61    /// Time spent walking the sort permutation to materialize the id array (µs).
62    pub materialize_us: u64,
63    /// Number of entries examined during the scan (count).
64    pub entries_scanned: u64,
65    /// Number of entries skipped because they are hidden/system and excluded (count).
66    pub excluded_skipped: u64,
67}
68
69// ── Search ──────────────────────────────────────────────────────────────
70
71/// Execute a compiled query against one volume index.
72///
73/// # Panics
74///
75/// Panics if a pool-scanning driver runs without its offset table — an
76/// invariant guaranteed here, since the table is built whenever any group
77/// needs it.
78pub fn search(
79    idx: &VolumeIndex,
80    q: &CompiledQuery,
81    opt: &QueryOptions,
82) -> (SearchResult, SearchMetrics) {
83    let mut metrics = SearchMetrics {
84        driver: q.driver_label(),
85        ..Default::default()
86    };
87    let mut stage = crate::metrics::Stage::start();
88    let skip_excluded = !opt.include_hidden_system;
89
90    // Fast path: the empty query (single MatchAll group, no residuals) walks
91    // the permutation directly — no bitmap, no scan.
92    if q.groups.len() == 1
93        && matches!(q.groups[0].driver, Driver::MatchAll)
94        && q.groups[0].terms.is_empty()
95    {
96        metrics.driver = "perm-walk".to_string();
97        metrics.memo_us = stage.lap();
98        let ids = materialize_filtered(idx, opt, |id| {
99            idx.is_live(id) && !(skip_excluded && idx.is_excluded(id))
100        });
101        metrics.entries_scanned = idx.len() as u64;
102        metrics.materialize_us = stage.lap();
103        return (
104            SearchResult {
105                ids,
106                content_generation: idx.content_generation(),
107                structural_generation: idx.structural_generation(),
108            },
109            metrics,
110        );
111    }
112
113    // Generation-cached lookup structures.
114    let memo = path_memos(idx, q);
115    let needs_table = q
116        .groups
117        .iter()
118        .any(|g| !matches!(g.driver, Driver::FullScan | Driver::MatchAll));
119    let table: Option<std::sync::Arc<OffsetTable>> = needs_table.then(|| {
120        idx.cached_derived_or_update(|prev| match prev {
121            Some(p) => OffsetTable::extend_from(idx, p),
122            None => OffsetTable::build(idx),
123        })
124    });
125    metrics.memo_us = stage.lap();
126
127    let n = idx.len();
128    let mut bitmap: Vec<u64> = vec![0u64; n.div_ceil(64)];
129
130    for group in &q.groups {
131        match &group.driver {
132            // MatchAll: no terms in this OR-branch → every live entry matches.
133            // FullScan: a group with no usable literal driver. Both evaluate
134            // the residuals (if any) over every live entry.
135            Driver::MatchAll | Driver::FullScan => {
136                full_scan_group(
137                    idx,
138                    &memo,
139                    &group.terms,
140                    skip_excluded,
141                    &mut bitmap,
142                    &mut metrics,
143                );
144            }
145            driver => {
146                let table = table.as_ref().expect("offset table built");
147                let candidates = driver_candidates(idx, table, driver, skip_excluded);
148                metrics.entries_scanned += candidates.len() as u64;
149                // A case-exact source term makes the folded sweep a superset:
150                // its exact comparison joins the residual pass.
151                if group.terms.is_empty() && group.driver_exact {
152                    for id in candidates {
153                        bitmap[id as usize / 64] |= 1u64 << (id as usize % 64);
154                    }
155                } else {
156                    let passed: Vec<Vec<EntryId>> = candidates
157                        .par_chunks(2048)
158                        .map(|chunk| {
159                            let mut ctx = EvalCtx::default();
160                            chunk
161                                .iter()
162                                .copied()
163                                .filter(|&id| {
164                                    terms_match_iter(
165                                        idx,
166                                        &memo,
167                                        &mut ctx,
168                                        group.residual_terms(),
169                                        id,
170                                    )
171                                })
172                                .collect()
173                        })
174                        .collect();
175                    for ids in passed {
176                        for id in ids {
177                            bitmap[id as usize / 64] |= 1u64 << (id as usize % 64);
178                        }
179                    }
180                }
181            }
182        }
183    }
184    metrics.scan_us = stage.lap();
185
186    let ids = materialize_filtered(idx, opt, |id| {
187        bitmap[id as usize / 64] >> (id as usize % 64) & 1 == 1
188    });
189    metrics.materialize_us = stage.lap();
190
191    (
192        SearchResult {
193            ids,
194            content_generation: idx.content_generation(),
195            structural_generation: idx.structural_generation(),
196        },
197        metrics,
198    )
199}
200
201/// Incremental refinement: when the previous query's result provably
202/// contains the next one's (subsume.rs) and the index generation is
203/// unchanged, filter the cached ids with a *complete* per-entry evaluation
204/// instead of sweeping pools and walking the whole permutation. `prev_ids`
205/// are already in the requested order, so the filtered subsequence is the
206/// answer — O(previous hits) instead of O(index).
207pub fn refine(
208    idx: &VolumeIndex,
209    q: &CompiledQuery,
210    opt: &QueryOptions,
211    prev_ids: &[EntryId],
212) -> (SearchResult, SearchMetrics) {
213    const REFINE_CHUNK: usize = 4096;
214    let mut metrics = SearchMetrics {
215        driver: q.driver_label(),
216        ..Default::default()
217    };
218    let mut stage = crate::metrics::Stage::start();
219    let skip_excluded = !opt.include_hidden_system;
220
221    let memo = path_memos(idx, q);
222    metrics.memo_us = stage.lap();
223
224    let chunks: Vec<Vec<EntryId>> = prev_ids
225        .par_chunks(REFINE_CHUNK)
226        .map(|chunk| {
227            let mut ctx = EvalCtx::default();
228            chunk
229                .iter()
230                .copied()
231                .filter(|&id| {
232                    idx.is_live(id)
233                        && !(skip_excluded && idx.is_excluded(id))
234                        && q.groups
235                            .iter()
236                            .any(|g| terms_match_iter(idx, &memo, &mut ctx, g.all_terms(), id))
237                })
238                .collect()
239        })
240        .collect();
241    metrics.entries_scanned = prev_ids.len() as u64;
242    metrics.scan_us = stage.lap();
243
244    let mut ids = Vec::with_capacity(chunks.iter().map(Vec::len).sum());
245    for c in &chunks {
246        ids.extend_from_slice(c);
247    }
248    metrics.materialize_us = stage.lap();
249
250    (
251        SearchResult {
252            ids,
253            content_generation: idx.content_generation(),
254            structural_generation: idx.structural_generation(),
255        },
256        metrics,
257    )
258}
259
260fn full_scan_group(
261    idx: &VolumeIndex,
262    memo: &PathMemos,
263    terms: &[CTerm],
264    skip_excluded: bool,
265    bitmap: &mut [u64],
266    metrics: &mut SearchMetrics,
267) {
268    let n = idx.len();
269    let chunk_results: Vec<(Vec<u64>, u64, u64)> = (0..n.div_ceil(CHUNK))
270        .into_par_iter()
271        .map(|ci| {
272            let start = ci * CHUNK;
273            let end = (start + CHUNK).min(n);
274            let mut words = vec![0u64; (end - start).div_ceil(64)];
275            let mut ctx = EvalCtx::default();
276            let mut scanned = 0u64;
277            let mut skipped = 0u64;
278            for id in start..end {
279                let id = id as EntryId;
280                if !idx.is_live(id) {
281                    continue;
282                }
283                if skip_excluded && idx.is_excluded(id) {
284                    skipped += 1;
285                    continue;
286                }
287                scanned += 1;
288                if terms_match(idx, memo, &mut ctx, terms, id) {
289                    let rel = id as usize - start;
290                    words[rel / 64] |= 1u64 << (rel % 64);
291                }
292            }
293            (words, scanned, skipped)
294        })
295        .collect();
296
297    let mut word_base = 0usize;
298    for (words, scanned, skipped) in &chunk_results {
299        for (i, w) in words.iter().enumerate() {
300            bitmap[word_base + i] |= w;
301        }
302        word_base += CHUNK / 64;
303        metrics.entries_scanned += scanned;
304        metrics.excluded_skipped += skipped;
305    }
306}
307
308/// Walk the pre-sorted permutation keeping entries that pass `keep` —
309/// parallel chunks, order preserved by concatenation. Name order is the
310/// always-maintained index column; size/mtime orders are lazily derived
311/// caches that build on the first query sorting by them (one parallel sort)
312/// and extend per generation after that.
313fn materialize_filtered(
314    idx: &VolumeIndex,
315    opt: &QueryOptions,
316    keep: impl Fn(EntryId) -> bool + Sync,
317) -> Vec<EntryId> {
318    // Fine-grained chunks: at 2^17 a 1M-entry walk only fans out 8 ways and
319    // the walk becomes the latency floor for every query.
320    const MAT_CHUNK: usize = 1 << 14;
321
322    let size_perm;
323    let mtime_perm;
324    let perm: &[EntryId] = match opt.sort {
325        SortKey::Name => idx.name_permutation(),
326        SortKey::Size => {
327            size_perm = SizePerm::get(idx);
328            &size_perm.0.ids
329        }
330        SortKey::Mtime => {
331            mtime_perm = MtimePerm::get(idx);
332            &mtime_perm.0.ids
333        }
334    };
335    let chunks: Vec<Vec<EntryId>> = perm
336        .par_chunks(MAT_CHUNK)
337        .map(|chunk| chunk.iter().copied().filter(|&id| keep(id)).collect())
338        .collect();
339    let total = chunks.iter().map(Vec::len).sum();
340    let mut ids = Vec::with_capacity(total);
341    if opt.desc {
342        for c in chunks.iter().rev() {
343            ids.extend(c.iter().rev());
344        }
345    } else {
346        for c in &chunks {
347            ids.extend_from_slice(c);
348        }
349    }
350    ids
351}
352
353#[cfg(test)]
354mod tests {
355    use super::super::dates::UtcResolver;
356    use super::super::{CaseMode, QueryOptions, compile, parse};
357    use super::*;
358    use crate::index::{Frn, RawEntry, SortKey, VolumeIndexBuilder};
359
360    fn run(idx: &VolumeIndex, query: &str, opt: QueryOptions) -> Vec<String> {
361        let ast = parse(query).unwrap();
362        let q = compile(&ast, opt.case, &UtcResolver).unwrap();
363        search(idx, &q, &opt)
364            .0
365            .ids
366            .iter()
367            .map(|&id| String::from_utf8_lossy(idx.name(id)).into_owned())
368            .collect()
369    }
370
371    fn names(idx: &VolumeIndex, query: &str) -> Vec<String> {
372        run(idx, query, QueryOptions::default())
373    }
374
375    /// C:\ ├─ Docs\(dir) │ ├─ Report.PDF │ └─ notes.txt ├─ src\(dir) │ └─ main.rs └─ big.BIN
376    fn sample() -> VolumeIndex {
377        let day = 864_000_000_000i64; // FILETIME ticks per day
378        let entries: &[(&str, u64, u64, bool, u64, i64)] = &[
379            // (name, record, parent, is_dir, size, mtime)
380            ("Docs", 10, 5, true, 0, 100 * day),
381            ("Report.PDF", 11, 10, false, 2 << 20, 19_000 * day),
382            ("notes.txt", 12, 10, false, 512, 19_100 * day),
383            ("src", 20, 5, true, 0, 100 * day),
384            ("main.rs", 21, 20, false, 4096, 19_200 * day),
385            ("big.BIN", 30, 5, false, 3 << 30, 19_300 * day),
386        ];
387        let mut b = VolumeIndexBuilder::new("C:", 5);
388        for (name, rec, parent, is_dir, size, mtime) in entries {
389            let units: Vec<u16> = name.encode_utf16().collect();
390            b.push(RawEntry {
391                parent_frn: Frn(*parent),
392                frn: Frn((1 << 48) | rec),
393                name_utf16: &units,
394                is_dir: *is_dir,
395                is_reparse: false,
396                is_hidden: false,
397                is_system: false,
398                size: *size,
399                mtime: *mtime,
400            });
401        }
402        b.finish()
403    }
404
405    #[test]
406    fn substring_smart_case() {
407        let idx = sample();
408        assert_eq!(names(&idx, "report"), vec!["Report.PDF"]);
409        assert_eq!(names(&idx, "Report"), vec!["Report.PDF"]);
410        assert!(names(&idx, "REPORT.pdf").is_empty());
411        let opt = QueryOptions {
412            case: CaseMode::Insensitive,
413            ..Default::default()
414        };
415        assert_eq!(run(&idx, "REPORT", opt), vec!["Report.PDF"]);
416    }
417
418    #[test]
419    fn and_or_not() {
420        let idx = sample();
421        assert_eq!(names(&idx, "main rs"), vec!["main.rs"]);
422        let mut both = names(&idx, "pdf | txt");
423        both.sort();
424        assert_eq!(both, vec!["Report.PDF", "notes.txt"]);
425        assert_eq!(names(&idx, ".txt !notes"), Vec::<String>::new());
426    }
427
428    #[test]
429    fn ext_filter_case_insensitive_and_files_only() {
430        let idx = sample();
431        assert_eq!(names(&idx, "ext:pdf"), vec!["Report.PDF"]);
432        let mut multi = names(&idx, "ext:pdf;rs");
433        multi.sort();
434        assert_eq!(multi, vec!["Report.PDF", "main.rs"]);
435    }
436
437    #[test]
438    fn size_filter_excludes_dirs() {
439        let idx = sample();
440        assert_eq!(names(&idx, "size:>1gb"), vec!["big.BIN"]);
441        assert_eq!(names(&idx, "size:512"), vec!["notes.txt"]);
442        assert!(names(&idx, "size:0 folder:").is_empty());
443    }
444
445    #[test]
446    fn folder_and_file_filters() {
447        let idx = sample();
448        let mut dirs = names(&idx, "folder:");
449        dirs.sort();
450        assert_eq!(dirs, vec!["C:", "Docs", "src"]);
451        assert_eq!(names(&idx, "folder:doc"), vec!["Docs"]);
452    }
453
454    #[test]
455    fn path_terms() {
456        let idx = sample();
457        let mut in_docs = names(&idx, r"docs\");
458        in_docs.sort();
459        // Path semantics: `docs\` matches entries *under* the folder.
460        assert_eq!(in_docs, vec!["Report.PDF", "notes.txt"]);
461        assert_eq!(names(&idx, r"path:src main"), vec!["main.rs"]);
462    }
463
464    #[test]
465    fn wildcards_anchor_to_whole_name() {
466        let idx = sample();
467        assert_eq!(names(&idx, "*.rs"), vec!["main.rs"]);
468        assert_eq!(names(&idx, "main.?s"), vec!["main.rs"]);
469        assert!(names(&idx, "*.r").is_empty());
470        // Specialized prefix / inner forms.
471        assert_eq!(names(&idx, "main*"), vec!["main.rs"]);
472        assert_eq!(names(&idx, "*ain*"), vec!["main.rs"]);
473        assert!(names(&idx, "ain*").is_empty());
474    }
475
476    #[test]
477    fn regex_term() {
478        let idx = sample();
479        assert_eq!(names(&idx, "regex:^ma.n\\.rs$"), vec!["main.rs"]);
480    }
481
482    /// The literal prefilter must never lose a match: a regex query through
483    /// the engine (prefiltered pool sweep + residual, or a full scan when no
484    /// literal exists) must equal a naive `Regex::is_match` over every name,
485    /// for every case mode and a spread of pattern shapes. This is the one
486    /// unacceptable failure mode — a superset prefilter that drops a real hit.
487    #[test]
488    fn regex_matches_naive_oracle() {
489        use regex::bytes::RegexBuilder;
490
491        struct Rng(u64);
492        impl Rng {
493            fn next(&mut self) -> u64 {
494                let mut x = self.0;
495                x ^= x << 13;
496                x ^= x >> 7;
497                x ^= x << 17;
498                self.0 = x;
499                x
500            }
501        }
502        let frags = [
503            "report",
504            "Report",
505            "main",
506            "src",
507            "日本",
508            "語",
509            "data",
510            "DLL",
511            "exe",
512            "img",
513            "2024",
514            "v2",
515            "ファイル",
516            "tmp",
517        ];
518        let exts = [".rs", ".txt", ".PDF", ".dll", ".log", ".日", ""];
519        let mut rng = Rng(0x9E37_79B9);
520        let mut b = VolumeIndexBuilder::new("C:", 5);
521        // The builder seeds the volume root ("C:"); it is a live entry a name
522        // regex sees too (e.g. `.*`), so the naive oracle must include it.
523        let mut made: Vec<String> = vec!["C:".to_string()];
524        for i in 0..300u64 {
525            let mut name = String::new();
526            for _ in 0..=(rng.next() % 3) {
527                name.push_str(frags[rng.next() as usize % frags.len()]);
528            }
529            name.push_str(exts[rng.next() as usize % exts.len()]);
530            if name.is_empty() {
531                name.push('x');
532            }
533            let units: Vec<u16> = name.encode_utf16().collect();
534            b.push(RawEntry {
535                parent_frn: Frn(5),
536                frn: Frn((1 << 48) | (100 + i)),
537                name_utf16: &units,
538                is_dir: false,
539                is_reparse: false,
540                is_hidden: false,
541                is_system: false,
542                size: i,
543                mtime: i as i64,
544            });
545            made.push(name);
546        }
547        let idx = b.finish();
548
549        // Prefilter-bearing (prefix/suffix literal) and literal-less (full
550        // scan) shapes, plus alternations, anchors, multibyte and digits.
551        let patterns = [
552            "^report",
553            "report",
554            "ort",
555            "main.*rs$",
556            "日.*語",
557            "[0-9]+",
558            "v[0-9]",
559            r".*\.rs$",
560            r"\.dll$",
561            "dll|exe",
562            "^$",
563            "Report",
564            "REPORT",
565            "src.*main",
566            r"^\d",
567            ".*",
568            "ファイル",
569            "(report|main)",
570            r"tmp.*\.log$",
571            "日本",
572        ];
573        let cases = [CaseMode::Smart, CaseMode::Insensitive, CaseMode::Sensitive];
574        for pat in patterns {
575            for case in cases {
576                let opt = QueryOptions {
577                    case,
578                    ..Default::default()
579                };
580                let ci = match case {
581                    CaseMode::Insensitive => true,
582                    CaseMode::Sensitive => false,
583                    CaseMode::Smart => !crate::wtf8::has_uppercase(pat),
584                };
585                let re = RegexBuilder::new(pat)
586                    .case_insensitive(ci)
587                    .dot_matches_new_line(true)
588                    .build()
589                    .unwrap();
590                let mut expect: Vec<String> = made
591                    .iter()
592                    .filter(|n| re.is_match(n.as_bytes()))
593                    .cloned()
594                    .collect();
595                expect.sort();
596                // Quote so the parser keeps `|`/`$`/etc. inside one regex atom.
597                let mut got = run(&idx, &format!(r#"regex:"{pat}""#), opt);
598                got.sort();
599                assert_eq!(
600                    got, expect,
601                    "regex `{pat}` (case {case:?}) diverged from the naive oracle"
602                );
603            }
604        }
605    }
606
607    #[test]
608    fn date_filter_utc() {
609        let idx = sample();
610        let r = names(&idx, "dm:>=1650 ext:pdf");
611        assert_eq!(r, vec!["Report.PDF"]);
612    }
613
614    /// Whole-query regex mode (ADR-0023): the entire text is one regex, no
615    /// parsing/operators, matched against the name or the full path per scope.
616    #[test]
617    fn whole_regex_mode_name_and_path_scope() {
618        use super::super::{RegexScope, compile_whole_regex};
619
620        let idx = sample();
621        let run_re = |pat: &str, scope: RegexScope| {
622            let q = compile_whole_regex(pat, CaseMode::Smart, scope).unwrap();
623            let mut got: Vec<String> = search(&idx, &q, &QueryOptions::default())
624                .0
625                .ids
626                .iter()
627                .map(|&id| String::from_utf8_lossy(idx.name(id)).into_owned())
628                .collect();
629            got.sort();
630            got
631        };
632
633        // Name scope: the text is a regex over the file name. Operators that
634        // the normal parser would treat as AND/OR are regex metachars here.
635        assert_eq!(run_re(r"\.rs$", RegexScope::Name), vec!["main.rs"]);
636        assert_eq!(run_re("^report", RegexScope::Name), vec!["Report.PDF"]); // smart-case ci
637        assert_eq!(
638            run_re("pdf|txt", RegexScope::Name),
639            vec!["Report.PDF", "notes.txt"]
640        );
641        // A literal-less pattern falls back to the full scan and still matches.
642        assert!(run_re(r"[0-9]", RegexScope::Name).is_empty()); // no sample name has a digit
643
644        // Path scope: matched against the full path (parent included).
645        assert_eq!(
646            run_re(r"docs.*\.pdf$", RegexScope::Path),
647            vec!["Report.PDF"]
648        );
649
650        // Invalid / oversized patterns are a compile error, never a panic.
651        assert!(compile_whole_regex("(", CaseMode::Smart, RegexScope::Name).is_err());
652        assert!(compile_whole_regex("(a{500}){500}", CaseMode::Smart, RegexScope::Name).is_err());
653    }
654
655    #[test]
656    fn empty_query_matches_all_live_entries() {
657        let idx = sample();
658        assert_eq!(names(&idx, "").len(), idx.live_len());
659    }
660
661    #[test]
662    fn sort_orders() {
663        let idx = sample();
664        let by_size = run(
665            &idx,
666            "file:",
667            QueryOptions {
668                sort: SortKey::Size,
669                ..Default::default()
670            },
671        );
672        assert_eq!(
673            by_size,
674            vec!["notes.txt", "main.rs", "Report.PDF", "big.BIN"]
675        );
676        let by_size_desc = run(
677            &idx,
678            "file:",
679            QueryOptions {
680                sort: SortKey::Size,
681                desc: true,
682                ..Default::default()
683            },
684        );
685        assert_eq!(
686            by_size_desc,
687            vec!["big.BIN", "Report.PDF", "main.rs", "notes.txt"]
688        );
689    }
690
691    #[test]
692    fn tombstones_are_excluded() {
693        let mut idx = sample();
694        idx.delete(11); // Report.PDF
695        assert!(names(&idx, "report").is_empty());
696    }
697
698    #[test]
699    fn hidden_system_excluded_by_default_and_toggleable() {
700        let mut b = VolumeIndexBuilder::new("C:", 5);
701        let mk = |name: &str| name.encode_utf16().collect::<Vec<u16>>();
702        let (bin, ghost, vis) = (mk("$Recycle.Bin"), mk("ghost.txt"), mk("visible.txt"));
703        let mut push = |rec: u64, parent: u64, name: &[u16], is_dir, is_system| {
704            b.push(RawEntry {
705                parent_frn: Frn(parent),
706                frn: Frn((1 << 48) | rec),
707                name_utf16: name,
708                is_dir,
709                is_reparse: false,
710                is_hidden: false,
711                is_system,
712                size: 1,
713                mtime: 1,
714            });
715        };
716        push(10, 5, &bin, true, true);
717        push(11, 10, &ghost, false, false);
718        push(20, 5, &vis, false, false);
719        let idx = b.finish();
720
721        assert_eq!(names(&idx, "txt"), vec!["visible.txt"]);
722        assert_eq!(names(&idx, "").len(), 2); // root + visible.txt
723
724        let all = run(
725            &idx,
726            "txt",
727            QueryOptions {
728                include_hidden_system: true,
729                ..Default::default()
730            },
731        );
732        let mut all = all;
733        all.sort();
734        assert_eq!(all, vec!["ghost.txt", "visible.txt"]);
735    }
736
737    /// Pool-scan vs naive oracle over deterministic pseudo-random names,
738    /// including multibyte, surrogate pairs, boundary-adjacent repeats and
739    /// post-rename stale pool gaps.
740    #[test]
741    fn pool_scan_matches_naive_oracle() {
742        use std::fmt::Write as _;
743
744        struct Rng(u64);
745        impl Rng {
746            fn next(&mut self) -> u64 {
747                let mut x = self.0;
748                x ^= x << 13;
749                x ^= x >> 7;
750                x ^= x << 17;
751                self.0 = x;
752                x
753            }
754        }
755        let fragments = [
756            "ab",
757            "abc",
758            "日本",
759            "語",
760            "x",
761            "𠮷",
762            "report",
763            "Report",
764            "ort",
765            "tab",
766            "TAB",
767            "ba",
768            "ファイル",
769        ];
770        let mut rng = Rng(42);
771        let mut b = VolumeIndexBuilder::new("C:", 5);
772        let mut names_made: Vec<String> = Vec::new();
773        for i in 0..500u64 {
774            let mut name = String::new();
775            for _ in 0..=(rng.next() % 4) {
776                name.push_str(fragments[(rng.next() % fragments.len() as u64) as usize]);
777            }
778            write!(name, "_{i}").unwrap();
779            let units: Vec<u16> = name.encode_utf16().collect();
780            b.push(RawEntry {
781                parent_frn: Frn(5),
782                frn: Frn((1 << 48) | (100 + i)),
783                name_utf16: &units,
784                is_dir: false,
785                is_reparse: false,
786                is_hidden: false,
787                is_system: false,
788                size: i,
789                mtime: i as i64,
790            });
791            names_made.push(name);
792        }
793        let mut idx = b.finish();
794        // Create stale pool gaps: rename a slice of entries.
795        for i in (0..500u64).step_by(7) {
796            let new_name = format!("renamed_{i}_abba");
797            let units: Vec<u16> = new_name.encode_utf16().collect();
798            let first_new = idx.len() as u32;
799            idx.upsert(&RawEntry {
800                parent_frn: Frn(5),
801                frn: Frn((1 << 48) | (100 + i)),
802                name_utf16: &units,
803                is_dir: false,
804                is_reparse: false,
805                is_hidden: false,
806                is_system: false,
807                size: i,
808                mtime: i as i64,
809            });
810            idx.merge_new_into_permutations(first_new);
811            names_made[i as usize] = new_name;
812        }
813
814        let needles = [
815            "ab",
816            "abc",
817            "ba",
818            "abba",
819            "日本",
820            "語x",
821            "𠮷",
822            "report",
823            "ort",
824            "tab",
825            "_3",
826            "renamed",
827            "zzz_nothing",
828        ];
829        for needle in needles {
830            let mut expect: Vec<String> = names_made
831                .iter()
832                .filter(|n| n.to_lowercase().contains(needle))
833                .cloned()
834                .collect();
835            expect.sort();
836            let mut got = names(&idx, needle);
837            got.sort();
838            assert_eq!(got, expect, "needle `{needle}` diverged from oracle");
839        }
840
841        // Smart-case uppercase needles: the folded sweep over-approximates
842        // (case-exact terms drive as folded supersets) and the exact
843        // residual must filter back to byte-exact matches.
844        let upper_needles = ["Report", "Rep", "TAB", "Ort"];
845        assert!(
846            names_made.iter().any(|n| n.contains("Report")),
847            "oracle coverage: mixed-case names must exist"
848        );
849        for needle in upper_needles {
850            let mut expect: Vec<String> = names_made
851                .iter()
852                .filter(|n| n.contains(needle))
853                .cloned()
854                .collect();
855            expect.sort();
856            let mut got = names(&idx, needle);
857            got.sort();
858            assert_eq!(got, expect, "smart-case needle `{needle}` diverged");
859        }
860        // Sensitive mode: every needle is byte-exact, lowercase ones too.
861        let sensitive = QueryOptions {
862            case: CaseMode::Sensitive,
863            ..Default::default()
864        };
865        for needle in needles.iter().chain(upper_needles.iter()) {
866            let mut expect: Vec<String> = names_made
867                .iter()
868                .filter(|n| n.contains(needle))
869                .cloned()
870                .collect();
871            expect.sort();
872            let mut got = run(&idx, needle, sensitive);
873            got.sort();
874            assert_eq!(got, expect, "sensitive needle `{needle}` diverged");
875        }
876
877        // ── Refine oracle: for every subsumed step of a typing sequence,
878        // refining the previous ids must equal a fresh search, byte for
879        // byte and in order. False positives here lose results — the one
880        // unacceptable failure mode of the query cache.
881        let sequences: &[&[&str]] = &[
882            &["", "a", "ab", "abb", "abba"],
883            &["r", "re", "rep", "repo", "report"],
884            &["ab", "ab ba", "ab ba _3"],
885            &["日", "日本", "日本 語"],
886            &["t", "ta", "tab", "tab *ab*"],
887            &["re", "renamed", "renamed !zzz"],
888            &["a", "a size:>100", "a size:>100 size:<400"],
889            &["", "x", "x𠮷"],
890            &["or", "Ort"], // smart-case flip: folded prev → orig next
891        ];
892        let opts = [
893            QueryOptions::default(),
894            QueryOptions {
895                sort: SortKey::Size,
896                desc: true,
897                ..Default::default()
898            },
899        ];
900        let mut refined_pairs = 0;
901        for opt in opts {
902            for seq in sequences {
903                let mut prev: Option<(super::super::CompiledQuery, Vec<EntryId>)> = None;
904                for text in *seq {
905                    let q = compile(&parse(text).unwrap(), opt.case, &UtcResolver).unwrap();
906                    let fresh = search(&idx, &q, &opt).0.ids;
907                    if let Some((pq, pids)) = &prev
908                        && super::super::subsumes(pq, &opt, &q, &opt)
909                    {
910                        let refined = refine(&idx, &q, &opt, pids).0.ids;
911                        assert_eq!(
912                            refined, fresh,
913                            "refine diverged from fresh search at `{text}` (sort {:?})",
914                            opt.sort
915                        );
916                        refined_pairs += 1;
917                    }
918                    prev = Some((q, fresh));
919                }
920            }
921        }
922        assert!(
923            refined_pairs >= 20,
924            "subsumption barely fired ({refined_pairs} pairs) — the oracle is vacuous"
925        );
926    }
927}
928
929#[cfg(test)]
930mod proptests {
931    use std::sync::atomic::{AtomicU64, Ordering};
932
933    use proptest::prelude::*;
934
935    use super::super::dates::UtcResolver;
936    use super::super::{
937        CaseMode, CompiledQuery, QueryOptions, RegexScope, compile, compile_whole_regex, parse,
938        subsumes,
939    };
940    use super::{EntryId, refine, search};
941    use crate::index::{Frn, RawEntry, SortKey, VolumeIndex, VolumeIndexBuilder};
942
943    // Counts every subsumed pair the refine-vs-fresh oracle actually checked,
944    // summed across *all* generated proptest cases. The vacuity guard
945    // (`subsumption_fires_at_least_sometimes`) reads it: without it a green run
946    // could simply mean subsumption never fired and the equality was never
947    // exercised.
948    static REFINED_PAIRS: AtomicU64 = AtomicU64::new(0);
949
950    /// Name fragments spanning the matcher domains the subsumption algebra
951    /// bridges: ASCII case pairs (smart-case fold ↔ orig), multibyte UTF-8
952    /// (fold is length-preserving per code point), a surrogate pair, and
953    /// extension-shaped tails. Short and overlapping so random concatenations
954    /// collide and the typed prefixes/suffixes actually subsume.
955    const FRAGMENTS: &[&str] = &[
956        "ab", "abc", "Re", "report", "Report", "ort", "tab", "TAB", "日本", "語", "𠮷", "x",
957        "main", ".rs", ".txt", ".PDF",
958    ];
959
960    /// One generated index entry: a name plus the attributes the option
961    /// cross-product reads (dir/hidden/system for visibility, size/mtime for
962    /// the sort columns).
963    #[derive(Debug, Clone)]
964    struct GenEntry {
965        name: String,
966        is_dir: bool,
967        is_hidden: bool,
968        is_system: bool,
969        size: u64,
970        mtime: i64,
971    }
972
973    fn entry_strategy() -> impl Strategy<Value = GenEntry> {
974        (
975            proptest::collection::vec(0usize..FRAGMENTS.len(), 1..=4),
976            any::<bool>(),
977            any::<bool>(),
978            any::<bool>(),
979            0u64..(8u64 << 30),
980            0i64..(20_000i64 * 864_000_000_000),
981        )
982            .prop_map(|(parts, is_dir, is_hidden, is_system, size, mtime)| {
983                let name: String = parts.iter().map(|&i| FRAGMENTS[i]).collect();
984                GenEntry {
985                    name,
986                    is_dir,
987                    is_hidden,
988                    is_system,
989                    size,
990                    mtime,
991                }
992            })
993    }
994
995    fn build_index(entries: &[GenEntry]) -> VolumeIndex {
996        let mut b = VolumeIndexBuilder::new("C:", 5);
997        for (i, e) in entries.iter().enumerate() {
998            let units: Vec<u16> = e.name.encode_utf16().collect();
999            b.push(RawEntry {
1000                parent_frn: Frn(5),
1001                frn: Frn((1 << 48) | (100 + i as u64)),
1002                name_utf16: &units,
1003                is_dir: e.is_dir,
1004                is_reparse: false,
1005                is_hidden: e.is_hidden,
1006                is_system: e.is_system,
1007                size: e.size,
1008                mtime: e.mtime,
1009            });
1010        }
1011        b.finish()
1012    }
1013
1014    /// A typed-query growth step: each variant appends to the previous text so
1015    /// the result *usually* narrows — the case subsumption is designed for.
1016    /// Suffix/filter steps mix in the non-name matcher domains.
1017    #[derive(Debug, Clone)]
1018    enum Step {
1019        /// Append the next character of a fragment (incremental typing).
1020        Frag(usize),
1021        /// Append a whole extra name term (AND narrows further).
1022        Term(usize),
1023        /// Append a `*lit*` wildcard term.
1024        Wildcard(usize),
1025        /// Append a negated name term.
1026        Not(usize),
1027        /// Append a `size:` lower-bound filter.
1028        Size(u64),
1029        /// Append a `file:` / `folder:` filter.
1030        IsDir(bool),
1031        /// Append an `ext:` filter.
1032        Ext(usize),
1033        /// A backspace-like reset to a shorter prefix (deliberately *widens*,
1034        /// so subsumption must decline — keeps the oracle honest).
1035        Truncate,
1036    }
1037
1038    fn step_strategy() -> impl Strategy<Value = Step> {
1039        prop_oneof![
1040            (0usize..FRAGMENTS.len()).prop_map(Step::Frag),
1041            (0usize..FRAGMENTS.len()).prop_map(Step::Term),
1042            (0usize..FRAGMENTS.len()).prop_map(Step::Wildcard),
1043            (0usize..FRAGMENTS.len()).prop_map(Step::Not),
1044            (1u64..(4u64 << 30)).prop_map(Step::Size),
1045            any::<bool>().prop_map(Step::IsDir),
1046            (0usize..3usize).prop_map(Step::Ext),
1047            Just(Step::Truncate),
1048        ]
1049    }
1050
1051    /// Turn a step sequence into a growing list of query texts. `Truncate`
1052    /// chops the trailing whitespace-delimited atom; every other step appends.
1053    fn texts_from_steps(steps: &[Step]) -> Vec<String> {
1054        use std::fmt::Write as _;
1055        const EXTS: &[&str] = &["rs", "txt", "pdf"];
1056        let mut cur = String::new();
1057        let mut out = vec![cur.clone()];
1058        for step in steps {
1059            match step {
1060                Step::Frag(i) => cur.push_str(FRAGMENTS[*i]),
1061                Step::Term(i) => {
1062                    cur.push(' ');
1063                    cur.push_str(FRAGMENTS[*i]);
1064                }
1065                Step::Wildcard(i) => {
1066                    cur.push_str(" *");
1067                    cur.push_str(FRAGMENTS[*i].trim_start_matches('.'));
1068                    cur.push('*');
1069                }
1070                Step::Not(i) => {
1071                    cur.push_str(" !");
1072                    cur.push_str(FRAGMENTS[*i].trim_start_matches('.'));
1073                }
1074                Step::Size(min) => {
1075                    write!(cur, " size:>{min}").expect("writing to a String is infallible");
1076                }
1077                Step::IsDir(d) => {
1078                    cur.push_str(if *d { " folder:" } else { " file:" });
1079                }
1080                Step::Ext(i) => {
1081                    cur.push_str(" ext:");
1082                    cur.push_str(EXTS[*i]);
1083                }
1084                Step::Truncate => {
1085                    let trimmed = cur.trim_end();
1086                    let keep = trimmed.rfind(char::is_whitespace).map_or(0, |p| p);
1087                    cur.truncate(keep);
1088                }
1089            }
1090            out.push(cur.clone());
1091        }
1092        out
1093    }
1094
1095    /// Compile one query text honoring whole-query regex mode (then the text
1096    /// *is* the pattern). Returns `None` when the text fails to compile (an
1097    /// invalid regex fragment); the caller skips that step.
1098    fn compile_text(text: &str, opt: &QueryOptions) -> Option<CompiledQuery> {
1099        if opt.regex_mode {
1100            compile_whole_regex(text, opt.case, opt.regex_scope).ok()
1101        } else {
1102            compile(&parse(text).ok()?, opt.case, &UtcResolver).ok()
1103        }
1104    }
1105
1106    fn options_strategy() -> impl Strategy<Value = QueryOptions> {
1107        (
1108            prop_oneof![
1109                Just(SortKey::Name),
1110                Just(SortKey::Size),
1111                Just(SortKey::Mtime)
1112            ],
1113            any::<bool>(),
1114            prop_oneof![
1115                Just(CaseMode::Smart),
1116                Just(CaseMode::Insensitive),
1117                Just(CaseMode::Sensitive)
1118            ],
1119            any::<bool>(),
1120            any::<bool>(),
1121            prop_oneof![Just(RegexScope::Name), Just(RegexScope::Path)],
1122        )
1123            .prop_map(
1124                |(sort, desc, case, include_hidden_system, regex_mode, regex_scope)| QueryOptions {
1125                    sort,
1126                    desc,
1127                    case,
1128                    include_hidden_system,
1129                    regex_mode,
1130                    regex_scope,
1131                },
1132            )
1133    }
1134
1135    proptest! {
1136        // The one unacceptable failure mode (subsume.rs): whenever
1137        // `subsumes(prev, next)` claims the next result fits inside the
1138        // previous one, `refine(prev_ids)` must reproduce `search(next)`
1139        // EXACTLY — identical id set *and* order — across the full option
1140        // cross-product (sort × desc × hidden/system × case × regex × scope)
1141        // and a random index. A divergence here is a real soundness bug:
1142        // refine would silently drop or misorder live results. Both prev and
1143        // next compile under the *same* options (the real refine precondition:
1144        // a typing session keeps its options and index generation fixed).
1145        #![proptest_config(ProptestConfig::with_cases(256))]
1146        #[test]
1147        fn refine_equals_fresh_whenever_subsumed(
1148            entries in proptest::collection::vec(entry_strategy(), 1..40),
1149            steps in proptest::collection::vec(step_strategy(), 1..8),
1150            opt in options_strategy(),
1151        ) {
1152            let idx = build_index(&entries);
1153            let texts = texts_from_steps(&steps);
1154
1155            let mut prev: Option<(CompiledQuery, Vec<EntryId>)> = None;
1156            for text in &texts {
1157                let Some(q) = compile_text(text, &opt) else {
1158                    // An uncompilable step breaks the chain — a fresh search
1159                    // would start over, so drop the cached predecessor too.
1160                    prev = None;
1161                    continue;
1162                };
1163                let fresh = search(&idx, &q, &opt).0.ids;
1164
1165                if let Some((pq, pids)) = &prev
1166                    && subsumes(pq, &opt, &q, &opt)
1167                {
1168                    let refined = refine(&idx, &q, &opt, pids).0.ids;
1169                    prop_assert_eq!(
1170                        &refined,
1171                        &fresh,
1172                        "refine diverged from fresh search at `{}` (opt {:?})",
1173                        text,
1174                        opt
1175                    );
1176                    REFINED_PAIRS.fetch_add(1, Ordering::Relaxed);
1177                }
1178                prev = Some((q, fresh));
1179            }
1180        }
1181    }
1182
1183    // Vacuity guard: the proptest above is only meaningful if subsumption
1184    // actually fired and the refine==fresh equality was exercised. proptest
1185    // runs every `#[test]` in this module in the same process, so the shared
1186    // counter is populated by the time this (alphabetically later) test runs.
1187    // The threshold is deliberately well below the typical count (hundreds of
1188    // subsumed pairs over 256 cases) so it tolerates shrinking/seed variance
1189    // while still catching a regression that makes subsumption never fire.
1190    #[test]
1191    fn subsumption_fires_at_least_sometimes() {
1192        // Drive a deterministic spread of options directly so the guard does
1193        // not rely on the fuzz test's nondeterministic ordering: incremental
1194        // typing under each sort/case must subsume at least the obvious steps.
1195        let entries: Vec<GenEntry> = (0..30u64)
1196            .map(|i| {
1197                let name = format!(
1198                    "{}{}{}",
1199                    FRAGMENTS[i as usize % FRAGMENTS.len()],
1200                    FRAGMENTS[(i as usize + 3) % FRAGMENTS.len()],
1201                    if i % 2 == 0 { ".rs" } else { ".txt" },
1202                );
1203                GenEntry {
1204                    name,
1205                    is_dir: i % 5 == 0,
1206                    is_hidden: i % 7 == 0,
1207                    is_system: i % 11 == 0,
1208                    size: i * 1024,
1209                    mtime: i as i64 * 864_000_000_000,
1210                }
1211            })
1212            .collect();
1213        let idx = build_index(&entries);
1214
1215        let mut local_fired = 0u64;
1216        for sort in [SortKey::Name, SortKey::Size, SortKey::Mtime] {
1217            for case in [CaseMode::Smart, CaseMode::Insensitive, CaseMode::Sensitive] {
1218                let opt = QueryOptions {
1219                    sort,
1220                    case,
1221                    ..Default::default()
1222                };
1223                let seq = ["", "r", "re", "rep", "repo", "report", "report .rs"];
1224                let mut prev: Option<(CompiledQuery, Vec<EntryId>)> = None;
1225                for text in seq {
1226                    let q = compile_text(text, &opt).expect("static texts compile");
1227                    let fresh = search(&idx, &q, &opt).0.ids;
1228                    if let Some((pq, pids)) = &prev
1229                        && subsumes(pq, &opt, &q, &opt)
1230                    {
1231                        let refined = refine(&idx, &q, &opt, pids).0.ids;
1232                        assert_eq!(
1233                            refined, fresh,
1234                            "refine diverged from fresh search at `{text}` (opt {opt:?})"
1235                        );
1236                        local_fired += 1;
1237                    }
1238                    prev = Some((q, fresh));
1239                }
1240            }
1241        }
1242        assert!(
1243            local_fired >= 9,
1244            "subsumption barely fired ({local_fired}) — the oracle is vacuous"
1245        );
1246        // Fold the fuzz test's tally in too (best-effort: it may not have run
1247        // yet under filtered/sharded runs, hence the independent local check).
1248        REFINED_PAIRS.fetch_add(local_fired, Ordering::Relaxed);
1249    }
1250}