Skip to main content

fmf_core\query/
sweep.rs

1use rayon::prelude::*;
2
3use super::compile::Driver;
4use super::memo::OffsetTable;
5use crate::index::{EntryId, VolumeIndex};
6
7// ── Drivers ─────────────────────────────────────────────────────────────
8
9/// Run a pool-sweep driver and return live (and, when filtering, non-
10/// excluded) candidate entries. Hits are validated against entry
11/// boundaries: pool bytes from renamed-away names ("stale gaps") and
12/// matches spanning two names map outside their entry's range and are
13/// rejected.
14pub(super) fn driver_candidates(
15    idx: &VolumeIndex,
16    table: &OffsetTable,
17    driver: &Driver,
18    skip_excluded: bool,
19) -> Vec<EntryId> {
20    // The folded pool is the only contiguous one; case-exact drivers sweep
21    // it with a folded needle (superset — original-case match implies the
22    // folded match) and the exact comparison runs as a residual.
23    let pool: &[u8] = idx.lower_pool_bytes();
24    if table.len() == 0 || pool.is_empty() {
25        return Vec::new();
26    }
27
28    // Over-split so uneven hit densities still balance across threads.
29    let threads = rayon::current_num_threads().max(1) * 4;
30    let per = table.len().div_ceil(threads).max(1);
31    let ranges: Vec<(usize, usize)> = (0..table.len())
32        .step_by(per)
33        .map(|s| (s, (s + per).min(table.len())))
34        .collect();
35
36    let accept = |id: EntryId| idx.is_live(id) && !(skip_excluded && idx.is_excluded(id));
37
38    let chunks: Vec<Vec<EntryId>> = ranges
39        .into_par_iter()
40        .map(|(ks, ke)| {
41            let pool_start = table.offs[ks] as usize;
42            let pool_end = if ke < table.len() {
43                table.offs[ke] as usize
44            } else {
45                pool.len()
46            };
47            let hay = &pool[pool_start..pool_end];
48            let mut out = Vec::new();
49
50            let mut sweep =
51                |needle_len: usize,
52                 find: &mut dyn FnMut(&[u8]) -> Option<usize>,
53                 anchor: &mut dyn FnMut(usize, usize, usize) -> bool| {
54                    let mut pos = 0usize;
55                    // Hits arrive in increasing pool order, so the entry
56                    // cursor advances monotonically — amortized O(1) per hit
57                    // instead of a binary search.
58                    let mut k = ks;
59                    while pos < hay.len() {
60                        let Some(rel) = find(&hay[pos..]) else { break };
61                        let hit = pool_start + pos + rel;
62                        while k + 1 < ke && (table.offs[k + 1] as usize) <= hit {
63                            k += 1;
64                        }
65                        let id = table.ids[k];
66                        let off = table.offs[k] as usize;
67                        // A pair whose entry has since moved its name (in-
68                        // place dir rename, incremental table) covers dead
69                        // bytes — skip its whole region like a stale gap.
70                        if idx.name_off_of(id) as usize != off {
71                            let next = if k + 1 < table.len() {
72                                (table.offs[k + 1] as usize).min(pool_end)
73                            } else {
74                                pool_end
75                            };
76                            pos = next.max(hit + 1) - pool_start;
77                            continue;
78                        }
79                        let end = off + idx.name_len_of(id);
80                        if hit + needle_len <= end && anchor(hit, off, end) {
81                            if accept(id) {
82                                out.push(id);
83                            }
84                            // One hit per entry is enough: resume at its end.
85                            pos = end - pool_start;
86                        } else if hit >= end {
87                            // Stale gap between entries: jump to the next entry.
88                            let next = if k + 1 < table.len() {
89                                (table.offs[k + 1] as usize).min(pool_end)
90                            } else {
91                                pool_end
92                            };
93                            pos = next.max(hit + 1) - pool_start;
94                        } else {
95                            pos = hit + 1 - pool_start;
96                        }
97                    }
98                };
99
100            match driver {
101                Driver::Sub {
102                    finder, needle_len, ..
103                } => {
104                    sweep(*needle_len, &mut |h| finder.find(h), &mut |_, _, _| true);
105                }
106                Driver::Prefix { bytes, .. } => {
107                    let finder = memchr::memmem::Finder::new(bytes);
108                    sweep(bytes.len(), &mut |h| finder.find(h), &mut |hit, off, _| {
109                        hit == off
110                    });
111                }
112                Driver::Suffixes {
113                    suffixes,
114                    files_only,
115                    ..
116                } => {
117                    // Anchored tails defeat memmem's rare-byte prefilter
118                    // ('.' occurs in almost every name), so a sequential
119                    // table-order tail compare wins here.
120                    for k in ks..ke {
121                        let id = table.ids[k];
122                        let off = table.offs[k] as usize;
123                        if idx.name_off_of(id) as usize != off {
124                            continue; // stale pair: dead bytes
125                        }
126                        let name = &pool[off..off + idx.name_len_of(id)];
127                        if suffixes.iter().any(|s| name.ends_with(s))
128                            && (!*files_only || !idx.is_dir(id))
129                            && accept(id)
130                        {
131                            out.push(id);
132                        }
133                    }
134                }
135                _ => unreachable!(),
136            }
137            out
138        })
139        .collect();
140    chunks.concat()
141}
142
143#[cfg(test)]
144mod tests {
145    use memchr::memmem;
146
147    use super::*;
148    use crate::index::VolumeIndexBuilder;
149    use crate::index::testutil::{raw, raw_attr, u16s};
150
151    fn sub_driver(needle: &str) -> Driver {
152        Driver::Sub {
153            finder: memmem::Finder::new(needle.as_bytes()).into_owned(),
154            needle_len: needle.len(),
155        }
156    }
157
158    /// `driver_candidates` over a fresh table, id-sorted for stable assertions.
159    fn run(idx: &VolumeIndex, driver: &Driver, skip_excluded: bool) -> Vec<EntryId> {
160        let table = OffsetTable::build(idx);
161        let mut ids = driver_candidates(idx, &table, driver, skip_excluded);
162        ids.sort_unstable();
163        ids
164    }
165
166    #[test]
167    fn hits_spanning_two_names_are_rejected() {
168        // 4000 entries guarantee multi-entry sweep chunks regardless of the
169        // rayon thread count, so boundary-spanning hits are actually found
170        // by the finder and must be rejected by the anchor check.
171        let mut b = VolumeIndexBuilder::new("C:", 5);
172        let name = u16s("abcd");
173        for i in 0..4000u64 {
174            b.push(raw(100 + i, 5, &name, false, i, i as i64));
175        }
176        let idx = b.finish();
177        // "cdab" only ever occurs across an "abcd|abcd" boundary.
178        assert!(run(&idx, &sub_driver("cdab"), true).is_empty());
179        // Control: in-name hits return every live entry exactly once.
180        assert_eq!(
181            run(&idx, &sub_driver("abcd"), true),
182            (1..=4000).collect::<Vec<EntryId>>()
183        );
184    }
185
186    #[test]
187    fn repeated_hits_inside_one_name_yield_a_single_candidate() {
188        let mut b = VolumeIndexBuilder::new("C:", 5);
189        let name = u16s("ababab");
190        b.push(raw(10, 5, &name, false, 1, 1));
191        let idx = b.finish();
192        let id = idx.entry_by_record(10).unwrap();
193        assert_eq!(run(&idx, &sub_driver("ab"), true), vec![id]);
194    }
195
196    #[test]
197    fn stale_gap_from_dir_rename_is_skipped() {
198        let mut b = VolumeIndexBuilder::new("C:", 5);
199        let (a, d, z) = (u16s("aaa.txt"), u16s("needledir"), u16s("needle_zzz"));
200        b.push(raw(10, 5, &a, false, 1, 1));
201        b.push(raw(20, 5, &d, true, 0, 2));
202        b.push(raw(30, 5, &z, false, 1, 3));
203        let mut idx = b.finish();
204        let renamed = u16s("renamed");
205        idx.rename_dir_in_place(20, &renamed, 5).unwrap();
206        idx.merge_new_into_permutations(idx.len() as u32);
207
208        let dir = idx.entry_by_record(20).unwrap();
209        let zzz = idx.entry_by_record(30).unwrap();
210        // The old dir name bytes are now a stale gap: hits there map to no
211        // entry and must not stop the sweep from reaching later entries.
212        assert_eq!(run(&idx, &sub_driver("needle"), true), vec![zzz]);
213        // A needle that only occurs inside the gap yields nothing.
214        assert!(run(&idx, &sub_driver("needledir"), true).is_empty());
215        // The appended new name is reachable through the re-sorted table.
216        assert_eq!(run(&idx, &sub_driver("renamed"), true), vec![dir]);
217    }
218
219    #[test]
220    fn tombstoned_entries_are_dropped_even_on_pool_hits() {
221        let mut b = VolumeIndexBuilder::new("C:", 5);
222        let old = u16s("aaa.txt");
223        b.push(raw(10, 5, &old, false, 1, 1));
224        let mut idx = b.finish();
225        let first_new = idx.len() as u32;
226        let renamed = u16s("bbb.txt");
227        idx.upsert(&raw(10, 5, &renamed, false, 1, 2));
228        idx.merge_new_into_permutations(first_new);
229
230        // The tombstoned entry still owns its pool bytes and table slot,
231        // but a hit on it must not surface.
232        assert!(run(&idx, &sub_driver("aaa"), true).is_empty());
233        let new_id = idx.entry_by_record(10).unwrap();
234        assert_eq!(run(&idx, &sub_driver("bbb"), true), vec![new_id]);
235    }
236
237    #[test]
238    fn prefix_driver_rejects_non_anchored_hits() {
239        let mut b = VolumeIndexBuilder::new("C:", 5);
240        let (a, z) = (u16s("abc.txt"), u16s("zzabc.txt"));
241        b.push(raw(10, 5, &a, false, 1, 1));
242        b.push(raw(20, 5, &z, false, 1, 2));
243        let idx = b.finish();
244        let abc = idx.entry_by_record(10).unwrap();
245        let driver = Driver::Prefix {
246            bytes: b"abc".to_vec(),
247        };
248        // "zzabc.txt" contains the needle but not at the name start.
249        assert_eq!(run(&idx, &driver, true), vec![abc]);
250    }
251
252    #[test]
253    fn suffixes_driver_files_only_excluded_and_multi_suffix() {
254        let mut b = VolumeIndexBuilder::new("C:", 5);
255        let (bld, trc, txt, old, gho) = (
256            u16s("build.log"),
257            u16s("trace.log"),
258            u16s("notes.txt"),
259            u16s("old.log"),
260            u16s("ghost.log"),
261        );
262        b.push(raw(10, 5, &bld, true, 0, 1)); // directory named *.log
263        b.push(raw(20, 5, &trc, false, 1, 2));
264        b.push(raw(30, 5, &txt, false, 1, 3));
265        b.push(raw(40, 5, &old, false, 1, 4));
266        b.push(raw_attr(50, 5, &gho, false, true, false)); // hidden
267        let mut idx = b.finish();
268        idx.delete(40); // tombstoned *.log
269        let dir = idx.entry_by_record(10).unwrap();
270        let trace = idx.entry_by_record(20).unwrap();
271        let notes = idx.entry_by_record(30).unwrap();
272        let ghost = idx.entry_by_record(50).unwrap();
273
274        let log = |files_only: bool| Driver::Suffixes {
275            suffixes: vec![b".log".to_vec()],
276            files_only,
277        };
278        // files_only drops the dir; tombstone and hidden drop implicitly.
279        assert_eq!(run(&idx, &log(true), true), vec![trace]);
280        // Without files_only the dir qualifies too.
281        assert_eq!(run(&idx, &log(false), true), vec![dir, trace]);
282        // skip_excluded=false surfaces the hidden file (tombstone still out).
283        assert_eq!(run(&idx, &log(true), false), vec![trace, ghost]);
284        // Multiple suffixes union within one pass.
285        let multi = Driver::Suffixes {
286            suffixes: vec![b".log".to_vec(), b".txt".to_vec()],
287            files_only: true,
288        };
289        assert_eq!(run(&idx, &multi, true), vec![trace, notes]);
290    }
291}