Skip to main content

fmf_core\query/
compile.rs

1//! AST → compiled execution plan. Each AND group gets a *driver* — the most
2//! selective positive literal, executed as a single SIMD sweep over the name
3//! pool — plus residual matchers ordered by evaluation cost (numeric filters
4//! → memmem → regex → path).
5
6use memchr::memmem;
7use regex::bytes::{Regex, RegexBuilder};
8use thiserror::Error;
9
10use super::ast::{Ast, Term};
11use super::dates::DateResolver;
12use crate::wtf8;
13
14// The case mode / regex scope are contract surface (FmfQueryOptions carries
15// them as u32) — the canonical definitions are used directly (ADR-0018).
16pub use fmf_contract::options::{CaseMode, RegexScope};
17
18/// Why a query failed to compile into an executable plan.
19#[derive(Debug, Error)]
20pub enum CompileError {
21    /// A `regex:`/`path:`-regex (or whole-query) pattern is invalid syntax or
22    /// exceeds the compile size limit (`REGEX_SIZE_LIMIT`, ADR-0023).
23    #[error("invalid regex `{pattern}`: {source}")]
24    Regex {
25        /// The offending pattern text, as written in the query.
26        pattern: String,
27        /// The underlying error from the `regex` crate's builder.
28        source: regex::Error,
29    },
30}
31
32pub(super) enum Matcher {
33    /// Empty needle — matches everything.
34    True,
35    /// Substring in the name. `folded` selects the lower pool + folded needle.
36    NameSub {
37        finder: memmem::Finder<'static>,
38        folded: bool,
39    },
40    /// Name starts with the bytes (`lit*`).
41    NamePrefix {
42        bytes: Vec<u8>,
43        folded: bool,
44    },
45    /// Name ends with the bytes (`*.lit`).
46    NameSuffix {
47        bytes: Vec<u8>,
48        folded: bool,
49    },
50    /// Substring in the full path.
51    PathSub {
52        finder: memmem::Finder<'static>,
53        folded: bool,
54    },
55    /// Anchored wildcard or user regex over the (original) name bytes.
56    NameRegex {
57        re: Regex,
58    },
59    /// Unanchored wildcard/regex over the (original) full-path bytes.
60    PathRegex {
61        re: Regex,
62    },
63    /// Extension equals any of these folded byte strings.
64    Ext {
65        exts: Vec<Vec<u8>>,
66    },
67    Size {
68        min: u64,
69        max: u64,
70    },
71    /// Inclusive FILETIME tick range.
72    Mtime {
73        min: i64,
74        max: i64,
75    },
76    IsDir(bool),
77}
78
79impl Matcher {
80    const fn cost(&self) -> u8 {
81        match self {
82            Self::True | Self::Size { .. } | Self::Mtime { .. } | Self::IsDir(_) => 0,
83            Self::Ext { .. } | Self::NamePrefix { .. } | Self::NameSuffix { .. } => 1,
84            Self::NameSub { .. } => 2,
85            Self::NameRegex { .. } => 3,
86            Self::PathSub { .. } => 4,
87            Self::PathRegex { .. } => 5,
88        }
89    }
90
91    const fn needs_folded_path(&self) -> bool {
92        matches!(self, Self::PathSub { folded: true, .. })
93    }
94
95    const fn needs_orig_path(&self) -> bool {
96        matches!(
97            self,
98            Self::PathSub { folded: false, .. } | Self::PathRegex { .. }
99        )
100    }
101}
102
103pub(super) struct CTerm {
104    pub negated: bool,
105    pub matcher: Matcher,
106    /// Derived for case-exact name literals: the needle is *not* its own
107    /// fold (it contains an uppercase/foldable character). Such a needle
108    /// can never occur in a fold-identical name — the matcher's O(1)
109    /// reject (matchers.rs, ADR-0004).
110    pub exact_needle_unstable: bool,
111}
112
113/// Candidate generator for one AND group — a single sweep over the folded
114/// name pool (the only contiguous one) instead of a per-entry matcher call.
115/// Needles are always folded; a case-exact source term makes the sweep a
116/// superset and its exact comparison runs as a residual
117/// (`CompiledGroup::driver_exact`).
118// The Finder-carrying variants dwarf the unit ones; boxing would add an
119// indirection to the hottest call in the engine for no measurable win.
120#[allow(clippy::large_enum_variant)]
121pub(super) enum Driver {
122    /// No usable positive literal: evaluate every entry.
123    FullScan,
124    /// Group has no terms at all (empty query / bare `folder:`-less group).
125    MatchAll,
126    Sub {
127        finder: memmem::Finder<'static>,
128        needle_len: usize,
129    },
130    Prefix {
131        bytes: Vec<u8>,
132    },
133    Suffixes {
134        suffixes: Vec<Vec<u8>>,
135        files_only: bool,
136    },
137}
138
139impl Driver {
140    pub(super) const fn label(&self) -> &'static str {
141        match self {
142            Self::FullScan => "full-scan",
143            Self::MatchAll => "match-all",
144            Self::Sub { .. } => "pool-scan",
145            Self::Prefix { .. } => "prefix",
146            Self::Suffixes { .. } => "suffix",
147        }
148    }
149}
150
151pub(super) struct CompiledGroup {
152    pub driver: Driver,
153    /// Residual matchers (cost-ordered); the driver's own condition is fully
154    /// checked by the sweep and removed from here.
155    pub terms: Vec<CTerm>,
156    /// The term the driver was built from (None for FullScan/MatchAll).
157    /// The sweep never reads it — it exists so cached-query refinement can
158    /// re-evaluate the *complete* group per candidate (`exec::refine`), so
159    /// subsumption sees every condition (subsume.rs), and so the exec can
160    /// verify it per candidate when the sweep was a superset (below).
161    pub driver_term: Option<CTerm>,
162    /// False when the source term is case-exact: the folded sweep then
163    /// over-approximates and `driver_term` must be verified per candidate.
164    pub driver_exact: bool,
165}
166
167impl CompiledGroup {
168    /// Every condition of this AND group: the driver's source term (most
169    /// selective, so first) followed by the cost-ordered residuals.
170    pub(super) fn all_terms(&self) -> impl Iterator<Item = &CTerm> {
171        self.driver_term.iter().chain(self.terms.iter())
172    }
173
174    /// The conditions the sweep did *not* fully check: the residuals, plus
175    /// the driver's source term when the sweep was a folded superset.
176    pub(super) fn residual_terms(&self) -> impl Iterator<Item = &CTerm> {
177        self.driver_term
178            .iter()
179            .filter(|_| !self.driver_exact)
180            .chain(self.terms.iter())
181    }
182}
183
184/// An executable plan: one compiled AND group per OR clause, plus the path
185/// pools the sweep must materialize to evaluate them.
186pub struct CompiledQuery {
187    pub(super) groups: Vec<CompiledGroup>,
188    pub(super) needs_folded_paths: bool,
189    pub(super) needs_orig_paths: bool,
190}
191
192/// Smart-case decision for one needle.
193fn insensitive(needle: &str, case: CaseMode) -> bool {
194    match case {
195        CaseMode::Insensitive => true,
196        CaseMode::Sensitive => false,
197        CaseMode::Smart => !wtf8::has_uppercase(needle),
198    }
199}
200
201fn fold_needle(needle: &str, case: CaseMode) -> (Vec<u8>, bool) {
202    if insensitive(needle, case) {
203        (wtf8::fold_str(needle).into_bytes(), true)
204    } else {
205        (needle.as_bytes().to_vec(), false)
206    }
207}
208
209fn substring_finder(needle: &str, case: CaseMode) -> (memmem::Finder<'static>, bool) {
210    let (bytes, folded) = fold_needle(needle, case);
211    (memmem::Finder::new(&bytes).into_owned(), folded)
212}
213
214/// `lit*` / `*lit` / `*lit*` style patterns collapse to anchored byte
215/// comparisons; everything else stays a regex.
216enum WildShape {
217    Prefix(String),
218    Suffix(String),
219    Inner(String),
220    General,
221}
222
223fn classify_wildcard(pattern: &str) -> WildShape {
224    if pattern.contains('?') {
225        return WildShape::General;
226    }
227    let starts = pattern.starts_with('*');
228    let ends = pattern.ends_with('*');
229    let inner = pattern.trim_matches('*');
230    if inner.contains('*') || inner.is_empty() {
231        return WildShape::General; // "a*b", "**", "*"
232    }
233    match (starts, ends) {
234        (true, true) => WildShape::Inner(inner.to_string()),
235        (true, false) => WildShape::Suffix(inner.to_string()),
236        (false, true) => WildShape::Prefix(inner.to_string()),
237        (false, false) => WildShape::General, // no '*' at all → parser bug
238    }
239}
240
241/// Translate a `*`/`?` pattern into a regex body (no anchors).
242fn wildcard_to_regex_body(pattern: &str) -> String {
243    let mut out = String::with_capacity(pattern.len() * 2);
244    for c in pattern.chars() {
245        match c {
246            '*' => out.push_str(".*"),
247            '?' => out.push('.'),
248            c => out.push_str(&regex::escape(&c.to_string())),
249        }
250    }
251    out
252}
253
254/// Compile-time bounds on a user regex (ADR-0023). The `regex` crate matches
255/// in guaranteed linear time (finite automata, no backtracking) — so there is
256/// no `ReDoS` *execution* blowup — but a pathological pattern can still demand a
257/// large program/DFA at *build* time. We index file names only (p99 ≈110 B),
258/// so a legitimate name regex never approaches 1 MiB; capping there turns a
259/// memory-DoS pattern into a clean `CompiledTooBig` → `FMF_E_QUERY_SYNTAX`
260/// rejection (it flows through `CompileError::Regex` unchanged), instead of
261/// letting the elevated service compile it. Both default higher (10/2 MiB).
262const REGEX_SIZE_LIMIT: usize = 1 << 20;
263const REGEX_DFA_SIZE_LIMIT: usize = 1 << 20;
264
265fn build_regex(body: &str, ci: bool, pattern_for_err: &str) -> Result<Regex, CompileError> {
266    RegexBuilder::new(body)
267        .case_insensitive(ci)
268        .dot_matches_new_line(true)
269        .size_limit(REGEX_SIZE_LIMIT)
270        .dfa_size_limit(REGEX_DFA_SIZE_LIMIT)
271        .build()
272        .map_err(|source| CompileError::Regex {
273            pattern: pattern_for_err.to_string(),
274            source,
275        })
276}
277
278fn compile_term(
279    term: &Term,
280    case: CaseMode,
281    dates: &dyn DateResolver,
282) -> Result<CTerm, CompileError> {
283    let (negated, term) = match term {
284        Term::Not(inner) => (true, inner.as_ref()),
285        t => (false, t),
286    };
287
288    let matcher = match term {
289        Term::Name(s) if s.is_empty() => Matcher::True,
290        Term::Name(s) => {
291            let (finder, folded) = substring_finder(s, case);
292            Matcher::NameSub { finder, folded }
293        }
294        Term::Path(s) => {
295            let (finder, folded) = substring_finder(s, case);
296            Matcher::PathSub { finder, folded }
297        }
298        Term::Wildcard(s) => match classify_wildcard(s) {
299            WildShape::Prefix(lit) => {
300                let (bytes, folded) = fold_needle(&lit, case);
301                Matcher::NamePrefix { bytes, folded }
302            }
303            WildShape::Suffix(lit) => {
304                let (bytes, folded) = fold_needle(&lit, case);
305                Matcher::NameSuffix { bytes, folded }
306            }
307            WildShape::Inner(lit) => {
308                let (finder, folded) = substring_finder(&lit, case);
309                Matcher::NameSub { finder, folded }
310            }
311            WildShape::General => {
312                let body = format!("^{}$", wildcard_to_regex_body(s));
313                Matcher::NameRegex {
314                    re: build_regex(&body, insensitive(s, case), s)?,
315                }
316            }
317        },
318        Term::PathWildcard(s) => Matcher::PathRegex {
319            re: build_regex(&wildcard_to_regex_body(s), insensitive(s, case), s)?,
320        },
321        Term::Regex(s) => Matcher::NameRegex {
322            re: build_regex(s, insensitive(s, case), s)?,
323        },
324        Term::Ext(exts) => Matcher::Ext {
325            exts: exts
326                .iter()
327                .map(|e| wtf8::fold_str(e).into_bytes())
328                .collect(),
329        },
330        Term::Size { min, max } => Matcher::Size {
331            min: *min,
332            max: *max,
333        },
334        // [start, end) at local midnight → inclusive tick range.
335        Term::Mtime { start, end } => Matcher::Mtime {
336            min: start.map_or(i64::MIN, |c| dates.filetime_at_midnight(c)),
337            max: end.map_or(i64::MAX, |c| {
338                dates.filetime_at_midnight(c).saturating_sub(1)
339            }),
340        },
341        Term::IsDir(d) => Matcher::IsDir(*d),
342        Term::Not(_) => unreachable!("nested Not is flattened by the parser"),
343    };
344
345    let unstable = |bytes: &[u8]| {
346        let s = std::str::from_utf8(bytes).expect("query needles are valid UTF-8");
347        wtf8::has_uppercase(s)
348    };
349    let exact_needle_unstable = match &matcher {
350        Matcher::NameSub {
351            finder,
352            folded: false,
353        } => unstable(finder.needle()),
354        Matcher::NamePrefix {
355            bytes,
356            folded: false,
357        }
358        | Matcher::NameSuffix {
359            bytes,
360            folded: false,
361        } => unstable(bytes),
362        _ => false,
363    };
364    Ok(CTerm {
365        negated,
366        matcher,
367        exact_needle_unstable,
368    })
369}
370
371/// Driver candidate score — longer literals are more selective. Returns
372/// None for matchers that cannot drive a pool sweep.
373fn driver_score(t: &CTerm) -> Option<usize> {
374    if t.negated {
375        return None;
376    }
377    match &t.matcher {
378        Matcher::NameSub { finder, .. } => Some(finder.needle().len() * 2),
379        Matcher::NamePrefix { bytes, .. } | Matcher::NameSuffix { bytes, .. } => {
380            Some(bytes.len() * 2)
381        }
382        // The sweep needle is ".<ext>" — score like the other literals.
383        Matcher::Ext { exts } if !exts.is_empty() => {
384            Some(exts.iter().map(|e| (e.len() + 1) * 2).min().unwrap_or(0))
385        }
386        _ => None,
387    }
388}
389
390/// Fold a case-exact needle for the superset sweep. Needles always
391/// originate from the query `&str`, so the bytes are valid UTF-8; the
392/// fold's length preservation keeps prefix/suffix anchors sound.
393fn fold_exact_needle(bytes: &[u8]) -> Vec<u8> {
394    let s = std::str::from_utf8(bytes).expect("query needles are valid UTF-8");
395    wtf8::fold_str(s).into_bytes()
396}
397
398/// Build the sweep driver from a term, leaving the term intact (kept as
399/// `CompiledGroup::driver_term`). Returns the driver and whether it fully
400/// checks the term — false for a case-exact term: the sweep folds its
401/// needle (sound: an original-case match always implies the folded match)
402/// and the exact comparison runs as a residual.
403fn driver_for(t: &CTerm) -> (Driver, bool) {
404    match &t.matcher {
405        Matcher::NameSub { finder, folded } => {
406            let needle = if *folded {
407                finder.needle().to_vec()
408            } else {
409                fold_exact_needle(finder.needle())
410            };
411            (
412                Driver::Sub {
413                    needle_len: needle.len(),
414                    finder: memmem::Finder::new(&needle).into_owned(),
415                },
416                *folded,
417            )
418        }
419        Matcher::NamePrefix { bytes, folded } => (
420            Driver::Prefix {
421                bytes: if *folded {
422                    bytes.clone()
423                } else {
424                    fold_exact_needle(bytes)
425                },
426            },
427            *folded,
428        ),
429        Matcher::NameSuffix { bytes, folded } => (
430            Driver::Suffixes {
431                suffixes: vec![if *folded {
432                    bytes.clone()
433                } else {
434                    fold_exact_needle(bytes)
435                }],
436                files_only: false,
437            },
438            *folded,
439        ),
440        Matcher::Ext { exts } => (
441            Driver::Suffixes {
442                suffixes: exts
443                    .iter()
444                    .map(|e| {
445                        let mut s = Vec::with_capacity(e.len() + 1);
446                        s.push(b'.');
447                        s.extend_from_slice(e);
448                        s
449                    })
450                    .collect(),
451                files_only: true,
452            },
453            true,
454        ),
455        _ => unreachable!("driver_score gated"),
456    }
457}
458
459/// Kill switch for the regex literal prefilter (`FMF_REGEX_PREFILTER=0`) —
460/// forces literal-less *and* literal-bearing regex groups onto the chunked
461/// full scan. A field escape hatch if a prefilter soundness bug ever
462/// surfaces (the same shape as `FMF_QUERY_CACHE`, ADR-0023).
463fn regex_prefilter_enabled() -> bool {
464    static ENABLED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
465    *ENABLED.get_or_init(|| std::env::var("FMF_REGEX_PREFILTER").map_or(true, |v| v != "0"))
466}
467
468/// Extract a *required* literal factor from a name regex and turn it into a
469/// folded-pool substring sweep — the same linear sweep every literal query
470/// uses (ADR-0002), so regex stays off the full scan without any standing
471/// index (ADR-0023).
472///
473/// Soundness: regex-syntax prefix (resp. suffix) extraction yields literals
474/// that every match must begin (resp. end) with; the longest common
475/// prefix/suffix `S` of that set is therefore present, contiguously, in
476/// every matched substring — hence in the name. Folding `S` and sweeping the
477/// (folded) lower pool is a superset for both case modes (an original-case
478/// occurrence implies the folded one, length-preserving per code point), and
479/// the `NameRegex` residual re-checks every candidate exactly. Returns `None`
480/// when no usable literal exists (`\d+`, a leading `.*`, an alternation with
481/// no common factor); the caller then falls back to a full scan.
482fn regex_name_prefilter(re: &Regex) -> Option<Driver> {
483    use regex_syntax::hir::literal::{ExtractKind, Extractor};
484
485    let hir = regex_syntax::parse(re.as_str()).ok()?;
486    let factor = |kind: ExtractKind| -> Option<Vec<u8>> {
487        let is_suffix = matches!(kind, ExtractKind::Suffix);
488        let mut ex = Extractor::new();
489        ex.kind(kind);
490        let seq = ex.extract(&hir);
491        let bytes = if is_suffix {
492            seq.longest_common_suffix()
493        } else {
494            seq.longest_common_prefix()
495        }?;
496        // A common factor that splits a multi-byte code point is unusable as
497        // a folded needle; bail to a full scan rather than fold garbage.
498        let folded = wtf8::fold_str(std::str::from_utf8(bytes).ok()?).into_bytes();
499        // A 1-byte needle hits nearly every name — the per-hit sweep
500        // bookkeeping then loses to a plain full scan (the `score >= 4` gate).
501        (folded.len() >= 2).then_some(folded)
502    };
503
504    // Prefer the longer required factor; both map to a sound substring sweep.
505    let needle = [ExtractKind::Prefix, ExtractKind::Suffix]
506        .into_iter()
507        .filter_map(factor)
508        .max_by_key(Vec::len)?;
509    Some(Driver::Sub {
510        needle_len: needle.len(),
511        finder: memmem::Finder::new(&needle).into_owned(),
512    })
513}
514
515/// When a group has no literal driver, try to drive it from a positive name
516/// regex's required literal. The regex matcher stays in `terms` as the
517/// residual that confirms each candidate, so `driver_term` is `None`.
518fn regex_prefilter_driver(terms: &[CTerm]) -> Driver {
519    if !regex_prefilter_enabled() {
520        return Driver::FullScan;
521    }
522    terms
523        .iter()
524        .filter(|t| !t.negated)
525        .find_map(|t| match &t.matcher {
526            Matcher::NameRegex { re } => regex_name_prefilter(re),
527            _ => None,
528        })
529        .unwrap_or(Driver::FullScan)
530}
531
532/// Compile the *entire* query text as one regex (whole-query regex mode).
533///
534/// No parsing, no operators (ADR-0023) — the text is the pattern, matched
535/// against the file name or the full path per `scope`. Name scope reuses the
536/// literal prefilter; path scope falls back to a full scan (the path pool is
537/// not contiguous). One AND group, the regex left as the residual.
538///
539/// # Errors
540///
541/// Returns [`CompileError::Regex`] if `text` is not a valid regex or exceeds
542/// the compile size limit.
543pub fn compile_whole_regex(
544    text: &str,
545    case: CaseMode,
546    scope: RegexScope,
547) -> Result<CompiledQuery, CompileError> {
548    let re = build_regex(text, insensitive(text, case), text)?;
549    let (matcher, needs_orig_paths) = match scope {
550        RegexScope::Name => (Matcher::NameRegex { re }, false),
551        RegexScope::Path => (Matcher::PathRegex { re }, true),
552    };
553    let term = CTerm {
554        negated: false,
555        matcher,
556        exact_needle_unstable: false,
557    };
558    let driver = match scope {
559        RegexScope::Name => regex_prefilter_driver(std::slice::from_ref(&term)),
560        RegexScope::Path => Driver::FullScan,
561    };
562    Ok(CompiledQuery {
563        groups: vec![CompiledGroup {
564            driver,
565            terms: vec![term],
566            driver_term: None,
567            driver_exact: true,
568        }],
569        needs_folded_paths: false,
570        needs_orig_paths,
571    })
572}
573
574/// Compile a parsed [`Ast`] into an executable [`CompiledQuery`].
575///
576/// # Errors
577///
578/// Returns [`CompileError::Regex`] if a `regex:`/`path:`-regex term fails to
579/// compile.
580pub fn compile(
581    ast: &Ast,
582    case: CaseMode,
583    dates: &dyn DateResolver,
584) -> Result<CompiledQuery, CompileError> {
585    let mut groups = Vec::with_capacity(ast.groups.len());
586    for g in &ast.groups {
587        let mut terms = Vec::with_capacity(g.len());
588        for t in g {
589            terms.push(compile_term(t, case, dates)?);
590        }
591
592        // Pick the most selective positive literal as the driver and pull it
593        // out of the residual list. Empty needles (Matcher::True) never score.
594        let mut driver_term = None;
595        let mut driver_exact = true;
596        let driver = if terms.is_empty() {
597            Driver::MatchAll
598        } else {
599            let best = terms
600                .iter()
601                .enumerate()
602                .filter_map(|(i, t)| driver_score(t).map(|s| (s, i)))
603                .max_by_key(|(s, _)| *s);
604            // Single-byte needles hit nearly every entry — the per-hit sweep
605            // bookkeeping then costs more than a plain full scan does.
606            match best {
607                Some((score, i)) if score >= 4 => {
608                    let t = terms.swap_remove(i);
609                    let (d, exact) = driver_for(&t);
610                    driver_term = Some(t);
611                    driver_exact = exact;
612                    d
613                }
614                // No usable literal driver. A positive name regex can still
615                // narrow via its required literal (the regex stays a residual
616                // — driver_term None, driver_exact irrelevant); otherwise full
617                // scan.
618                _ => regex_prefilter_driver(&terms),
619            }
620        };
621
622        terms.sort_by_key(|t| t.matcher.cost());
623        groups.push(CompiledGroup {
624            driver,
625            terms,
626            driver_term,
627            driver_exact,
628        });
629    }
630
631    let needs_folded_paths = groups
632        .iter()
633        .flat_map(|g| &g.terms)
634        .any(|t| t.matcher.needs_folded_path());
635    let needs_orig_paths = groups
636        .iter()
637        .flat_map(|g| &g.terms)
638        .any(|t| t.matcher.needs_orig_path());
639    Ok(CompiledQuery {
640        groups,
641        needs_folded_paths,
642        needs_orig_paths,
643    })
644}
645
646impl CompiledQuery {
647    /// Human-readable driver summary for `QueryTrace`.
648    #[must_use]
649    pub fn driver_label(&self) -> String {
650        let mut labels: Vec<&str> = self.groups.iter().map(|g| g.driver.label()).collect();
651        labels.dedup();
652        labels.join("+")
653    }
654}
655
656#[cfg(test)]
657mod tests {
658    use super::super::parse;
659    use super::*;
660
661    fn prefilter_needle(pattern: &str) -> Option<Vec<u8>> {
662        let re = build_regex(pattern, false, pattern).unwrap();
663        match regex_name_prefilter(&re) {
664            Some(Driver::Sub { finder, .. }) => Some(finder.needle().to_vec()),
665            Some(_) => panic!("regex prefilter must only produce a Sub driver"),
666            None => None,
667        }
668    }
669
670    #[test]
671    fn regex_prefilter_extracts_required_literal() {
672        // Leading literal → prefix factor.
673        assert_eq!(prefilter_needle("^report"), Some(b"report".to_vec()));
674        assert_eq!(prefilter_needle("windows.*"), Some(b"windows".to_vec()));
675        // Trailing-anchored literal → suffix factor (the prefix is `.*`).
676        assert_eq!(prefilter_needle(r".*\.dll"), Some(b".dll".to_vec()));
677        assert_eq!(prefilter_needle(r"\.rs$"), Some(b".rs".to_vec()));
678        // Folded for the lower-pool sweep; the case-sensitive residual still
679        // re-checks each candidate.
680        assert_eq!(prefilter_needle("^Report"), Some(b"report".to_vec()));
681    }
682
683    #[test]
684    fn regex_prefilter_declines_without_a_usable_literal() {
685        assert_eq!(prefilter_needle(r"\d+"), None);
686        assert_eq!(prefilter_needle(".*"), None);
687        assert_eq!(
688            prefilter_needle("a"),
689            None,
690            "1-byte literal is not selective"
691        );
692        assert_eq!(
693            prefilter_needle("dll|exe"),
694            None,
695            "no common factor across the alternation"
696        );
697    }
698
699    #[test]
700    fn regex_only_group_drives_a_pool_scan() {
701        // A pure name-regex group with a literal must leave the full scan
702        // behind: a Sub driver, no driver_term (the regex stays the residual).
703        let ast = parse("regex:^report").unwrap();
704        let q = compile(&ast, CaseMode::Smart, &super::super::dates::UtcResolver).unwrap();
705        let g = &q.groups[0];
706        assert!(matches!(g.driver, Driver::Sub { .. }), "expected pool-scan");
707        assert!(g.driver_term.is_none(), "regex must remain a residual");
708        assert!(
709            g.terms
710                .iter()
711                .any(|t| matches!(t.matcher, Matcher::NameRegex { .. })),
712            "the regex residual confirms each candidate"
713        );
714
715        // A literal-less regex stays on the full scan.
716        let ast = parse(r"regex:\d+").unwrap();
717        let q = compile(&ast, CaseMode::Smart, &super::super::dates::UtcResolver).unwrap();
718        assert!(matches!(q.groups[0].driver, Driver::FullScan));
719    }
720
721    #[test]
722    fn oversized_regex_is_rejected_not_compiled() {
723        // A pattern that demands a >1 MiB program must come back as a clean
724        // CompileError (→ FMF_E_QUERY_SYNTAX), never a panic or an OOM. The
725        // bounded-repetition blowup unrolls past REGEX_SIZE_LIMIT.
726        let ast = parse(r"regex:(a{500}){500}").unwrap();
727        let result = compile(&ast, CaseMode::Smart, &super::super::dates::UtcResolver);
728        assert!(
729            matches!(result, Err(CompileError::Regex { .. })),
730            "a 1 MiB+ regex program must be refused, not compiled"
731        );
732    }
733}
734
735#[cfg(test)]
736mod proptests {
737    use proptest::{prop_assert, proptest};
738
739    use super::super::{dates::UtcResolver, parse};
740    use super::*;
741
742    proptest! {
743        // Compiling a `regex:` term must never panic and never OOM, whatever
744        // the pattern: it returns Ok (a built matcher) or a CompileError
745        // (invalid syntax or over the size limit). Biased to the regex
746        // metacharacter alphabet so the build paths get dense coverage.
747        #[test]
748        fn regex_compile_is_panic_free_and_bounded(
749            body in r"[a-z0-9()\[\]{}.*+?^$|\\]{0,40}"
750        ) {
751            let text = format!("regex:\"{body}\"");
752            if let Ok(ast) = parse(&text) {
753                // Ok or Err — both are acceptable; the property is "no panic".
754                let _ = compile(&ast, CaseMode::Smart, &UtcResolver);
755                prop_assert!(true);
756            }
757        }
758    }
759}