1use memchr::memmem;
7use regex::bytes::{Regex, RegexBuilder};
8use thiserror::Error;
9
10use super::ast::{Ast, Term};
11use super::dates::DateResolver;
12use crate::wtf8;
13
14pub use fmf_contract::options::{CaseMode, RegexScope};
17
18#[derive(Debug, Error)]
20pub enum CompileError {
21 #[error("invalid regex `{pattern}`: {source}")]
24 Regex {
25 pattern: String,
27 source: regex::Error,
29 },
30}
31
32pub(super) enum Matcher {
33 True,
35 NameSub {
37 finder: memmem::Finder<'static>,
38 folded: bool,
39 },
40 NamePrefix {
42 bytes: Vec<u8>,
43 folded: bool,
44 },
45 NameSuffix {
47 bytes: Vec<u8>,
48 folded: bool,
49 },
50 PathSub {
52 finder: memmem::Finder<'static>,
53 folded: bool,
54 },
55 NameRegex {
57 re: Regex,
58 },
59 PathRegex {
61 re: Regex,
62 },
63 Ext {
65 exts: Vec<Vec<u8>>,
66 },
67 Size {
68 min: u64,
69 max: u64,
70 },
71 Mtime {
73 min: i64,
74 max: i64,
75 },
76 IsDir(bool),
77}
78
79impl Matcher {
80 const fn cost(&self) -> u8 {
81 match self {
82 Self::True | Self::Size { .. } | Self::Mtime { .. } | Self::IsDir(_) => 0,
83 Self::Ext { .. } | Self::NamePrefix { .. } | Self::NameSuffix { .. } => 1,
84 Self::NameSub { .. } => 2,
85 Self::NameRegex { .. } => 3,
86 Self::PathSub { .. } => 4,
87 Self::PathRegex { .. } => 5,
88 }
89 }
90
91 const fn needs_folded_path(&self) -> bool {
92 matches!(self, Self::PathSub { folded: true, .. })
93 }
94
95 const fn needs_orig_path(&self) -> bool {
96 matches!(
97 self,
98 Self::PathSub { folded: false, .. } | Self::PathRegex { .. }
99 )
100 }
101}
102
103pub(super) struct CTerm {
104 pub negated: bool,
105 pub matcher: Matcher,
106 pub exact_needle_unstable: bool,
111}
112
113#[allow(clippy::large_enum_variant)]
121pub(super) enum Driver {
122 FullScan,
124 MatchAll,
126 Sub {
127 finder: memmem::Finder<'static>,
128 needle_len: usize,
129 },
130 Prefix {
131 bytes: Vec<u8>,
132 },
133 Suffixes {
134 suffixes: Vec<Vec<u8>>,
135 files_only: bool,
136 },
137}
138
139impl Driver {
140 pub(super) const fn label(&self) -> &'static str {
141 match self {
142 Self::FullScan => "full-scan",
143 Self::MatchAll => "match-all",
144 Self::Sub { .. } => "pool-scan",
145 Self::Prefix { .. } => "prefix",
146 Self::Suffixes { .. } => "suffix",
147 }
148 }
149}
150
151pub(super) struct CompiledGroup {
152 pub driver: Driver,
153 pub terms: Vec<CTerm>,
156 pub driver_term: Option<CTerm>,
162 pub driver_exact: bool,
165}
166
167impl CompiledGroup {
168 pub(super) fn all_terms(&self) -> impl Iterator<Item = &CTerm> {
171 self.driver_term.iter().chain(self.terms.iter())
172 }
173
174 pub(super) fn residual_terms(&self) -> impl Iterator<Item = &CTerm> {
177 self.driver_term
178 .iter()
179 .filter(|_| !self.driver_exact)
180 .chain(self.terms.iter())
181 }
182}
183
184pub struct CompiledQuery {
187 pub(super) groups: Vec<CompiledGroup>,
188 pub(super) needs_folded_paths: bool,
189 pub(super) needs_orig_paths: bool,
190}
191
192fn insensitive(needle: &str, case: CaseMode) -> bool {
194 match case {
195 CaseMode::Insensitive => true,
196 CaseMode::Sensitive => false,
197 CaseMode::Smart => !wtf8::has_uppercase(needle),
198 }
199}
200
201fn fold_needle(needle: &str, case: CaseMode) -> (Vec<u8>, bool) {
202 if insensitive(needle, case) {
203 (wtf8::fold_str(needle).into_bytes(), true)
204 } else {
205 (needle.as_bytes().to_vec(), false)
206 }
207}
208
209fn substring_finder(needle: &str, case: CaseMode) -> (memmem::Finder<'static>, bool) {
210 let (bytes, folded) = fold_needle(needle, case);
211 (memmem::Finder::new(&bytes).into_owned(), folded)
212}
213
214enum WildShape {
217 Prefix(String),
218 Suffix(String),
219 Inner(String),
220 General,
221}
222
223fn classify_wildcard(pattern: &str) -> WildShape {
224 if pattern.contains('?') {
225 return WildShape::General;
226 }
227 let starts = pattern.starts_with('*');
228 let ends = pattern.ends_with('*');
229 let inner = pattern.trim_matches('*');
230 if inner.contains('*') || inner.is_empty() {
231 return WildShape::General; }
233 match (starts, ends) {
234 (true, true) => WildShape::Inner(inner.to_string()),
235 (true, false) => WildShape::Suffix(inner.to_string()),
236 (false, true) => WildShape::Prefix(inner.to_string()),
237 (false, false) => WildShape::General, }
239}
240
241fn wildcard_to_regex_body(pattern: &str) -> String {
243 let mut out = String::with_capacity(pattern.len() * 2);
244 for c in pattern.chars() {
245 match c {
246 '*' => out.push_str(".*"),
247 '?' => out.push('.'),
248 c => out.push_str(®ex::escape(&c.to_string())),
249 }
250 }
251 out
252}
253
254const REGEX_SIZE_LIMIT: usize = 1 << 20;
263const REGEX_DFA_SIZE_LIMIT: usize = 1 << 20;
264
265fn build_regex(body: &str, ci: bool, pattern_for_err: &str) -> Result<Regex, CompileError> {
266 RegexBuilder::new(body)
267 .case_insensitive(ci)
268 .dot_matches_new_line(true)
269 .size_limit(REGEX_SIZE_LIMIT)
270 .dfa_size_limit(REGEX_DFA_SIZE_LIMIT)
271 .build()
272 .map_err(|source| CompileError::Regex {
273 pattern: pattern_for_err.to_string(),
274 source,
275 })
276}
277
278fn compile_term(
279 term: &Term,
280 case: CaseMode,
281 dates: &dyn DateResolver,
282) -> Result<CTerm, CompileError> {
283 let (negated, term) = match term {
284 Term::Not(inner) => (true, inner.as_ref()),
285 t => (false, t),
286 };
287
288 let matcher = match term {
289 Term::Name(s) if s.is_empty() => Matcher::True,
290 Term::Name(s) => {
291 let (finder, folded) = substring_finder(s, case);
292 Matcher::NameSub { finder, folded }
293 }
294 Term::Path(s) => {
295 let (finder, folded) = substring_finder(s, case);
296 Matcher::PathSub { finder, folded }
297 }
298 Term::Wildcard(s) => match classify_wildcard(s) {
299 WildShape::Prefix(lit) => {
300 let (bytes, folded) = fold_needle(&lit, case);
301 Matcher::NamePrefix { bytes, folded }
302 }
303 WildShape::Suffix(lit) => {
304 let (bytes, folded) = fold_needle(&lit, case);
305 Matcher::NameSuffix { bytes, folded }
306 }
307 WildShape::Inner(lit) => {
308 let (finder, folded) = substring_finder(&lit, case);
309 Matcher::NameSub { finder, folded }
310 }
311 WildShape::General => {
312 let body = format!("^{}$", wildcard_to_regex_body(s));
313 Matcher::NameRegex {
314 re: build_regex(&body, insensitive(s, case), s)?,
315 }
316 }
317 },
318 Term::PathWildcard(s) => Matcher::PathRegex {
319 re: build_regex(&wildcard_to_regex_body(s), insensitive(s, case), s)?,
320 },
321 Term::Regex(s) => Matcher::NameRegex {
322 re: build_regex(s, insensitive(s, case), s)?,
323 },
324 Term::Ext(exts) => Matcher::Ext {
325 exts: exts
326 .iter()
327 .map(|e| wtf8::fold_str(e).into_bytes())
328 .collect(),
329 },
330 Term::Size { min, max } => Matcher::Size {
331 min: *min,
332 max: *max,
333 },
334 Term::Mtime { start, end } => Matcher::Mtime {
336 min: start.map_or(i64::MIN, |c| dates.filetime_at_midnight(c)),
337 max: end.map_or(i64::MAX, |c| {
338 dates.filetime_at_midnight(c).saturating_sub(1)
339 }),
340 },
341 Term::IsDir(d) => Matcher::IsDir(*d),
342 Term::Not(_) => unreachable!("nested Not is flattened by the parser"),
343 };
344
345 let unstable = |bytes: &[u8]| {
346 let s = std::str::from_utf8(bytes).expect("query needles are valid UTF-8");
347 wtf8::has_uppercase(s)
348 };
349 let exact_needle_unstable = match &matcher {
350 Matcher::NameSub {
351 finder,
352 folded: false,
353 } => unstable(finder.needle()),
354 Matcher::NamePrefix {
355 bytes,
356 folded: false,
357 }
358 | Matcher::NameSuffix {
359 bytes,
360 folded: false,
361 } => unstable(bytes),
362 _ => false,
363 };
364 Ok(CTerm {
365 negated,
366 matcher,
367 exact_needle_unstable,
368 })
369}
370
371fn driver_score(t: &CTerm) -> Option<usize> {
374 if t.negated {
375 return None;
376 }
377 match &t.matcher {
378 Matcher::NameSub { finder, .. } => Some(finder.needle().len() * 2),
379 Matcher::NamePrefix { bytes, .. } | Matcher::NameSuffix { bytes, .. } => {
380 Some(bytes.len() * 2)
381 }
382 Matcher::Ext { exts } if !exts.is_empty() => {
384 Some(exts.iter().map(|e| (e.len() + 1) * 2).min().unwrap_or(0))
385 }
386 _ => None,
387 }
388}
389
390fn fold_exact_needle(bytes: &[u8]) -> Vec<u8> {
394 let s = std::str::from_utf8(bytes).expect("query needles are valid UTF-8");
395 wtf8::fold_str(s).into_bytes()
396}
397
398fn driver_for(t: &CTerm) -> (Driver, bool) {
404 match &t.matcher {
405 Matcher::NameSub { finder, folded } => {
406 let needle = if *folded {
407 finder.needle().to_vec()
408 } else {
409 fold_exact_needle(finder.needle())
410 };
411 (
412 Driver::Sub {
413 needle_len: needle.len(),
414 finder: memmem::Finder::new(&needle).into_owned(),
415 },
416 *folded,
417 )
418 }
419 Matcher::NamePrefix { bytes, folded } => (
420 Driver::Prefix {
421 bytes: if *folded {
422 bytes.clone()
423 } else {
424 fold_exact_needle(bytes)
425 },
426 },
427 *folded,
428 ),
429 Matcher::NameSuffix { bytes, folded } => (
430 Driver::Suffixes {
431 suffixes: vec![if *folded {
432 bytes.clone()
433 } else {
434 fold_exact_needle(bytes)
435 }],
436 files_only: false,
437 },
438 *folded,
439 ),
440 Matcher::Ext { exts } => (
441 Driver::Suffixes {
442 suffixes: exts
443 .iter()
444 .map(|e| {
445 let mut s = Vec::with_capacity(e.len() + 1);
446 s.push(b'.');
447 s.extend_from_slice(e);
448 s
449 })
450 .collect(),
451 files_only: true,
452 },
453 true,
454 ),
455 _ => unreachable!("driver_score gated"),
456 }
457}
458
459fn regex_prefilter_enabled() -> bool {
464 static ENABLED: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
465 *ENABLED.get_or_init(|| std::env::var("FMF_REGEX_PREFILTER").map_or(true, |v| v != "0"))
466}
467
468fn regex_name_prefilter(re: &Regex) -> Option<Driver> {
483 use regex_syntax::hir::literal::{ExtractKind, Extractor};
484
485 let hir = regex_syntax::parse(re.as_str()).ok()?;
486 let factor = |kind: ExtractKind| -> Option<Vec<u8>> {
487 let is_suffix = matches!(kind, ExtractKind::Suffix);
488 let mut ex = Extractor::new();
489 ex.kind(kind);
490 let seq = ex.extract(&hir);
491 let bytes = if is_suffix {
492 seq.longest_common_suffix()
493 } else {
494 seq.longest_common_prefix()
495 }?;
496 let folded = wtf8::fold_str(std::str::from_utf8(bytes).ok()?).into_bytes();
499 (folded.len() >= 2).then_some(folded)
502 };
503
504 let needle = [ExtractKind::Prefix, ExtractKind::Suffix]
506 .into_iter()
507 .filter_map(factor)
508 .max_by_key(Vec::len)?;
509 Some(Driver::Sub {
510 needle_len: needle.len(),
511 finder: memmem::Finder::new(&needle).into_owned(),
512 })
513}
514
515fn regex_prefilter_driver(terms: &[CTerm]) -> Driver {
519 if !regex_prefilter_enabled() {
520 return Driver::FullScan;
521 }
522 terms
523 .iter()
524 .filter(|t| !t.negated)
525 .find_map(|t| match &t.matcher {
526 Matcher::NameRegex { re } => regex_name_prefilter(re),
527 _ => None,
528 })
529 .unwrap_or(Driver::FullScan)
530}
531
532pub fn compile_whole_regex(
544 text: &str,
545 case: CaseMode,
546 scope: RegexScope,
547) -> Result<CompiledQuery, CompileError> {
548 let re = build_regex(text, insensitive(text, case), text)?;
549 let (matcher, needs_orig_paths) = match scope {
550 RegexScope::Name => (Matcher::NameRegex { re }, false),
551 RegexScope::Path => (Matcher::PathRegex { re }, true),
552 };
553 let term = CTerm {
554 negated: false,
555 matcher,
556 exact_needle_unstable: false,
557 };
558 let driver = match scope {
559 RegexScope::Name => regex_prefilter_driver(std::slice::from_ref(&term)),
560 RegexScope::Path => Driver::FullScan,
561 };
562 Ok(CompiledQuery {
563 groups: vec![CompiledGroup {
564 driver,
565 terms: vec![term],
566 driver_term: None,
567 driver_exact: true,
568 }],
569 needs_folded_paths: false,
570 needs_orig_paths,
571 })
572}
573
574pub fn compile(
581 ast: &Ast,
582 case: CaseMode,
583 dates: &dyn DateResolver,
584) -> Result<CompiledQuery, CompileError> {
585 let mut groups = Vec::with_capacity(ast.groups.len());
586 for g in &ast.groups {
587 let mut terms = Vec::with_capacity(g.len());
588 for t in g {
589 terms.push(compile_term(t, case, dates)?);
590 }
591
592 let mut driver_term = None;
595 let mut driver_exact = true;
596 let driver = if terms.is_empty() {
597 Driver::MatchAll
598 } else {
599 let best = terms
600 .iter()
601 .enumerate()
602 .filter_map(|(i, t)| driver_score(t).map(|s| (s, i)))
603 .max_by_key(|(s, _)| *s);
604 match best {
607 Some((score, i)) if score >= 4 => {
608 let t = terms.swap_remove(i);
609 let (d, exact) = driver_for(&t);
610 driver_term = Some(t);
611 driver_exact = exact;
612 d
613 }
614 _ => regex_prefilter_driver(&terms),
619 }
620 };
621
622 terms.sort_by_key(|t| t.matcher.cost());
623 groups.push(CompiledGroup {
624 driver,
625 terms,
626 driver_term,
627 driver_exact,
628 });
629 }
630
631 let needs_folded_paths = groups
632 .iter()
633 .flat_map(|g| &g.terms)
634 .any(|t| t.matcher.needs_folded_path());
635 let needs_orig_paths = groups
636 .iter()
637 .flat_map(|g| &g.terms)
638 .any(|t| t.matcher.needs_orig_path());
639 Ok(CompiledQuery {
640 groups,
641 needs_folded_paths,
642 needs_orig_paths,
643 })
644}
645
646impl CompiledQuery {
647 #[must_use]
649 pub fn driver_label(&self) -> String {
650 let mut labels: Vec<&str> = self.groups.iter().map(|g| g.driver.label()).collect();
651 labels.dedup();
652 labels.join("+")
653 }
654}
655
656#[cfg(test)]
657mod tests {
658 use super::super::parse;
659 use super::*;
660
661 fn prefilter_needle(pattern: &str) -> Option<Vec<u8>> {
662 let re = build_regex(pattern, false, pattern).unwrap();
663 match regex_name_prefilter(&re) {
664 Some(Driver::Sub { finder, .. }) => Some(finder.needle().to_vec()),
665 Some(_) => panic!("regex prefilter must only produce a Sub driver"),
666 None => None,
667 }
668 }
669
670 #[test]
671 fn regex_prefilter_extracts_required_literal() {
672 assert_eq!(prefilter_needle("^report"), Some(b"report".to_vec()));
674 assert_eq!(prefilter_needle("windows.*"), Some(b"windows".to_vec()));
675 assert_eq!(prefilter_needle(r".*\.dll"), Some(b".dll".to_vec()));
677 assert_eq!(prefilter_needle(r"\.rs$"), Some(b".rs".to_vec()));
678 assert_eq!(prefilter_needle("^Report"), Some(b"report".to_vec()));
681 }
682
683 #[test]
684 fn regex_prefilter_declines_without_a_usable_literal() {
685 assert_eq!(prefilter_needle(r"\d+"), None);
686 assert_eq!(prefilter_needle(".*"), None);
687 assert_eq!(
688 prefilter_needle("a"),
689 None,
690 "1-byte literal is not selective"
691 );
692 assert_eq!(
693 prefilter_needle("dll|exe"),
694 None,
695 "no common factor across the alternation"
696 );
697 }
698
699 #[test]
700 fn regex_only_group_drives_a_pool_scan() {
701 let ast = parse("regex:^report").unwrap();
704 let q = compile(&ast, CaseMode::Smart, &super::super::dates::UtcResolver).unwrap();
705 let g = &q.groups[0];
706 assert!(matches!(g.driver, Driver::Sub { .. }), "expected pool-scan");
707 assert!(g.driver_term.is_none(), "regex must remain a residual");
708 assert!(
709 g.terms
710 .iter()
711 .any(|t| matches!(t.matcher, Matcher::NameRegex { .. })),
712 "the regex residual confirms each candidate"
713 );
714
715 let ast = parse(r"regex:\d+").unwrap();
717 let q = compile(&ast, CaseMode::Smart, &super::super::dates::UtcResolver).unwrap();
718 assert!(matches!(q.groups[0].driver, Driver::FullScan));
719 }
720
721 #[test]
722 fn oversized_regex_is_rejected_not_compiled() {
723 let ast = parse(r"regex:(a{500}){500}").unwrap();
727 let result = compile(&ast, CaseMode::Smart, &super::super::dates::UtcResolver);
728 assert!(
729 matches!(result, Err(CompileError::Regex { .. })),
730 "a 1 MiB+ regex program must be refused, not compiled"
731 );
732 }
733}
734
735#[cfg(test)]
736mod proptests {
737 use proptest::{prop_assert, proptest};
738
739 use super::super::{dates::UtcResolver, parse};
740 use super::*;
741
742 proptest! {
743 #[test]
748 fn regex_compile_is_panic_free_and_bounded(
749 body in r"[a-z0-9()\[\]{}.*+?^$|\\]{0,40}"
750 ) {
751 let text = format!("regex:\"{body}\"");
752 if let Ok(ast) = parse(&text) {
753 let _ = compile(&ast, CaseMode::Smart, &UtcResolver);
755 prop_assert!(true);
756 }
757 }
758 }
759}