Skip to main content

fmf_core\query/
ast.rs

1//! Tokenizer and parser: query text → [`Ast`] (OR of AND groups).
2
3use thiserror::Error;
4
5use super::dates::Civil;
6
7/// Reasons [`parse`] rejects malformed query text.
8#[derive(Debug, Error, PartialEq, Eq)]
9pub enum ParseError {
10    /// A `"` opened a quoted section that was never closed.
11    #[error("unclosed quote")]
12    UnclosedQuote,
13    /// A `size:` value did not parse as a byte amount or range.
14    #[error("invalid size filter `{0}`")]
15    InvalidSize(String),
16    /// A `dm:` value did not parse as a date or date range.
17    #[error("invalid date filter `{0}`")]
18    InvalidDate(String),
19    /// Negation was applied to a term that cannot be negated (e.g. `!folder:foo`).
20    #[error("`{0}` cannot be negated")]
21    CannotNegate(String),
22}
23
24/// A single matchable condition within an AND group of the [`Ast`].
25#[derive(Debug, Clone, PartialEq)]
26pub enum Term {
27    /// Substring in the file name.
28    Name(String),
29    /// Substring in the full path (term contained `\` or used `path:`).
30    Path(String),
31    /// `*`/`?` pattern matching the whole file name.
32    Wildcard(String),
33    /// `*`/`?` pattern matched unanchored against the full path.
34    PathWildcard(String),
35    /// `regex:` — applied to the file name.
36    Regex(String),
37    /// `ext:` — any of these extensions (without dots, original case).
38    Ext(Vec<String>),
39    /// `size:` — inclusive byte range.
40    Size {
41        /// Lower bound, in bytes (inclusive).
42        min: u64,
43        /// Upper bound, in bytes (inclusive).
44        max: u64,
45    },
46    /// `dm:` — [start, end) at local midnight; `None` = unbounded.
47    Mtime {
48        /// Inclusive start of the range at local midnight; `None` = unbounded below.
49        start: Option<Civil>,
50        /// Exclusive end of the range at local midnight; `None` = unbounded above.
51        end: Option<Civil>,
52    },
53    /// `folder:` / `file:`.
54    IsDir(bool),
55    /// Logical negation of the inner term.
56    Not(Box<Self>),
57}
58
59/// OR of AND groups: `a b | c` → `[[a, b], [c]]`.
60#[derive(Debug, Clone, PartialEq)]
61pub struct Ast {
62    /// AND groups joined by OR: a file matches if every term in any one group matches.
63    pub groups: Vec<Vec<Term>>,
64}
65
66/// Tokenize and parse query text into an [`Ast`].
67///
68/// # Errors
69///
70/// Returns a [`ParseError`] for malformed input: an unclosed quote, or an
71/// invalid `size:`/date filter, or a term that cannot be negated.
72///
73/// # Panics
74///
75/// Does not panic: `groups` is seeded with one group and only grows, so the
76/// `last_mut` access always succeeds.
77pub fn parse(input: &str) -> Result<Ast, ParseError> {
78    let mut groups: Vec<Vec<Term>> = vec![Vec::new()];
79    let mut rest = input;
80
81    loop {
82        rest = rest.trim_start();
83        if rest.is_empty() {
84            break;
85        }
86        if let Some(r) = rest.strip_prefix('|') {
87            groups.push(Vec::new());
88            rest = r;
89            continue;
90        }
91
92        let mut negated = false;
93        while let Some(r) = rest.strip_prefix('!') {
94            negated = !negated;
95            rest = r;
96        }
97
98        let (atom, r) = read_atom(rest)?;
99        rest = r;
100        if atom.is_empty() {
101            continue;
102        }
103        let terms = terms_from_atom(&atom, negated)?;
104        for t in terms {
105            groups
106                .last_mut()
107                .expect("groups is seeded with one element")
108                .push(t);
109        }
110    }
111
112    Ok(Ast { groups })
113}
114
115/// Read one atom: up to whitespace or `|`, honoring quoted sections
116/// (`"two words"`, `path:"Program Files"`).
117fn read_atom(input: &str) -> Result<(String, &str), ParseError> {
118    let mut out = String::new();
119    let mut it = input.char_indices();
120    let mut in_quotes = false;
121    loop {
122        let Some((i, c)) = it.next() else {
123            if in_quotes {
124                return Err(ParseError::UnclosedQuote);
125            }
126            return Ok((out, ""));
127        };
128        match c {
129            '"' => {
130                in_quotes = !in_quotes;
131                out.push(c);
132            }
133            c if !in_quotes && (c.is_whitespace() || c == '|') => {
134                return Ok((out, &input[i..]));
135            }
136            c => out.push(c),
137        }
138    }
139}
140
141fn unquote(s: &str) -> &str {
142    s.strip_prefix('"')
143        .and_then(|s| s.strip_suffix('"'))
144        .unwrap_or(s)
145}
146
147const FIELDS: &[&str] = &["ext", "path", "size", "dm", "regex", "file", "folder"];
148
149fn terms_from_atom(atom: &str, negated: bool) -> Result<Vec<Term>, ParseError> {
150    let wrap = |t: Term| if negated { Term::Not(Box::new(t)) } else { t };
151
152    // field:value?
153    if !atom.starts_with('"')
154        && let Some(colon) = atom.find(':')
155    {
156        let field = &atom[..colon];
157        let raw_value = &atom[colon + 1..];
158        if FIELDS.contains(&field.to_ascii_lowercase().as_str()) {
159            let value = unquote(raw_value);
160            return match field.to_ascii_lowercase().as_str() {
161                "ext" => {
162                    let exts: Vec<String> = value
163                        .split([';', ','])
164                        .map(|e| e.trim().trim_start_matches('.').to_string())
165                        .filter(|e| !e.is_empty())
166                        .collect();
167                    Ok(vec![wrap(Term::Ext(exts))])
168                }
169                "path" => Ok(vec![wrap(if has_wildcard(value) {
170                    Term::PathWildcard(value.to_string())
171                } else {
172                    Term::Path(value.to_string())
173                })]),
174                "size" => {
175                    let (min, max) = parse_size_range(value)
176                        .ok_or_else(|| ParseError::InvalidSize(value.to_string()))?;
177                    Ok(vec![wrap(Term::Size { min, max })])
178                }
179                "dm" => {
180                    let (start, end) = parse_date_range(value)
181                        .ok_or_else(|| ParseError::InvalidDate(value.to_string()))?;
182                    Ok(vec![wrap(Term::Mtime { start, end })])
183                }
184                // The regex value is the raw remainder (may itself contain ':').
185                "regex" => Ok(vec![wrap(Term::Regex(unquote(raw_value).to_string()))]),
186                "file" | "folder" => {
187                    let is_dir = field.eq_ignore_ascii_case("folder");
188                    if value.is_empty() {
189                        Ok(vec![wrap(Term::IsDir(is_dir))])
190                    } else if negated {
191                        // `!folder:foo` is ambiguous; refuse rather than guess.
192                        Err(ParseError::CannotNegate(format!("{field}:{value}")))
193                    } else {
194                        Ok(vec![Term::IsDir(is_dir), name_or_path_term(value)])
195                    }
196                }
197                _ => unreachable!(),
198            };
199        }
200    }
201
202    Ok(vec![wrap(name_or_path_term(unquote(atom)))])
203}
204
205fn name_or_path_term(value: &str) -> Term {
206    let wild = has_wildcard(value);
207    let pathy = value.contains('\\');
208    match (pathy, wild) {
209        (true, true) => Term::PathWildcard(value.to_string()),
210        (true, false) => Term::Path(value.to_string()),
211        (false, true) => Term::Wildcard(value.to_string()),
212        (false, false) => Term::Name(value.to_string()),
213    }
214}
215
216fn has_wildcard(s: &str) -> bool {
217    s.contains(['*', '?'])
218}
219
220/// `size:` value → inclusive byte range.
221/// Forms: `123`, `1kb`, `1.5mb`, `>1gb`, `>=`, `<`, `<=`, `1mb..2gb`.
222fn parse_size_range(v: &str) -> Option<(u64, u64)> {
223    if let Some((a, b)) = v.split_once("..") {
224        return Some((parse_size(a)?, parse_size(b)?));
225    }
226    if let Some(r) = v.strip_prefix(">=") {
227        return Some((parse_size(r)?, u64::MAX));
228    }
229    if let Some(r) = v.strip_prefix("<=") {
230        return Some((0, parse_size(r)?));
231    }
232    if let Some(r) = v.strip_prefix('>') {
233        return Some((parse_size(r)?.checked_add(1)?, u64::MAX));
234    }
235    if let Some(r) = v.strip_prefix('<') {
236        return Some((0, parse_size(r)?.checked_sub(1)?));
237    }
238    let exact = parse_size(v)?;
239    Some((exact, exact))
240}
241
242fn parse_size(v: &str) -> Option<u64> {
243    let v = v.trim().to_ascii_lowercase();
244    if v.is_empty() {
245        return None;
246    }
247    let (num, mult) = if let Some(n) = v.strip_suffix("kb").or_else(|| v.strip_suffix('k')) {
248        (n, 1u64 << 10)
249    } else if let Some(n) = v.strip_suffix("mb").or_else(|| v.strip_suffix('m')) {
250        (n, 1u64 << 20)
251    } else if let Some(n) = v.strip_suffix("gb").or_else(|| v.strip_suffix('g')) {
252        (n, 1u64 << 30)
253    } else if let Some(n) = v.strip_suffix("tb").or_else(|| v.strip_suffix('t')) {
254        (n, 1u64 << 40)
255    } else if let Some(n) = v.strip_suffix('b') {
256        (n, 1)
257    } else {
258        (v.as_str(), 1)
259    };
260    let num = num.trim();
261    if num.is_empty() {
262        return None;
263    }
264    if let Ok(i) = num.parse::<u64>() {
265        return i.checked_mul(mult);
266    }
267    let f: f64 = num.parse().ok()?;
268    if !f.is_finite() || f < 0.0 {
269        return None;
270    }
271    Some((f * mult as f64) as u64)
272}
273
274/// `dm:` value → [start, end) civil-date bounds.
275/// Forms: `2024`, `2024-03`, `2024/03/05`, `a..b`, `>x`, `>=x`, `<x`, `<=x`.
276fn parse_date_range(v: &str) -> Option<(Option<Civil>, Option<Civil>)> {
277    if let Some((a, b)) = v.split_once("..") {
278        let (sa, _) = parse_date_period(a)?;
279        let (_, eb) = parse_date_period(b)?;
280        return Some((Some(sa), Some(eb)));
281    }
282    if let Some(r) = v.strip_prefix(">=") {
283        let (s, _) = parse_date_period(r)?;
284        return Some((Some(s), None));
285    }
286    if let Some(r) = v.strip_prefix("<=") {
287        let (_, e) = parse_date_period(r)?;
288        return Some((None, Some(e)));
289    }
290    if let Some(r) = v.strip_prefix('>') {
291        let (_, e) = parse_date_period(r)?;
292        return Some((Some(e), None));
293    }
294    if let Some(r) = v.strip_prefix('<') {
295        let (s, _) = parse_date_period(r)?;
296        return Some((None, Some(s)));
297    }
298    let (s, e) = parse_date_period(v)?;
299    Some((Some(s), Some(e)))
300}
301
302/// One date period → [start, `end_exclusive`).
303fn parse_date_period(v: &str) -> Option<(Civil, Civil)> {
304    let parts: Vec<&str> = v.trim().split(['-', '/']).collect();
305    let nums: Vec<u32> = parts
306        .iter()
307        .map(|p| p.parse::<u32>().ok())
308        .collect::<Option<_>>()?;
309    // Reject years outside Civil's supported range (see `Civil::is_valid`:
310    // 1601..=9999) before building any Civil. Such a year can never match, and
311    // the i32 successor/predecessor math below — `*y as i32 + 1`, the eagerly
312    // evaluated `first_of_next_month`, and `days_from_civil`'s `c.y - 1` reached
313    // via `next_day` — overflows for values near i32::MAX/MIN (e.g.
314    // `dm:2147483647`, `dm:2147483648-1-1`), which panics under the debug
315    // profile's overflow-checks. The literals mirror is_valid's range, kept in
316    // u32 so the parsed value compares without a sign-losing cast.
317    let &year = nums.first()?;
318    if !(1601..=9999).contains(&year) {
319        return None;
320    }
321    match nums.as_slice() {
322        [y] => {
323            let start = Civil {
324                y: *y as i32,
325                m: 1,
326                d: 1,
327            };
328            let end = Civil {
329                y: *y as i32 + 1,
330                m: 1,
331                d: 1,
332            };
333            (start.is_valid() && end.is_valid()).then_some((start, end))
334        }
335        [y, m] => {
336            let start = Civil {
337                y: *y as i32,
338                m: *m,
339                d: 1,
340            };
341            start
342                .is_valid()
343                .then_some((start, start.first_of_next_month()))
344        }
345        [y, m, d] => {
346            let start = Civil {
347                y: *y as i32,
348                m: *m,
349                d: *d,
350            };
351            start.is_valid().then_some((start, start.next_day()))
352        }
353        _ => None,
354    }
355}
356
357#[cfg(test)]
358mod tests {
359    use super::*;
360
361    fn p(s: &str) -> Ast {
362        parse(s).unwrap()
363    }
364
365    #[test]
366    fn plain_words_are_anded() {
367        assert_eq!(
368            p("foo bar").groups,
369            vec![vec![Term::Name("foo".into()), Term::Name("bar".into())]]
370        );
371    }
372
373    #[test]
374    fn or_splits_groups() {
375        assert_eq!(
376            p("foo | bar baz").groups,
377            vec![
378                vec![Term::Name("foo".into())],
379                vec![Term::Name("bar".into()), Term::Name("baz".into())],
380            ]
381        );
382    }
383
384    #[test]
385    fn negation_and_double_negation() {
386        assert_eq!(
387            p("!tmp").groups[0][0],
388            Term::Not(Box::new(Term::Name("tmp".into())))
389        );
390        assert_eq!(p("!!tmp").groups[0][0], Term::Name("tmp".into()));
391    }
392
393    #[test]
394    fn phrase_keeps_spaces() {
395        assert_eq!(
396            p(r#""two words""#).groups[0][0],
397            Term::Name("two words".into())
398        );
399    }
400
401    #[test]
402    fn backslash_switches_to_path() {
403        assert_eq!(
404            p(r"docs\reports").groups[0][0],
405            Term::Path(r"docs\reports".into())
406        );
407    }
408
409    #[test]
410    fn wildcards_detected() {
411        assert_eq!(p("*.rs").groups[0][0], Term::Wildcard("*.rs".into()));
412        assert_eq!(
413            p(r"src\*.rs").groups[0][0],
414            Term::PathWildcard(r"src\*.rs".into())
415        );
416    }
417
418    #[test]
419    fn ext_filter_splits_csv() {
420        assert_eq!(
421            p("ext:jpg;png,.gif").groups[0][0],
422            Term::Ext(vec!["jpg".into(), "png".into(), "gif".into()])
423        );
424    }
425
426    #[test]
427    fn dm_out_of_range_year_is_rejected_without_overflow() {
428        // Years near i32::MAX/MIN must not overflow the date arithmetic in
429        // parse_date_period (the debug profile's overflow-checks would panic);
430        // they are rejected as InvalidDate. Regression for `dm:2147483647`
431        // (year+1 in the [y] arm), `dm:2147483647-12` (first_of_next_month,
432        // evaluated eagerly even though the start is invalid), and
433        // `dm:2147483648-1-1` (`c.y - 1` in days_from_civil via next_day).
434        for v in [
435            "dm:2147483647",
436            "dm:2147483647-12",
437            "dm:2147483648-1-1",
438            "dm:2020..2147483647",
439            "dm:4294967295",
440            "dm:99999999",
441        ] {
442            assert!(
443                matches!(parse(v), Err(ParseError::InvalidDate(_))),
444                "{v} should be rejected as InvalidDate, not panic",
445            );
446        }
447        // A year inside the supported range still parses to an Mtime term.
448        assert!(matches!(p("dm:2024").groups[0][0], Term::Mtime { .. }));
449    }
450
451    #[test]
452    fn quoted_filter_value() {
453        assert_eq!(
454            p(r#"path:"Program Files""#).groups[0][0],
455            Term::Path("Program Files".into())
456        );
457    }
458
459    #[test]
460    fn size_ranges() {
461        assert_eq!(
462            p("size:>1kb").groups[0][0],
463            Term::Size {
464                min: 1025,
465                max: u64::MAX
466            }
467        );
468        assert_eq!(
469            p("size:1kb..1mb").groups[0][0],
470            Term::Size {
471                min: 1024,
472                max: 1 << 20
473            }
474        );
475        assert_eq!(
476            p("size:1.5kb").groups[0][0],
477            Term::Size {
478                min: 1536,
479                max: 1536
480            }
481        );
482        assert!(parse("size:abc").is_err());
483    }
484
485    #[test]
486    fn date_ranges() {
487        let y2024 = Civil {
488            y: 2024,
489            m: 1,
490            d: 1,
491        };
492        let y2025 = Civil {
493            y: 2025,
494            m: 1,
495            d: 1,
496        };
497        assert_eq!(
498            p("dm:2024").groups[0][0],
499            Term::Mtime {
500                start: Some(y2024),
501                end: Some(y2025)
502            }
503        );
504        assert_eq!(
505            p("dm:>=2024-03").groups[0][0],
506            Term::Mtime {
507                start: Some(Civil {
508                    y: 2024,
509                    m: 3,
510                    d: 1
511                }),
512                end: None
513            }
514        );
515        assert_eq!(
516            p("dm:2024/02/28..2024/02/29").groups[0][0],
517            Term::Mtime {
518                start: Some(Civil {
519                    y: 2024,
520                    m: 2,
521                    d: 28
522                }),
523                end: Some(Civil {
524                    y: 2024,
525                    m: 3,
526                    d: 1
527                }),
528            }
529        );
530        assert!(parse("dm:2023-02-29").is_err());
531    }
532
533    #[test]
534    fn folder_with_value_expands() {
535        assert_eq!(
536            p("folder:src").groups[0],
537            vec![Term::IsDir(true), Term::Name("src".into())]
538        );
539        assert!(parse("!folder:src").is_err());
540        assert_eq!(
541            p("!folder:").groups[0][0],
542            Term::Not(Box::new(Term::IsDir(true)))
543        );
544    }
545
546    #[test]
547    fn unknown_colon_atom_is_a_name() {
548        assert_eq!(p("12:30").groups[0][0], Term::Name("12:30".into()));
549    }
550
551    #[test]
552    fn unclosed_quote_errors() {
553        assert_eq!(parse(r#""abc"#), Err(ParseError::UnclosedQuote));
554    }
555
556    #[test]
557    fn empty_query_is_match_all() {
558        assert_eq!(p("").groups, vec![Vec::<Term>::new()]);
559    }
560}
561
562#[cfg(test)]
563mod proptests {
564    use proptest::{prop_assert, proptest};
565
566    use super::parse;
567
568    proptest! {
569        // `parse` must never panic on ANY input — the documented
570        // "# Panics: Does not panic" contract, validated across the whole input
571        // space (a lightweight fuzz). On success the AST always has at least one
572        // group (it is seeded with one and only grows).
573        #[test]
574        fn parse_never_panics(s in ".*") {
575            if let Ok(ast) = parse(&s) {
576                prop_assert!(!ast.groups.is_empty());
577            }
578        }
579
580        // Same property, biased to the query alphabet (operators, filters,
581        // quotes, backslash) so the filter/operator parsing paths get dense
582        // coverage rather than mostly-inert random Unicode.
583        #[test]
584        fn parse_never_panics_on_query_alphabet(
585            s in "[a-zA-Z0-9 |!\"*?:;.<>=,/\\\\-]{0,40}"
586        ) {
587            if let Ok(ast) = parse(&s) {
588                prop_assert!(!ast.groups.is_empty());
589            }
590        }
591    }
592}