Skip to main content

fmf_core\scan/
walk.rs

1//! Non-elevated folder-walk scanner for scope mode (ADR-0024).
2//!
3//! The privileged path streams the whole $MFT (`scan_volume`); this one walks
4//! only the roots the user can read, with no admin and no raw volume handle.
5//! It builds the same [`VolumeIndex`] the $MFT scanner does, so the query
6//! layer and the worker are untouched — the only differences are the record
7//! key (a synthetic path hash, `walk_id`) and the change source (the Phase 2
8//! `WatcherJournalSource` instead of the USN journal).
9//!
10//! Layout trick (no index-format change): each configured root is pushed as a
11//! detached top-level entry whose *name is its absolute base path*; the synthetic
12//! `VolumeIndex::ROOT` (slot 0) carries an empty name that
13//! `append_parent_path` skips. So a root's children reconstruct as
14//! `C:\Users\me\Documents\sub\file.txt` with no special path code.
15//!
16//! Memory safety: enumeration is pure safe `std::fs` — on Windows `read_dir`
17//! caches each `WIN32_FIND_DATAW`, so `DirEntry::metadata()` serves
18//! size/mtime/attributes from that cache with no extra syscall and no `unsafe`.
19
20use std::collections::HashSet;
21use std::path::PathBuf;
22use std::time::Instant;
23
24use std::os::windows::ffi::OsStrExt;
25use std::os::windows::fs::MetadataExt;
26
27use crate::index::{Frn, RawEntry, VolumeIndex, VolumeIndexBuilder};
28use crate::wtf8::push_wtf8_pair;
29
30use super::ScanStats;
31use super::walk_id::path_record;
32
33// Raw `dwFileAttributes` bits we classify on (windows_sys mirrors these; kept
34// local so the hot enumeration loop has no import churn).
35const ATTR_DIRECTORY: u32 = 0x10;
36const ATTR_HIDDEN: u32 = 0x2;
37const ATTR_SYSTEM: u32 = 0x4;
38const ATTR_REPARSE_POINT: u32 = 0x400;
39
40/// Depth cap, kept under `append_parent_path`'s 128-deep chain buffer so a
41/// pathological tree can never produce a truncated path silently.
42const MAX_DEPTH: u32 = 100;
43
44/// The synthetic record of `VolumeIndex::ROOT` (slot 0). Roots attach here;
45/// the value 0 is reserved (a real path colliding with it is a ~2⁻⁴⁸ event
46/// that only shadows slot 0, which carries no name).
47const SCOPE_ROOT_RECORD: u64 = 0;
48
49/// One directory queued for enumeration: its on-disk path plus the data its
50/// children need — the parent's folded path (to extend, not re-fold) and the
51/// parent's record (the children's `parent_frn`).
52struct Pending {
53    path: PathBuf,
54    folded: Vec<u8>,
55    record: u64,
56    depth: u32,
57}
58
59/// Walk `roots` (absolute base paths), pruning any subtree whose folded path
60/// matches an entry in `excludes` (ADR-0025), and build a queryable
61/// [`VolumeIndex`].
62///
63/// Infallible by design (don't crash): an unreadable root or directory is counted
64/// in [`ScanStats::walk_read_errors`] and skipped, never propagated. The
65/// worker maps that count to a counter + warn at its single mapping point.
66#[must_use]
67pub fn walk_scan(roots: &[String], excludes: &[String]) -> (VolumeIndex, ScanStats) {
68    let t0 = Instant::now();
69    let mut stats = ScanStats {
70        volume: "scope".to_string(),
71        ..Default::default()
72    };
73    let mut b = VolumeIndexBuilder::new("", SCOPE_ROOT_RECORD);
74
75    // Reused across every entry so the walk allocates O(depth), not O(entries).
76    let mut units: Vec<u16> = Vec::new();
77    let mut name_buf: Vec<u8> = Vec::new();
78    let mut lower_buf: Vec<u8> = Vec::new();
79    let mut stack: Vec<Pending> = Vec::new();
80
81    // Fold each excluded subtree to the same per-char lowercase form the walk
82    // builds for every entry (ADR-0003/-0025), so the prune check below is one
83    // case-insensitive hash lookup that never re-folds a path. Empty (lookups
84    // all-miss) in the common no-exclude case.
85    let prune_set: HashSet<Vec<u8>> = excludes
86        .iter()
87        .map(|e| {
88            units.clear();
89            units.extend(PathBuf::from(e).as_os_str().encode_wide());
90            name_buf.clear();
91            lower_buf.clear();
92            push_wtf8_pair(&units, &mut name_buf, &mut lower_buf);
93            lower_buf.clone()
94        })
95        .collect();
96
97    for root in roots {
98        let path = PathBuf::from(root);
99        let md = match std::fs::metadata(&path) {
100            Ok(m) if m.is_dir() => m,
101            _ => {
102                stats.walk_read_errors += 1;
103                continue;
104            }
105        };
106        units.clear();
107        units.extend(path.as_os_str().encode_wide());
108        name_buf.clear();
109        lower_buf.clear();
110        push_wtf8_pair(&units, &mut name_buf, &mut lower_buf);
111        let record = path_record(&lower_buf);
112        let attrs = md.file_attributes();
113        // The root's *name* is its whole absolute base path; with slot 0's
114        // empty name skipped by append_parent_path, children resolve to a
115        // correct absolute path with no path-code change.
116        b.push(RawEntry {
117            parent_frn: Frn(SCOPE_ROOT_RECORD),
118            frn: Frn(record),
119            name_utf16: &units,
120            is_dir: true,
121            is_reparse: attrs & ATTR_REPARSE_POINT != 0,
122            is_hidden: attrs & ATTR_HIDDEN != 0,
123            is_system: attrs & ATTR_SYSTEM != 0,
124            size: md.file_size(),
125            mtime: md.last_write_time() as i64,
126        });
127        stats.dirs += 1;
128        // A root that folds to a trailing separator (drive roots: `C:\` →
129        // `c:\`) would seed its children as `c:\\users`; strip it so they read
130        // `c:\users` — canonical, matching fold(full_path), the exclude set
131        // above, and the watcher's stateless recompute.
132        let mut folded = lower_buf.clone();
133        if folded.last() == Some(&b'\\') {
134            folded.pop();
135        }
136        stack.push(Pending {
137            path,
138            folded,
139            record,
140            depth: 0,
141        });
142    }
143
144    while let Some(cur) = stack.pop() {
145        let Ok(rd) = std::fs::read_dir(&cur.path) else {
146            stats.walk_read_errors += 1;
147            continue;
148        };
149        for entry in rd {
150            let Ok(entry) = entry else {
151                stats.walk_read_errors += 1;
152                continue;
153            };
154            // No extra syscall on Windows: served from the cached find data,
155            // and (like symlink_metadata) it does not traverse reparse points.
156            let Ok(md) = entry.metadata() else {
157                stats.walk_read_errors += 1;
158                continue;
159            };
160            let attrs = md.file_attributes();
161            let is_dir = attrs & ATTR_DIRECTORY != 0;
162            let is_reparse = attrs & ATTR_REPARSE_POINT != 0;
163
164            let name = entry.file_name();
165            units.clear();
166            units.extend(name.encode_wide());
167            name_buf.clear();
168            lower_buf.clear();
169            // lower_buf = folded name; folding is per-char and length-
170            // preserving (ADR-0003), so folded(parent)+"\"+folded(name) equals
171            // folded(full path) — the watcher recomputes the same key.
172            push_wtf8_pair(&units, &mut name_buf, &mut lower_buf);
173            let mut folded_child = cur.folded.clone();
174            folded_child.push(b'\\');
175            folded_child.extend_from_slice(&lower_buf);
176            let record = path_record(&folded_child);
177
178            // User-excluded subtree (scope mode, ADR-0025): drop the directory
179            // and skip descent so nothing under it is indexed. Matched on the
180            // folded path already built for the record — one lookup, no re-fold.
181            if is_dir && prune_set.contains(&folded_child) {
182                stats.walk_excluded_pruned += 1;
183                continue;
184            }
185
186            b.push(RawEntry {
187                parent_frn: Frn(cur.record),
188                frn: Frn(record),
189                name_utf16: &units,
190                is_dir,
191                is_reparse,
192                is_hidden: attrs & ATTR_HIDDEN != 0,
193                is_system: attrs & ATTR_SYSTEM != 0,
194                size: md.file_size(),
195                mtime: md.last_write_time() as i64,
196            });
197            if is_dir {
198                stats.dirs += 1;
199            } else {
200                stats.files += 1;
201            }
202
203            // Descend into real directories only — never follow reparse points
204            // (junctions/symlinks loop and can escape the root).
205            if is_dir && !is_reparse {
206                if cur.depth + 1 < MAX_DEPTH {
207                    stack.push(Pending {
208                        path: entry.path(),
209                        folded: folded_child,
210                        record,
211                        depth: cur.depth + 1,
212                    });
213                } else {
214                    stats.walk_depth_truncated += 1;
215                }
216            }
217        }
218    }
219
220    stats.elapsed_walk_ms = t0.elapsed().as_millis() as u64;
221    stats.walk_dirs = stats.dirs;
222    stats.walk_files = stats.files;
223    let (idx, finish) = b.finish_timed();
224    stats.elapsed_build_ms = finish.build_ms;
225    stats.elapsed_sort_ms = finish.sort_ms;
226    stats.elapsed_total_ms = t0.elapsed().as_millis() as u64;
227    stats.peak_working_set_bytes = crate::mft::peak_working_set();
228    (idx, stats)
229}
230
231#[cfg(test)]
232mod tests {
233    use super::*;
234
235    /// A real on-disk tree under a unique temp dir, removed on drop.
236    struct Tree(PathBuf);
237    impl Tree {
238        fn new(tag: &str) -> Self {
239            let dir =
240                std::env::temp_dir().join(format!("fmf-walk-test-{tag}-{}", std::process::id()));
241            let _ = std::fs::remove_dir_all(&dir);
242            std::fs::create_dir_all(&dir).expect("create temp tree root");
243            Self(dir)
244        }
245    }
246    impl Drop for Tree {
247        fn drop(&mut self) {
248            let _ = std::fs::remove_dir_all(&self.0);
249        }
250    }
251
252    fn folded(path: &str) -> Vec<u8> {
253        let units: Vec<u16> = std::path::Path::new(path)
254            .as_os_str()
255            .encode_wide()
256            .collect();
257        let (mut n, mut l) = (Vec::new(), Vec::new());
258        push_wtf8_pair(&units, &mut n, &mut l);
259        l
260    }
261
262    #[test]
263    fn walk_builds_queryable_index_with_correct_paths() {
264        let tree = Tree::new("paths");
265        let root = tree.0.join("Documents");
266        std::fs::create_dir_all(root.join("sub")).unwrap();
267        std::fs::write(root.join("alpha.txt"), b"a").unwrap();
268        std::fs::write(root.join("sub").join("beta.log"), b"bb").unwrap();
269
270        let root_str = root.to_str().unwrap().to_string();
271        let (idx, stats) = walk_scan(std::slice::from_ref(&root_str), &[]);
272
273        // Root + sub (2 dirs) and alpha.txt + beta.log (2 files).
274        assert_eq!(stats.dirs, 2, "dirs");
275        assert_eq!(stats.files, 2, "files");
276        assert_eq!(stats.walk_read_errors, 0, "no read errors");
277
278        // The deep file reconstructs to its true absolute path.
279        let beta_path = format!("{root_str}\\sub\\beta.log");
280        let id = idx
281            .entry_by_record(path_record(&folded(&beta_path)))
282            .expect("beta.log indexed");
283        let mut out = Vec::new();
284        idx.append_path(id, &mut out);
285        assert_eq!(String::from_utf8(out).unwrap(), beta_path);
286
287        // Parent linkage resolved across walk order: beta.log → sub → root.
288        let sub_id = idx
289            .entry_by_record(path_record(&folded(&format!("{root_str}\\sub"))))
290            .expect("sub indexed");
291        assert_eq!(idx.parent(id), sub_id);
292        let root_id = idx
293            .entry_by_record(path_record(&folded(&root_str)))
294            .expect("root indexed");
295        assert_eq!(idx.parent(sub_id), root_id);
296        assert!(idx.is_dir(sub_id));
297        assert_eq!(idx.size(id), 2);
298    }
299
300    #[test]
301    fn multiple_roots_share_one_index() {
302        let tree = Tree::new("multiroot");
303        let a = tree.0.join("RootA");
304        let b = tree.0.join("RootB");
305        std::fs::create_dir_all(&a).unwrap();
306        std::fs::create_dir_all(&b).unwrap();
307        std::fs::write(a.join("one.txt"), b"1").unwrap();
308        std::fs::write(b.join("two.txt"), b"2").unwrap();
309
310        let (idx, stats) = walk_scan(
311            &[
312                a.to_str().unwrap().to_string(),
313                b.to_str().unwrap().to_string(),
314            ],
315            &[],
316        );
317        assert_eq!(stats.files, 2, "one file per root");
318
319        for (root, name) in [(&a, "one.txt"), (&b, "two.txt")] {
320            let p = format!("{}\\{name}", root.to_str().unwrap());
321            let id = idx
322                .entry_by_record(path_record(&folded(&p)))
323                .unwrap_or_else(|| panic!("{p} indexed"));
324            let mut out = Vec::new();
325            idx.append_path(id, &mut out);
326            assert_eq!(String::from_utf8(out).unwrap(), p);
327        }
328    }
329
330    #[test]
331    fn missing_root_is_counted_not_fatal() {
332        let (idx, stats) = walk_scan(&["Z:\\does\\not\\exist-fmf".to_string()], &[]);
333        assert_eq!(stats.walk_read_errors, 1);
334        // Only the synthetic ROOT slot exists; no real entries.
335        assert_eq!(stats.files, 0);
336        assert_eq!(stats.dirs, 0);
337        assert_eq!(idx.live_len(), 1, "just the empty synthetic root");
338    }
339
340    #[test]
341    fn excluded_subtree_is_pruned() {
342        let tree = Tree::new("exclude");
343        let root = tree.0.join("Projects");
344        std::fs::create_dir_all(root.join("app").join("node_modules")).unwrap();
345        std::fs::create_dir_all(root.join("archive")).unwrap();
346        std::fs::write(root.join("app").join("main.rs"), b"m").unwrap();
347        std::fs::write(root.join("app").join("node_modules").join("dep.js"), b"d").unwrap();
348        std::fs::write(root.join("archive").join("old.txt"), b"o").unwrap();
349
350        let root_str = root.to_str().unwrap().to_string();
351        let nm = format!("{root_str}\\app\\node_modules");
352        let archive = format!("{root_str}\\archive");
353        let (idx, stats) = walk_scan(
354            std::slice::from_ref(&root_str),
355            &[nm.clone(), archive.clone()],
356        );
357
358        // Both excluded subtrees are pruned at their roots.
359        assert_eq!(stats.walk_excluded_pruned, 2, "two subtrees pruned");
360        // app/main.rs survives; the excluded files (and the excluded dirs
361        // themselves) do not.
362        let present = |p: &str| idx.entry_by_record(path_record(&folded(p))).is_some();
363        assert!(
364            present(&format!("{root_str}\\app\\main.rs")),
365            "sibling kept"
366        );
367        assert!(!present(&format!("{nm}\\dep.js")), "excluded file dropped");
368        assert!(
369            !present(&format!("{archive}\\old.txt")),
370            "excluded file dropped"
371        );
372        assert!(!present(&nm), "excluded dir itself dropped");
373        assert!(!present(&archive), "excluded dir itself dropped");
374    }
375
376    #[test]
377    fn root_with_trailing_separator_resolves_and_prunes_canonically() {
378        let tree = Tree::new("trailingsep");
379        let base = tree.0.join("Data");
380        std::fs::create_dir_all(base.join("keep")).unwrap();
381        std::fs::create_dir_all(base.join("skip")).unwrap();
382        std::fs::write(base.join("keep").join("a.txt"), b"a").unwrap();
383        std::fs::write(base.join("skip").join("b.txt"), b"b").unwrap();
384
385        let base_str = base.to_str().unwrap().to_string();
386        // Root passed WITH a trailing separator (how a drive root like `C:\`
387        // arrives): children must still key on single-separator folded paths,
388        // so both lookups and exclude matching work.
389        let root_with_sep = format!("{base_str}\\");
390        let skip = format!("{base_str}\\skip");
391        let (idx, stats) = walk_scan(
392            std::slice::from_ref(&root_with_sep),
393            std::slice::from_ref(&skip),
394        );
395
396        assert_eq!(
397            stats.walk_excluded_pruned, 1,
398            "trailing-sep root still prunes"
399        );
400        let present = |p: &str| idx.entry_by_record(path_record(&folded(p))).is_some();
401        assert!(
402            present(&format!("{base_str}\\keep\\a.txt")),
403            "kept file resolves"
404        );
405        assert!(!present(&format!("{skip}\\b.txt")), "excluded file dropped");
406    }
407}