Skip to main content

fmf_core/
mft.rs

1//! Initial full-volume index source: raw $MFT scan via ntfs-reader.
2//! Holds the measurement spike (`spike_scan`) and the whole-$MFT reference
3//! scanner used as the streaming scanner's equivalence gate.
4
5use std::time::Instant;
6
7use ntfs_reader::api::{NtfsAttributeType, NtfsFileName, NtfsFileNamespace};
8use ntfs_reader::errors::NtfsReaderError;
9use ntfs_reader::file::NtfsFile;
10use ntfs_reader::mft::Mft;
11use ntfs_reader::volume::Volume;
12use thiserror::Error;
13
14use crate::index::{Frn, RawEntry, VolumeIndex, VolumeIndexBuilder};
15
16// The production scanner (and the ScanStats both scanners fill) lives in
17// crate::scan; re-exported here so callers keep one import path.
18pub use crate::scan::{ScanStats, scan_volume};
19
/// Failure modes of a raw $MFT volume scan.
///
/// This is the error type returned by [`spike_scan`] and
/// [`scan_volume_reference`].
#[derive(Debug, Error)]
pub enum MftError {
    /// The process lacks the privileges to open the raw volume (MFT/USN reads
    /// require an elevated process; run from an administrator terminal).
    ///
    /// The scanners in this module produce this variant when opening the
    /// volume fails with `NtfsReaderError::ElevationError`.
    #[error("volume scan requires an elevated process (run from an administrator terminal)")]
    NotElevated,
    /// An error surfaced by the underlying ntfs-reader (volume open or $MFT read).
    ///
    /// The `#[from]` conversion lets `?` wrap an [`NtfsReaderError`] directly.
    #[error("ntfs-reader: {0}")]
    Ntfs(#[from] NtfsReaderError),
}
31
/// Measurements from a full $MFT scan of one volume, as filled in by
/// [`spike_scan`].
///
/// All `elapsed_*_ms` fields are wall-clock durations in milliseconds.
#[derive(Debug, Default)]
pub struct SpikeStats {
    /// Drive letter spec of the scanned volume (e.g. `C:`).
    pub volume: String,
    /// Time to open the raw volume handle, in milliseconds.
    pub elapsed_volume_open_ms: u64,
    /// Time for `Mft::new`, in milliseconds: reads the whole $MFT into
    /// memory + fixups.
    pub elapsed_mft_load_ms: u64,
    /// Time to walk every in-use record and extract name/size/dates, in
    /// milliseconds.
    pub elapsed_iterate_ms: u64,
    /// Size of the raw $MFT in bytes — the peak-RAM driver of this approach.
    pub mft_bytes: u64,
    /// Total number of $MFT records walked (in-use and free), a count.
    /// Taken from the loaded $MFT's `max_record`.
    pub total_records: u64,
    /// Number of named file records indexed, a count.
    pub files: u64,
    /// Number of named directory records indexed, a count.
    pub dirs: u64,
    /// Number of records whose name is a reparse point (junction/symlink), a count.
    pub reparse_points: u64,
    /// Records where the base record holds no usable `$FILE_NAME` (needs
    /// attribute-list handling in M0). These records are not counted in
    /// `files` or `dirs`.
    pub no_name_in_base_record: u64,
    /// Sum of name lengths across all named records, in UTF-16 code units.
    pub name_utf16_units_total: u64,
    /// Longest single name encountered, in UTF-16 code units.
    pub max_name_utf16_units: u64,
    /// Sanity check that `reference_number()` carries a sequence value:
    /// the number of named records whose reference number has any of its
    /// upper 16 bits set.
    pub frn_sequence_nonzero: u64,
    /// Peak working set of the process, in bytes, sampled once the record
    /// walk has finished.
    pub peak_working_set_bytes: u64,
}
65
66impl SpikeStats {
67    /// Mean name length across named records, in UTF-16 code units.
68    #[must_use]
69    pub fn avg_name_utf16_units(&self) -> f64 {
70        let named = (self.files + self.dirs).max(1);
71        self.name_utf16_units_total as f64 / named as f64
72    }
73}
74
75/// Pick the display name: prefer Win32 / Win32+DOS
76/// namespaces, fall back to POSIX, ignore DOS-only short names. Unlike
77/// ntfs-reader's `get_best_file_name`, reparse-point names are kept —
78/// junctions and symlinks are indexed as plain entries.
79pub(crate) fn pick_name(file: &NtfsFile) -> Option<NtfsFileName> {
80    let mut best: Option<NtfsFileName> = None;
81    file.attributes(|att| {
82        if att.header.type_id != NtfsAttributeType::FileName as u32 {
83            return;
84        }
85        let Some(name) = att.as_name() else { return };
86        let ns = name.header.namespace;
87        let win32 =
88            ns == NtfsFileNamespace::Win32 as u8 || ns == NtfsFileNamespace::Win32AndDos as u8;
89        if win32 || (ns == NtfsFileNamespace::Posix as u8 && best.is_none()) {
90            best = Some(name);
91        }
92    });
93    best
94}
95
96/// Scan one volume's $MFT end to end and report measurements.
97/// `drive` is a drive letter spec like `C:`.
98///
99/// # Errors
100///
101/// Returns [`MftError::NotElevated`] when the process lacks the privileges to
102/// open the raw volume, or [`MftError::Ntfs`] if opening the volume or
103/// reading the $MFT fails.
104pub fn spike_scan(drive: &str) -> Result<SpikeStats, MftError> {
105    let volume_path = format!(r"\\.\{}", drive.trim_end_matches(['\\', '/']));
106    let mut stats = SpikeStats {
107        volume: drive.to_string(),
108        ..Default::default()
109    };
110
111    let t0 = Instant::now();
112    let volume = Volume::new(&volume_path).map_err(|e| match e {
113        NtfsReaderError::ElevationError => MftError::NotElevated,
114        other => MftError::Ntfs(other),
115    })?;
116    stats.elapsed_volume_open_ms = t0.elapsed().as_millis() as u64;
117
118    let t1 = Instant::now();
119    let mft = Mft::new(volume)?;
120    stats.elapsed_mft_load_ms = t1.elapsed().as_millis() as u64;
121    stats.mft_bytes = mft.data.len() as u64;
122    stats.total_records = mft.max_record;
123
124    let t2 = Instant::now();
125    let mut std_info_seen = 0u64;
126    for file in mft.files() {
127        let Some(name) = pick_name(&file) else {
128            stats.no_name_in_base_record += 1;
129            continue;
130        };
131
132        let len = name.header.name_length as u64;
133        stats.name_utf16_units_total += len;
134        stats.max_name_utf16_units = stats.max_name_utf16_units.max(len);
135
136        if file.is_directory() {
137            stats.dirs += 1;
138        } else {
139            stats.files += 1;
140        }
141        if name.is_reparse_point() {
142            stats.reparse_points += 1;
143        }
144        if file.reference_number() >> 48 != 0 {
145            stats.frn_sequence_nonzero += 1;
146        }
147
148        // Touch $STANDARD_INFORMATION and $DATA the way the real indexer will,
149        // so iteration cost is representative.
150        file.attributes(|att| {
151            if att.header.type_id == NtfsAttributeType::StandardInformation as u32
152                && att.as_standard_info().is_some()
153            {
154                std_info_seen += 1;
155            }
156        });
157    }
158    // Keep the optimizer from dropping the attribute walk.
159    std::hint::black_box(std_info_seen);
160    stats.elapsed_iterate_ms = t2.elapsed().as_millis() as u64;
161    stats.peak_working_set_bytes = peak_working_set();
162
163    Ok(stats)
164}
165
166/// Full initial scan: read the volume's $MFT and build the in-memory index.
167/// `drive` is a drive letter spec like `C:`.
168///
169/// # Errors
170///
171/// Returns [`MftError::NotElevated`] when the process lacks the privileges to
172/// open the raw volume, or [`MftError::Ntfs`] if opening the volume or
173/// reading the $MFT fails.
174pub fn scan_volume_reference(drive: &str) -> Result<(VolumeIndex, ScanStats), MftError> {
175    use ntfs_reader::api::ROOT_RECORD;
176
177    const FILE_ATTRIBUTE_HIDDEN: u32 = 0x2;
178    const FILE_ATTRIBUTE_SYSTEM: u32 = 0x4;
179    const FILE_ATTRIBUTE_REPARSE_POINT: u32 = 0x400;
180
181    let drive = drive.trim_end_matches(['\\', '/']);
182    let volume_path = format!(r"\\.\{drive}");
183    let mut stats = ScanStats {
184        volume: drive.to_string(),
185        ..Default::default()
186    };
187
188    let t0 = Instant::now();
189    let volume = Volume::new(&volume_path).map_err(|e| match e {
190        NtfsReaderError::ElevationError => MftError::NotElevated,
191        other => MftError::Ntfs(other),
192    })?;
193    let t1 = Instant::now();
194    let mft = Mft::new(volume)?;
195    stats.elapsed_mft_load_ms = t1.elapsed().as_millis() as u64;
196    stats.mft_bytes = mft.data.len() as u64;
197
198    let mut b = VolumeIndexBuilder::new(drive, ROOT_RECORD);
199    for file in mft.files() {
200        // files() yields extension records too (no base_reference filter in
201        // ntfs-reader). They are parts of other files; indexing them would
202        // duplicate every fragmented file that keeps its $FILE_NAME in an
203        // extension record — skip, like the streaming scanner does.
204        if { file.header.base_reference } & 0x0000_FFFF_FFFF_FFFF != 0 {
205            stats.extension_records += 1;
206            continue;
207        }
208        // Names of heavily fragmented files live in extension records via
209        // $ATTRIBUTE_LIST — fall back to ntfs-reader's resolver for those
210        // (~4% of records on a real C:).
211        let Some(name) = pick_name(&file).or_else(|| file.get_best_file_name(&mft)) else {
212            stats.skipped_no_name += 1;
213            continue;
214        };
215        // Copy fields out of the packed structs before borrowing.
216        let name_data = name.data;
217        let name_len = name.header.name_length as usize;
218        let parent_frn = name.header.parent_directory_reference;
219
220        let mut size = 0u64;
221        let mut mtime = 0i64;
222        // Attribute flags in $FILE_NAME are updated lazily by NTFS; the
223        // authoritative copy lives in $STANDARD_INFORMATION.
224        let mut is_reparse = false;
225        let mut is_hidden = false;
226        let mut is_system = false;
227        file.attributes(|att| {
228            if att.header.type_id == NtfsAttributeType::StandardInformation as u32 {
229                if let Some(si) = att.as_standard_info() {
230                    mtime = si.modification_time as i64;
231                    is_reparse = si.file_attributes & FILE_ATTRIBUTE_REPARSE_POINT != 0;
232                    is_hidden = si.file_attributes & FILE_ATTRIBUTE_HIDDEN != 0;
233                    is_system = si.file_attributes & FILE_ATTRIBUTE_SYSTEM != 0;
234                }
235            } else if att.header.type_id == NtfsAttributeType::Data as u32 {
236                if att.header.is_non_resident == 0 {
237                    if let Some(h) = att.resident_header() {
238                        size = h.value_length as u64;
239                    }
240                } else if let Some(h) = att.nonresident_header() {
241                    size = h.data_size;
242                }
243            }
244        });
245
246        if file.is_directory() {
247            stats.dirs += 1;
248        } else {
249            stats.files += 1;
250        }
251        b.push(RawEntry {
252            parent_frn: Frn(parent_frn),
253            frn: Frn(file.reference_number()),
254            name_utf16: &name_data[..name_len],
255            is_dir: file.is_directory(),
256            is_reparse,
257            is_hidden,
258            is_system,
259            size,
260            mtime,
261        });
262    }
263
264    let idx = b.finish();
265    stats.elapsed_total_ms = t0.elapsed().as_millis() as u64;
266    stats.peak_working_set_bytes = peak_working_set();
267    Ok((idx, stats))
268}
269
270/// Peak working set of the current process, in bytes (0 if the query fails).
271#[must_use]
272pub fn peak_working_set() -> u64 {
273    memory_counters().map_or(0, |c| c.PeakWorkingSetSize as u64)
274}
275
276/// Current working set — the steady-state number the RAM gate cares about
277/// (the peak includes transient scan buffers).
278#[must_use]
279pub fn current_working_set() -> u64 {
280    memory_counters().map_or(0, |c| c.WorkingSetSize as u64)
281}
282
283fn memory_counters() -> Option<windows_sys::Win32::System::ProcessStatus::PROCESS_MEMORY_COUNTERS> {
284    use windows_sys::Win32::System::ProcessStatus::{
285        GetProcessMemoryInfo, PROCESS_MEMORY_COUNTERS,
286    };
287    use windows_sys::Win32::System::Threading::GetCurrentProcess;
288
289    unsafe {
290        let mut counters: PROCESS_MEMORY_COUNTERS = std::mem::zeroed();
291        counters.cb = size_of::<PROCESS_MEMORY_COUNTERS>() as u32;
292        let ok = GetProcessMemoryInfo(
293            GetCurrentProcess(),
294            &raw mut counters,
295            size_of::<PROCESS_MEMORY_COUNTERS>() as u32,
296        );
297        (ok != 0).then_some(counters)
298    }
299}