Skip to main content

fmf_core/
diag.rs

1//! Process-wide diagnostics: capture every WARN+ tracing event and panic.
2//!
3//! Each one ends up in (a) the rolling log file, (b) a global ring buffer
4//! surfaced through `MetricsSnapshot` → the F12 panel and `fmf stats`, and
5//! (c) any registered sinks (the FFI forwards them as `ENGINE_ERROR` events
6//! to the UI).
7//! The layer responsible for the "don't go silent" arm of
8//! "don't crash / don't hang / don't go silent".
9
10use std::collections::VecDeque;
11use std::sync::atomic::{AtomicU64, Ordering};
12use std::sync::{Arc, Once, OnceLock};
13use std::time::Instant;
14
15use parking_lot::Mutex;
16use serde::Serialize;
17
18/// Severity class of a captured diagnostic event, in ascending order.
19#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
20#[serde(rename_all = "lowercase")]
21pub enum Severity {
22    /// A degraded path that recovered via fallback (the `degrade!` form).
23    Warn,
24    /// A handled failure surfaced to the operator.
25    Error,
26    /// A panic captured by the `install_panic_hook` backtrace path.
27    Panic,
28}
29
30impl Severity {
31    /// Numeric form carried in the POD FFI event payload.
32    #[must_use]
33    pub const fn as_u64(self) -> u64 {
34        match self {
35            Self::Warn => 1,
36            Self::Error => 2,
37            Self::Panic => 3,
38        }
39    }
40}
41
42/// One captured diagnostic event: the unit stored in the ring and fanned out
43/// to sinks.
44#[derive(Clone, Debug, Serialize)]
45pub struct ErrorEvent {
46    /// Monotonic sequence number, assigned per event from process start (1-based).
47    pub seq: u64,
48    /// Milliseconds since the first diagnostics call (process uptime, ms).
49    pub uptime_ms: u64,
50    /// Severity class of this event.
51    pub severity: Severity,
52    /// tracing target (module path) — "where".
53    pub area: String,
54    /// Volume the event pertains to (e.g. "C:"), when known.
55    pub volume: Option<String>,
56    /// Human-readable message — "what".
57    pub message: String,
58}
59
60const RING_CAP: usize = 128;
61
62static START: OnceLock<Instant> = OnceLock::new();
63static SEQ: AtomicU64 = AtomicU64::new(1);
64static RING: Mutex<VecDeque<ErrorEvent>> = Mutex::new(VecDeque::new());
65static SINKS: Mutex<Vec<(u64, Sink)>> = Mutex::new(Vec::new());
66static SINK_IDS: AtomicU64 = AtomicU64::new(1);
67
68type Sink = Arc<dyn Fn(&ErrorEvent) + Send + Sync>;
69
70/// Register a fan-out target for new error events; dropping the guard
71/// unregisters it (the FFI ties this to the engine handle's lifetime).
72pub fn register_sink(sink: Sink) -> SinkGuard {
73    let id = SINK_IDS.fetch_add(1, Ordering::Relaxed);
74    SINKS.lock().push((id, sink));
75    SinkGuard(id)
76}
77
78/// Lifetime handle for a registered sink; dropping it unregisters the sink.
79pub struct SinkGuard(u64);
80
81impl Drop for SinkGuard {
82    fn drop(&mut self) {
83        SINKS.lock().retain(|(id, _)| *id != self.0);
84    }
85}
86
87/// Record one diagnostic event (ring + sinks). Normally reached via the
88/// tracing layer rather than called directly.
89pub fn record(severity: Severity, area: &str, volume: Option<String>, message: String) {
90    let ev = ErrorEvent {
91        seq: SEQ.fetch_add(1, Ordering::Relaxed),
92        uptime_ms: START.get_or_init(Instant::now).elapsed().as_millis() as u64,
93        severity,
94        area: area.to_string(),
95        volume,
96        message,
97    };
98    {
99        let mut ring = RING.lock();
100        if ring.len() == RING_CAP {
101            ring.pop_front();
102        }
103        ring.push_back(ev.clone());
104    }
105    // Snapshot the sinks first: a sink may log (or register) and must not
106    // re-enter the lock.
107    let sinks: Vec<Sink> = SINKS.lock().iter().map(|(_, s)| s.clone()).collect();
108    for s in sinks {
109        s(&ev);
110    }
111}
112
113/// Snapshot of the diagnostics ring (oldest first), capped at the ring size.
114pub fn recent_errors() -> Vec<ErrorEvent> {
115    RING.lock().iter().cloned().collect()
116}
117
118// ── tracing layer ───────────────────────────────────────────────────────
119
120/// Routes WARN/ERROR tracing events into the diagnostics ring + sinks.
121/// Events with `target: "panic"` are classified as panics.
122pub struct DiagLayer;
123
124struct FieldGrab {
125    message: String,
126    volume: Option<String>,
127}
128
129impl tracing::field::Visit for FieldGrab {
130    fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
131        match field.name() {
132            "message" => self.message = format!("{value:?}"),
133            "volume" => self.volume = Some(format!("{value:?}").trim_matches('"').to_string()),
134            _ => {}
135        }
136    }
137
138    fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
139        match field.name() {
140            "message" => self.message = value.to_string(),
141            "volume" => self.volume = Some(value.to_string()),
142            _ => {}
143        }
144    }
145}
146
147impl<S: tracing::Subscriber> tracing_subscriber::Layer<S> for DiagLayer {
148    fn on_event(
149        &self,
150        event: &tracing::Event<'_>,
151        _ctx: tracing_subscriber::layer::Context<'_, S>,
152    ) {
153        let level = *event.metadata().level();
154        if level > tracing::Level::WARN {
155            return; // Level orders ERROR < WARN < INFO …
156        }
157        let mut grab = FieldGrab {
158            message: String::new(),
159            volume: None,
160        };
161        event.record(&mut grab);
162        let target = event.metadata().target();
163        let severity = if target == "panic" {
164            Severity::Panic
165        } else if level == tracing::Level::ERROR {
166            Severity::Error
167        } else {
168            Severity::Warn
169        };
170        record(severity, target, grab.volume, grab.message);
171    }
172}
173
174// ── panic hook & logging bootstrap ──────────────────────────────────────
175
176/// Route every panic (any thread) through tracing with a backtrace, so it
177/// reaches the log file, the ring and the UI. Idempotent.
178pub fn install_panic_hook() {
179    static ONCE: Once = Once::new();
180    ONCE.call_once(|| {
181        let previous = std::panic::take_hook();
182        std::panic::set_hook(Box::new(move |info| {
183            let payload = info
184                .payload()
185                .downcast_ref::<&str>()
186                .map(ToString::to_string)
187                .or_else(|| info.payload().downcast_ref::<String>().cloned())
188                .unwrap_or_else(|| "<non-string panic payload>".to_string());
189            let location = info.location().map_or_else(
190                || "<unknown>".to_string(),
191                |l| format!("{}:{}", l.file(), l.line()),
192            );
193            let backtrace = std::backtrace::Backtrace::force_capture();
194            tracing::error!(
195                target: "panic",
196                "panic at {location}: {payload}\n{backtrace}"
197            );
198            previous(info);
199        }));
200    });
201}
202
203/// Resolves the engine log directory.
204///
205/// An explicit override (config key, CLI flag) wins; otherwise the log sits in
206/// a `logs` subdir of `index_dir` — the location the caller already chose to
207/// write the index, so it shares the index's writability and pollution domain
208/// (portable → `<exe>\data\index\logs`, scope → `%LOCALAPPDATA%\…\index\logs`).
209/// There is deliberately **no machine-wide default**: falling back to
210/// `%ProgramData%` dirtied the machine for non-elevated callers and panicked
211/// when that dir existed but was not writable. The machine service does not rely
212/// on this fallback — it passes its own `%ProgramData%\find-my-files\logs`
213/// explicitly. The one implementation of this rule — every entry point resolves
214/// through here (ADR-0018; the rule's prose lives in docs/ARCHITECTURE.md).
215#[must_use]
216pub fn resolve_log_dir(
217    explicit: Option<std::path::PathBuf>,
218    index_dir: &std::path::Path,
219) -> std::path::PathBuf {
220    explicit.unwrap_or_else(|| index_dir.join("logs"))
221}
222
223/// The one diagnostics bootstrap: file/stderr logging + panic capture +
224/// diag-ring wiring, idempotent — FFI `fmf_engine_create`, the service
225/// entry points and the CLI all call exactly this (ADR-0018).
226pub fn init_diag(log_dir: Option<&std::path::Path>, level: &str) {
227    init_logging(log_dir, level);
228    install_panic_hook();
229}
230
231/// Full error cause chain as one line.
232///
233/// The single implementation behind the FFI `fmf_last_error` detail and the
234/// pipe error-response payload. Capped at 4 KiB (a pathological chain of
235/// nested I/O errors must not balloon a frame); the cap is part of the
236/// contract's error-detail spec.
237pub fn error_chain(e: &dyn std::error::Error) -> String {
238    const CAP: usize = 4096;
239    let mut s = e.to_string();
240    let mut src = e.source();
241    while let Some(cause) = src {
242        s.push_str(" — caused by: ");
243        s.push_str(&cause.to_string());
244        src = cause.source();
245    }
246    if s.len() > CAP {
247        let mut end = CAP;
248        while !s.is_char_boundary(end) {
249            end -= 1;
250        }
251        s.truncate(end);
252        s.push('…');
253    }
254    s
255}
256
257/// The one way to record a degraded path (one that recovers via fallback):
258/// warn and counter increment done atomically (the syntactic form of
259/// "don't go silent" — ADR-0018).
260///
261/// `rg degrade!` enumerates every degraded path. Batch paths (scan internals)
262/// return degradation via stats fields and the worker layer maps them to
263/// counters in one place (don't scatter the macro across the hot path).
264#[macro_export]
265macro_rules! degrade {
266    ($counter:expr, $($arg:tt)*) => {{
267        $crate::metrics::Counters::bump(&$counter);
268        tracing::warn!($($arg)*);
269    }};
270}
271
272static LOG_GUARD: OnceLock<tracing_appender::non_blocking::WorkerGuard> = OnceLock::new();
273
274/// Initialize process-wide logging once.
275///
276/// `log_dir = Some(..)` writes a daily rolling `engine.log`; `None` writes to
277/// stderr (CLI). The `FMF_LOG` env var overrides `level`. Safe to call
278/// repeatedly — later calls no-op.
279pub fn init_logging(log_dir: Option<&std::path::Path>, level: &str) {
280    use tracing_subscriber::layer::SubscriberExt;
281    use tracing_subscriber::util::SubscriberInitExt;
282
283    START.get_or_init(Instant::now);
284    let filter = tracing_subscriber::EnvFilter::try_from_env("FMF_LOG")
285        .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(level));
286    let registry = tracing_subscriber::registry().with(filter).with(DiagLayer);
287
288    match log_dir {
289        Some(dir) => {
290            if let Err(e) = std::fs::create_dir_all(dir) {
291                // Last resort: stderr, and leave a breadcrumb in the ring.
292                record(
293                    Severity::Error,
294                    "diag",
295                    None,
296                    format!("cannot create log dir {}: {e}", dir.display()),
297                );
298                let _ = registry
299                    .with(
300                        tracing_subscriber::fmt::layer()
301                            .with_ansi(false)
302                            .with_writer(std::io::stderr),
303                    )
304                    .try_init();
305                return;
306            }
307            let appender = tracing_appender::rolling::daily(dir, "engine.log");
308            let (writer, guard) = tracing_appender::non_blocking(appender);
309            let _ = registry
310                .with(
311                    tracing_subscriber::fmt::layer()
312                        .with_ansi(false)
313                        .with_writer(writer),
314                )
315                .try_init();
316            let _ = LOG_GUARD.set(guard);
317        }
318        None => {
319            let _ = registry
320                .with(
321                    tracing_subscriber::fmt::layer()
322                        .with_ansi(false)
323                        .with_writer(std::io::stderr),
324                )
325                .try_init();
326        }
327    }
328}
329
330#[cfg(test)]
331mod tests {
332    use super::*;
333
334    /// Single test for the global pipeline (ring/sinks/hook are process-wide
335    /// state; parallel tests would interleave).
336    #[test]
337    fn ring_sinks_layer_and_panic_hook() {
338        use tracing_subscriber::layer::SubscriberExt;
339
340        // Layer → ring + sink.
341        let seen = Arc::new(Mutex::new(Vec::<ErrorEvent>::new()));
342        let seen2 = seen.clone();
343        let guard = register_sink(Arc::new(move |ev| seen2.lock().push(ev.clone())));
344
345        let subscriber = tracing_subscriber::registry().with(DiagLayer);
346        tracing::subscriber::with_default(subscriber, || {
347            tracing::warn!(volume = "C:", "snapshot stale");
348            tracing::error!("boom");
349            tracing::info!("ignored");
350
351            // The hook fires during unwinding on this thread (with_default
352            // is thread-local, so a spawned thread would miss the layer).
353            install_panic_hook();
354            let _ = std::panic::catch_unwind(|| panic!("test panic"));
355        });
356
357        let events = seen.lock().clone();
358        assert!(events.len() >= 3, "expected 3+ events, got {events:?}");
359        let warn = events
360            .iter()
361            .find(|e| e.severity == Severity::Warn)
362            .unwrap();
363        assert_eq!(warn.volume.as_deref(), Some("C:"));
364        assert!(warn.message.contains("snapshot stale"));
365        assert!(events.iter().any(|e| e.severity == Severity::Error));
366        let panic_ev = events
367            .iter()
368            .find(|e| e.severity == Severity::Panic)
369            .expect("panic captured");
370        assert!(panic_ev.message.contains("test panic"));
371        assert!(panic_ev.message.contains("diag.rs") || panic_ev.message.contains("backtrace"));
372
373        // Ring kept them too, with monotonically increasing seq.
374        let ring = recent_errors();
375        assert!(ring.len() >= 3);
376        assert!(ring.windows(2).all(|w| w[0].seq < w[1].seq));
377
378        // Dropping the guard unregisters the sink.
379        drop(guard);
380        let before = seen.lock().len();
381        record(Severity::Warn, "t", None, "after-drop".into());
382        assert_eq!(seen.lock().len(), before);
383    }
384
385    #[test]
386    fn log_dir_defaults_next_to_the_index_never_machine_wide() {
387        use std::path::{Path, PathBuf};
388
389        let index = Path::new("some").join("writable").join("index");
390
391        // Default: the log co-locates under the caller's index dir, so it
392        // inherits the index's writability and pollution domain.
393        assert_eq!(resolve_log_dir(None, &index), index.join("logs"));
394
395        // An explicit override (config/CLI) always wins.
396        let explicit = PathBuf::from("elsewhere").join("logs");
397        assert_eq!(resolve_log_dir(Some(explicit.clone()), &index), explicit);
398
399        // Never falls back to a hard-coded machine-wide ProgramData path
400        // (that dirtied the machine for non-elevated callers and panicked when
401        // unwritable).
402        assert!(
403            !resolve_log_dir(None, &index)
404                .to_string_lossy()
405                .contains("ProgramData")
406        );
407    }
408}