diff --git a/Cargo.lock b/Cargo.lock index 1a4c45ea9a..b9a26a0bc2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3111,8 +3111,7 @@ dependencies = [ [[package]] name = "libdd-libunwind-sys" version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eecc52581f5ccbcced4e2264fb7dd95112dba55f47d217c859420dd64e62c43a" +source = "git+https://github.com/DataDog/libdatadog-libunwind?branch=gyuheon0h%2Fremote-unwind-api#5e54c7a99830648e7cf53e9e49c4329b75766d15" dependencies = [ "cc", "libc", diff --git a/bin_tests/src/modes/behavior.rs b/bin_tests/src/modes/behavior.rs index 969f8dc59f..8ed10d0361 100644 --- a/bin_tests/src/modes/behavior.rs +++ b/bin_tests/src/modes/behavior.rs @@ -138,6 +138,7 @@ pub fn get_behavior(mode_str: &str) -> Box { "panic_hook_string" => Box::new(test_014_panic_hook_string::Test), "panic_hook_unknown_type" => Box::new(test_015_panic_hook_unknown_type::Test), "errno_preservation" => Box::new(test_016_errno_preservation::Test), + "multi_thread_collection" => Box::new(test_017_multi_thread_collection::Test), "runtime_preload_logger" => Box::new(test_000_donothing::Test), _ => panic!("Unknown mode: {mode_str}"), } diff --git a/bin_tests/src/modes/unix/mod.rs b/bin_tests/src/modes/unix/mod.rs index 73638943c2..0edfe9d72b 100644 --- a/bin_tests/src/modes/unix/mod.rs +++ b/bin_tests/src/modes/unix/mod.rs @@ -17,3 +17,4 @@ pub mod test_013_panic_hook_after_fork; pub mod test_014_panic_hook_string; pub mod test_015_panic_hook_unknown_type; pub mod test_016_errno_preservation; +pub mod test_017_multi_thread_collection; diff --git a/bin_tests/src/modes/unix/test_017_multi_thread_collection.rs b/bin_tests/src/modes/unix/test_017_multi_thread_collection.rs new file mode 100644 index 0000000000..a4ccb71c3a --- /dev/null +++ b/bin_tests/src/modes/unix/test_017_multi_thread_collection.rs @@ -0,0 +1,94 @@ +// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +//! Tests that the crashtracker collects stack information for all threads, not +//! just the crashing thread. +//! +//! Two named background threads are spawned with distinct, recognisable call +//! chains so that the captured stacks are visually interesting and clearly +//! distinguishable in the crash report. +//! +//! ct_worker_0: worker_entry_0 -> wait_for_work_0 -> thread::sleep +//! ct_worker_1: worker_entry_1 -> wait_for_work_1 -> thread::sleep +//! +//! All intermediate functions are #[inline(never)] so they appear as distinct +//! frames in the libunwind output. + +use crate::modes::behavior::Behavior; +use libdd_crashtracker::CrashtrackerConfiguration; +use std::path::Path; +use std::sync::{Arc, Barrier}; +use std::thread; +use std::time::Duration; + +pub struct Test; + +#[inline(never)] +fn wait_for_work_0() { + let _ = std::hint::black_box(10u64); + thread::sleep(Duration::from_secs(300)); +} + +#[inline(never)] +fn worker_entry_0() { + let _ = std::hint::black_box(20u64); + wait_for_work_0(); +} + +#[inline(never)] +fn wait_for_work_1() { + let _ = std::hint::black_box(11u64); + thread::sleep(Duration::from_secs(300)); +} + +#[inline(never)] +fn worker_entry_1() { + let _ = std::hint::black_box(21u64); + wait_for_work_1(); +} + +impl Behavior for Test { + fn setup( + &self, + _output_dir: &Path, + config: &mut CrashtrackerConfiguration, + ) -> anyhow::Result<()> { + config.set_collect_all_threads(true); + config.set_max_threads(32); + Ok(()) + } + + fn pre(&self, _output_dir: &Path) -> anyhow::Result<()> { + Ok(()) + } + + /// Spawn two named worker threads with distinct call chains, then leak the + /// handles so the threads outlive post() and are still live when the crash fires. + fn post(&self, _output_dir: &Path) -> anyhow::Result<()> { + // 2 workers + 1 for this (main) thread. + let barrier = Arc::new(Barrier::new(3)); + + let b0 = Arc::clone(&barrier); + let h0 = thread::Builder::new() + .name("ct_worker_0".to_string()) + .spawn(move || { + b0.wait(); + worker_entry_0(); + })?; + + let b1 = Arc::clone(&barrier); + let h1 = thread::Builder::new() + .name("ct_worker_1".to_string()) + .spawn(move || { + b1.wait(); + worker_entry_1(); + })?; + + barrier.wait(); + thread::sleep(Duration::from_millis(20)); + + std::mem::forget(h0); + std::mem::forget(h1); + Ok(()) + } +} diff --git a/bin_tests/src/test_types.rs b/bin_tests/src/test_types.rs index 1c47aac864..7961d8eb57 100644 --- a/bin_tests/src/test_types.rs +++ b/bin_tests/src/test_types.rs @@ -20,6 +20,7 @@ pub enum TestMode { RuntimeCallbackFrameInvalidUtf8, RuntimePreloadLogger, ErrnoPreservation, + MultiThreadCollection, } impl TestMode { @@ -41,6 +42,7 @@ impl TestMode { Self::RuntimeCallbackFrameInvalidUtf8 => "runtime_callback_frame_invalid_utf8", Self::RuntimePreloadLogger => "runtime_preload_logger", Self::ErrnoPreservation => "errno_preservation", + Self::MultiThreadCollection => "multi_thread_collection", } } @@ -62,6 +64,7 @@ impl TestMode { Self::RuntimeCallbackFrameInvalidUtf8, Self::RuntimePreloadLogger, Self::ErrnoPreservation, + Self::MultiThreadCollection, ] } } @@ -92,6 +95,7 @@ impl std::str::FromStr for TestMode { "runtime_callback_frame_invalid_utf8" => Ok(Self::RuntimeCallbackFrameInvalidUtf8), "runtime_preload_logger" => Ok(Self::RuntimePreloadLogger), "errno_preservation" => Ok(Self::ErrnoPreservation), + "multi_thread_collection" => Ok(Self::MultiThreadCollection), _ => Err(format!("Unknown test mode: {}", s)), } } diff --git a/bin_tests/tests/crashtracker_bin_test.rs b/bin_tests/tests/crashtracker_bin_test.rs index 700c87f129..c751ce9939 100644 --- a/bin_tests/tests/crashtracker_bin_test.rs +++ b/bin_tests/tests/crashtracker_bin_test.rs @@ -188,6 +188,135 @@ fn test_crash_tracking_bin_runtime_callback_frame() { run_crash_test_with_artifacts(&config, &artifacts_map, &artifacts, validator).unwrap(); } +/// Tests that when `collect_all_threads` is enabled, the crash report contains +/// entries in `error.threads` for background threads beyond the crashing thread. +/// +/// The behavior (test_017_multi_thread_collection.rs) enables `collect_all_threads`, +/// spawns two named sleeping worker threads in `post()`, and then crashes the main thread. +/// +/// Thread collection now happens in the receiver process using libunwind remote unwinding +/// via ptrace (_UPT_create / unw_init_remote / unw_step_remote). The parent process stays +/// alive until the receiver completes, guaranteeing threads are valid ptrace targets. +/// +/// We verify: +/// - `error.threads` is non-empty. +/// - Each thread entry is well-formed: `crashed=false`, `name`, and `stack` present. +/// - None of the additional threads are marked as crashed (the crashing thread is in +/// `error.stack`, not `error.threads`). +/// - Both worker threads are present by name (ct_worker_0, ct_worker_1). +/// - Each worker has a multi-frame stack trace including their entry function, confirming that +/// libunwind remote unwinding produced a full call chain rather than a single syscall frame. +#[test] +#[cfg(target_os = "linux")] +#[cfg_attr(miri, ignore)] +fn test_crash_tracking_multi_thread_collection() { + let config = CrashTestConfig::new( + BuildProfile::Release, + TestMode::MultiThreadCollection, + CrashType::NullDeref, + ); + let artifacts = StandardArtifacts::new(config.profile); + let artifacts_map = fetch_built_artifacts(&artifacts.as_slice()).unwrap(); + + let validator: ValidatorFn = Box::new(|payload, _fixtures| { + let error = &payload["error"]; + // assert!( + // false, + // "{}", + // serde_json::to_string_pretty(error).unwrap_or_default() + // ); + let threads = error["threads"] + .as_array() + .expect("error.threads should be a JSON array"); + + let thread_names: Vec<&str> = threads + .iter() + .map(|t| t["name"].as_str().unwrap_or("")) + .collect(); + + assert!( + !threads.is_empty(), + "error.threads should be non-empty when collect_all_threads is enabled; \ + got payload: {}", + serde_json::to_string_pretty(payload).unwrap_or_default() + ); + + // Every thread entry must be structurally valid and non-crashing + for thread in threads { + assert!( + thread["name"].is_string(), + "thread entry missing 'name': {thread:?}" + ); + assert!( + thread["crashed"].is_boolean(), + "thread entry missing 'crashed': {thread:?}" + ); + assert!( + thread["stack"].is_object(), + "thread entry missing 'stack': {thread:?}" + ); + assert!( + !thread["crashed"].as_bool().unwrap_or(true), + "threads in error.threads must have crashed=false: {thread:?}" + ); + } + + // Both named workers must be present; the behavior spawns exactly two + for expected in ["ct_worker_0", "ct_worker_1"] { + assert!( + thread_names.contains(&expected), + "Expected worker thread '{expected}' in error.threads; \ + got: {thread_names:?}" + ); + } + + // Each worker must have a multi-frame stack trace. + // + // The workers sleep in thread::sleep -> wait_for_work_N -> worker_entry_N. + // With libunwind remote unwinding, we expect the full call chain rather than + // a single syscall frame. We verify: + // - More than one frame was captured. + // - At least one frame contains the worker's entry function by name. + for expected in ["ct_worker_0", "ct_worker_1"] { + let worker = threads + .iter() + .find(|t| t["name"].as_str() == Some(expected)) + .unwrap_or_else(|| panic!("{expected} should be in threads")); + + let frames = worker["stack"]["frames"] + .as_array() + .unwrap_or_else(|| panic!("{expected} stack.frames should be an array")); + + assert!( + frames.len() > 1, + "{expected} should have a multi-frame stack trace from remote libunwind; \ + got {} frame(s): {frames:?}", + frames.len() + ); + + let entry_fn = if expected == "ct_worker_0" { + "worker_entry_0" + } else { + "worker_entry_1" + }; + let has_entry_frame = frames.iter().any(|f| { + f["function"] + .as_str() + .map(|name| name.contains(entry_fn)) + .unwrap_or(false) + }); + assert!( + has_entry_frame, + "{expected} stack should contain a frame for '{entry_fn}' but got: {frames:?}" + ); + } + + Ok(()) + }); + + run_crash_test_with_artifacts(&config, &artifacts_map, &artifacts, validator).unwrap(); +} + #[test] #[cfg(target_os = "linux")] #[cfg_attr(miri, ignore)] diff --git a/libdd-crashtracker/Cargo.toml b/libdd-crashtracker/Cargo.toml index 876b8c6de7..cf52ac8820 100644 --- a/libdd-crashtracker/Cargo.toml +++ b/libdd-crashtracker/Cargo.toml @@ -42,7 +42,7 @@ cxx = ["dep:cxx", "dep:cxx-build"] blazesym = "=0.2.3" [target.'cfg(target_os = "linux")'.dependencies] -libdd-libunwind-sys = { version = "1.0.0" } +libdd-libunwind-sys = { git = "https://github.com/DataDog/libdatadog-libunwind", branch = "gyuheon0h/remote-unwind-api" } [dependencies] anyhow = "1.0" diff --git a/libdd-crashtracker/src/collector/collector_manager.rs b/libdd-crashtracker/src/collector/collector_manager.rs index 3a3f134ea7..767aba5549 100644 --- a/libdd-crashtracker/src/collector/collector_manager.rs +++ b/libdd-crashtracker/src/collector/collector_manager.rs @@ -57,8 +57,8 @@ impl Collector { tid, ); } - pid if pid > 0 => Ok(Self { - handle: ProcessHandle::new(receiver.handle.uds_fd, Some(pid)), + child_pid if child_pid > 0 => Ok(Self { + handle: ProcessHandle::new(receiver.handle.uds_fd, Some(child_pid)), }), code => { // Error diff --git a/libdd-crashtracker/src/collector/crash_handler.rs b/libdd-crashtracker/src/collector/crash_handler.rs index 2febb153d8..22d7922cf3 100644 --- a/libdd-crashtracker/src/collector/crash_handler.rs +++ b/libdd-crashtracker/src/collector/crash_handler.rs @@ -303,6 +303,18 @@ fn handle_posix_signal_impl( let receiver = Receiver::from_crashtracker_config(config)?; + // Enable ptrace permissions for receiver if multi-thread collection is enabled + #[cfg(target_os = "linux")] + if config.collect_all_threads() { + if let Some(receiver_pid) = receiver.handle.pid { + // Allow the receiver to ptrace this process for thread context collection + // SAFETY: prctl is async-signal-safe and we're just setting ptrace permissions + unsafe { + libc::prctl(libc::PR_SET_PTRACER, receiver_pid as libc::c_ulong, 0, 0, 0); + } + } + } + let collector = Collector::spawn( &receiver, config, diff --git a/libdd-crashtracker/src/crash_info/errors_intake.rs b/libdd-crashtracker/src/crash_info/errors_intake.rs index 1b5777db93..6520651890 100644 --- a/libdd-crashtracker/src/crash_info/errors_intake.rs +++ b/libdd-crashtracker/src/crash_info/errors_intake.rs @@ -6,7 +6,7 @@ use std::time::SystemTime; use crate::{OsInfo, SigInfo, Ucontext}; use super::{ - build_crash_ping_message, CrashInfo, Experimental, Metadata, ProcInfo, StackTrace, + build_crash_ping_message, CrashInfo, Experimental, Metadata, ProcInfo, StackTrace, ThreadData, TARGET_TRIPLE, }; use anyhow::Context; @@ -253,6 +253,8 @@ pub struct ErrorObject { pub stack: Option, #[serde(skip_serializing_if = "Option::is_none")] pub thread_name: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub threads: Option>, } #[derive(serde::Serialize, Debug)] @@ -430,6 +432,7 @@ impl ErrorsIntakePayload { is_crash: Some(true), source_type: Some("Crashtracking".to_string()), experimental: crash_info.experimental.clone(), + threads: Some(crash_info.error.threads.clone()), }, trace_id: None, ucontext: crash_info.ucontext.clone(), @@ -496,6 +499,7 @@ impl ErrorsIntakePayload { is_crash: Some(false), source_type: Some("Crashtracking".to_string()), experimental: None, + threads: None, }, sig_info: sig_info.cloned(), trace_id: None, diff --git a/libdd-crashtracker/src/receiver/mod.rs b/libdd-crashtracker/src/receiver/mod.rs index 036663f0f5..1265cfca66 100644 --- a/libdd-crashtracker/src/receiver/mod.rs +++ b/libdd-crashtracker/src/receiver/mod.rs @@ -7,6 +7,8 @@ pub use entry_points::{ async_receiver_entry_point_unix_listener, async_receiver_entry_point_unix_socket, get_receiver_unix_socket, receiver_entry_point_stdin, receiver_entry_point_unix_socket, }; +#[cfg(target_os = "linux")] +mod ptrace_collector; mod receive_report; #[cfg(feature = "benchmarking")] diff --git a/libdd-crashtracker/src/receiver/ptrace_collector.rs b/libdd-crashtracker/src/receiver/ptrace_collector.rs new file mode 100644 index 0000000000..2ab98157dc --- /dev/null +++ b/libdd-crashtracker/src/receiver/ptrace_collector.rs @@ -0,0 +1,329 @@ +// Copyright 2026-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +//! Ptrace-based thread context collection with libunwind remote unwinding. +//! This module is compiled for Linux only. +//! +//! This module provides ptrace-based thread context collection that runs in the +//! receiver process. It uses libunwind's remote unwinding APIs to generate full +//! stack traces for all threads in the crashed process. +//! +//! The flow is: +//! 1. Enumerate threads from /proc//task/ +//! 2. Attach to each thread using PTRACE_SEIZE + PTRACE_INTERRUPT (stops the thread) +//! 3. While the thread is stopped, use libunwind remote APIs to unwind the stack: +//! - _UPT_create(tid) create ptrace unwinding state +//! - unw_create_addr_space() create address space with ptrace accessors +//! - unw_init_remote() initialize remote cursor +//! - unw_step_remote() loop walk frames +//! - unw_get_proc_name_remote() resolve symbol names +//! - _UPT_destroy() + cleanup clean up +//! 4. Detach from the thread via PTRACE_DETACH +//! +//! The crashed parent process stays alive (blocked in the signal handler) until +//! receiver.finish() completes. This guarantees the target process remains a valid +//! ptrace target for the entire duration of thread collection. +//! +//! The parent calls prctl(PR_SET_PTRACER, receiver_pid) before forking the collector, +//! which grants the receiver ptrace permission + +use std::ptr; +use std::time::{Duration, Instant}; + +use libdd_libunwind_sys::{ + UnwAddrSpaceT, UnwCursor, UnwWord, _UPT_accessors, _UPT_create, _UPT_destroy, + unw_create_addr_space, unw_destroy_addr_space, unw_get_proc_name_remote, unw_get_reg_remote, + unw_init_remote, unw_step_remote, UNW_REG_FP, UNW_REG_IP, UNW_REG_SP, +}; + +use crate::crash_info::{StackFrame, StackTrace}; + +/// Maximum number of threads to collect contexts for +const MAX_TRACKED_THREADS: usize = 128; + +/// Maximum number of stack frames to capture per thread +const MAX_FRAMES: usize = 512; + +/// A captured thread context containing a full remote stack trace +pub struct CapturedThreadContext { + pub stack_trace: StackTrace, +} + +#[derive(Debug)] +pub enum PtraceError { + /// Failed to enumerate threads from /proc filesystem + Enumeration(std::io::Error), + /// Failed to attach to a thread + Attach(libc::pid_t, i32), + /// Failed to detach from a thread + Detach(libc::pid_t, i32), +} + +impl std::fmt::Display for PtraceError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + PtraceError::Enumeration(e) => write!(f, "Failed to enumerate threads: {}", e), + PtraceError::Attach(tid, errno) => { + write!(f, "Failed to attach to thread {}: errno {}", tid, errno) + } + PtraceError::Detach(tid, errno) => { + write!(f, "Failed to detach from thread {}: errno {}", tid, errno) + } + } + } +} + +impl std::error::Error for PtraceError {} + +/// Enumerate all thread IDs for a given process from /proc//task/ +pub fn enumerate_threads(pid: libc::pid_t) -> Result, PtraceError> { + let task_dir = format!("/proc/{}/task", pid); + let entries = std::fs::read_dir(&task_dir).map_err(PtraceError::Enumeration)?; + + let mut tids = Vec::new(); + for entry in entries { + let entry = entry.map_err(PtraceError::Enumeration)?; + if let Ok(name) = entry.file_name().into_string() { + if let Ok(tid) = name.parse::() { + tids.push(tid); + } + } + } + Ok(tids) +} + +/// Attach to a thread using PTRACE_SEIZE + PTRACE_INTERRUPT, then wait for it to stop. +/// +/// After this call the thread is stopped and ready for register reads and remote unwinding. +fn attach_thread(tid: libc::pid_t) -> Result<(), PtraceError> { + // SAFETY: PTRACE_SEIZE attaches without stopping the thread + let result = unsafe { + libc::ptrace( + libc::PTRACE_SEIZE, + tid as libc::c_long, + ptr::null_mut::(), + ptr::null_mut::(), + ) + }; + if result == -1 { + let errno = unsafe { *libc::__errno_location() }; + return Err(PtraceError::Attach(tid, errno)); + } + + // SAFETY: PTRACE_INTERRUPT delivers a stop to the seized thread + let result = unsafe { + libc::ptrace( + libc::PTRACE_INTERRUPT, + tid as libc::c_long, + ptr::null_mut::(), + ptr::null_mut::(), + ) + }; + if result == -1 { + let errno = unsafe { *libc::__errno_location() }; + let _ = detach_thread(tid); + return Err(PtraceError::Attach(tid, errno)); + } + + // Wait for the stop signal to be delivered + let mut status = 0; + // SAFETY: waitpid with valid tid + unsafe { libc::waitpid(tid, &mut status, 0) }; + + Ok(()) +} + +fn detach_thread(tid: libc::pid_t) -> Result<(), PtraceError> { + // SAFETY: PTRACE_DETACH is valid for a currently-traced thread + let result = unsafe { + libc::ptrace( + libc::PTRACE_DETACH, + tid as libc::c_long, + ptr::null_mut::(), + ptr::null_mut::(), + ) + }; + if result == -1 { + let errno = unsafe { *libc::__errno_location() }; + return Err(PtraceError::Detach(tid, errno)); + } + Ok(()) +} + +/// Capture the full stack trace for a stopped thread using libunwind remote unwinding. +/// +/// The thread must already be stopped (via `attach_thread`) before calling this. +/// The caller is responsible for detaching after this returns. +/// +/// libunwind's ptrace backend (`_UPT_*`) implements the accessor callbacks that +/// libunwind uses to read memory and registers from the target process via ptrace. +/// `unw_create_addr_space` wires those accessors into a remote address space +/// `unw_init_remote` seeds a cursor from the thread's current register state +/// `unw_step_remote` walks each frame, reading the target's stack memory via ptrace +fn unwind_remote_thread( + tid: libc::pid_t, + resolve_frames: crate::StacktraceCollection, +) -> StackTrace { + // SAFETY: _UPT_create allocates a ptrace unwinding context for the given tid. + // The thread must already be stopped via ptrace for this to succeed. + let upt_info = unsafe { _UPT_create(tid) }; + if upt_info.is_null() { + return StackTrace::new_incomplete(); + } + + // SAFETY: _UPT_accessors is a static accessor table provided by libunwind-ptrace. + // byteorder=0 means native byte order. + let addr_space: UnwAddrSpaceT = + unsafe { unw_create_addr_space(&raw const _UPT_accessors as *mut _, 0) }; + if addr_space.is_null() { + unsafe { _UPT_destroy(upt_info) }; + return StackTrace::new_incomplete(); + } + + // SAFETY: cursor is zeroed; unw_init_remote seeds it from the thread's registers + // using ptrace with upt_info as the accessor argument. + let mut cursor: UnwCursor = unsafe { std::mem::zeroed() }; + let ret = unsafe { unw_init_remote(&mut cursor, addr_space, upt_info) }; + if ret != 0 { + unsafe { + _UPT_destroy(upt_info); + unw_destroy_addr_space(addr_space); + } + return StackTrace::new_incomplete(); + } + + let mut frames = Vec::new(); + + for _ in 0..MAX_FRAMES { + let mut ip: UnwWord = 0; + let mut sp: UnwWord = 0; + let mut fp: UnwWord = 0; + + // SAFETY: cursor is initialized; unw_get_reg_remote reads from target via ptrace + if unsafe { unw_get_reg_remote(&mut cursor, UNW_REG_IP, &mut ip) } != 0 || ip == 0 { + break; + } + let _ = unsafe { unw_get_reg_remote(&mut cursor, UNW_REG_SP, &mut sp) }; + let _ = unsafe { unw_get_reg_remote(&mut cursor, UNW_REG_FP, &mut fp) }; + + let mut frame = StackFrame { + ip: Some(format!("0x{:x}", ip)), + sp: Some(format!("0x{:x}", sp)), + module_base_address: None, + symbol_address: None, + build_id: None, + build_id_type: None, + file_type: None, + path: None, + relative_address: None, + column: None, + file: None, + function: None, + line: None, + type_name: None, + mangled_name: None, + comments: vec![], + }; + + // Resolve the function name if symbol resolution is enabled + // We don't care whether it is EnabledWithInprocessSymbols or + // EnabledWithSymbolsInReceiver since this is happening in the receiver + if resolve_frames != crate::StacktraceCollection::Disabled + && resolve_frames != crate::StacktraceCollection::WithoutSymbols + { + let mut name_buf = [0i8; 256]; + let mut offset: UnwWord = 0; + // SAFETY: cursor is valid; unw_get_proc_name_remote reads symbol info via ptrace + if unsafe { + unw_get_proc_name_remote( + &mut cursor, + name_buf.as_mut_ptr(), + name_buf.len(), + &mut offset, + ) + } == 0 + { + // SAFETY: libunwind wrote a null-terminated string into name_buf + let name = unsafe { std::ffi::CStr::from_ptr(name_buf.as_ptr()) }; + if let Ok(s) = name.to_str() { + frame.function = Some(s.to_string()); + } + } + } + + frames.push(frame); + + // SAFETY: cursor is valid + if unsafe { unw_step_remote(&mut cursor) } <= 0 { + break; + } + } + + // SAFETY: cleaning up; these were created above + unsafe { + _UPT_destroy(upt_info); + unw_destroy_addr_space(addr_space); + } + + StackTrace::from_frames(frames, false) +} + +/// Attach to a thread, capture its full stack trace using remote libunwind, then detach. +pub fn capture_thread_context( + _pid: libc::pid_t, + tid: libc::pid_t, + resolve_frames: crate::StacktraceCollection, +) -> Result { + attach_thread(tid)?; + + let stack_trace = unwind_remote_thread(tid, resolve_frames); + + detach_thread(tid)?; + + Ok(CapturedThreadContext { stack_trace }) +} + +/// Stream thread contexts to a callback, one at a time, without intermediate storage. +/// +/// For each non-crashing thread the callback receives the TID and an optional +/// `CapturedThreadContext` (None if attachment or unwinding failed). The callback +/// returns `true` to continue or `false` to stop early. +pub fn stream_thread_contexts( + parent_pid: libc::pid_t, + crashing_tid: libc::pid_t, + max_threads: usize, + timeout: Duration, + resolve_frames: crate::StacktraceCollection, + mut callback: F, +) -> Result<(), PtraceError> +where + F: FnMut(libc::pid_t, Option<&CapturedThreadContext>) -> bool, +{ + let start_time = Instant::now(); + let tids = enumerate_threads(parent_pid)?; + let max_count = max_threads.min(MAX_TRACKED_THREADS); + let mut processed = 0; + + for tid in tids { + if start_time.elapsed() >= timeout { + break; + } + if tid == crashing_tid { + continue; + } + if processed >= max_count { + break; + } + + let context = capture_thread_context(parent_pid, tid, resolve_frames).ok(); + + let should_continue = callback(tid, context.as_ref()); + processed += 1; + + if !should_continue { + break; + } + } + + Ok(()) +} diff --git a/libdd-crashtracker/src/receiver/receive_report.rs b/libdd-crashtracker/src/receiver/receive_report.rs index e796bd58ca..5edbeca166 100644 --- a/libdd-crashtracker/src/receiver/receive_report.rs +++ b/libdd-crashtracker/src/receiver/receive_report.rs @@ -4,7 +4,7 @@ use crate::{ crash_info::{ CrashInfo, CrashInfoBuilder, ErrorKind, SigInfo, Span, StackFrame, TelemetryCrashUploader, - Ucontext, + ThreadData, Ucontext, }, runtime_callback::RuntimeStack, shared::constants::*, @@ -98,6 +98,37 @@ impl From for StackFrame { } } +/// Partial data accumulated while parsing a single thread block. +#[derive(Debug)] +pub(super) struct ThreadInProgress { + crashed: bool, + name: String, + state: Option, + /// Stack frames collected inside the nested STACKTRACE block. + frames: Vec, + /// Whether DD_CRASHTRACK_END_STACKTRACE was received (marks the stack complete). + stack_complete: bool, +} + +impl ThreadInProgress { + fn into_thread_data(self) -> ThreadData { + let mut stack = if self.frames.is_empty() { + StackTrace::new_incomplete() + } else { + StackTrace::from_frames(self.frames, !self.stack_complete) + }; + if self.stack_complete { + let _ = stack.set_complete(); + } + ThreadData { + crashed: self.crashed, + name: self.name, + stack, + state: self.state, + } + } +} + /// The crashtracker collector sends data in blocks. /// This enum tracks which block we're currently in, and, for multi-line blocks, /// collects the partial data until the block is closed and it can be appended @@ -120,6 +151,11 @@ pub(crate) enum StdinState { Waiting, WholeStackTrace, ThreadName(Option), + /// Parsing a thread block: waiting for BEGIN_STACKTRACE or END_THREAD. + /// The Option is None until the first (JSON header) line has been parsed. + Thread(Option), + /// Inside the STACKTRACE block nested within a thread block. + ThreadStackTrace(ThreadInProgress), // StackFrame is always emitted as one stream of all the frames but StackString // may have lines that we need to accumulate depending on runtime (e.g. Python) RuntimeStackFrame(Vec), @@ -321,6 +357,58 @@ fn process_line( StdinState::Ucontext } + StdinState::Thread(None) if line.starts_with(DD_CRASHTRACK_END_THREAD) => { + // Empty thread block with no header; log and move on. + builder + .with_log_message("Thread block ended without a header line".to_string(), true)?; + StdinState::Waiting + } + StdinState::Thread(None) => { + // First line is the JSON header: {tid, crashed, name, state} + #[derive(serde::Deserialize)] + struct ThreadHeader { + crashed: bool, + name: String, + #[serde(default)] + state: Option, + } + let header: ThreadHeader = serde_json::from_str(line)?; + StdinState::Thread(Some(ThreadInProgress { + crashed: header.crashed, + name: header.name, + state: header.state, + frames: vec![], + stack_complete: false, + })) + } + + // State: Thread(Some) — waiting for BEGIN_STACKTRACE or END_THREAD. + StdinState::Thread(Some(partial)) if line.starts_with(DD_CRASHTRACK_END_THREAD) => { + builder.with_thread(partial.into_thread_data())?; + StdinState::Waiting + } + StdinState::Thread(Some(partial)) if line.starts_with(DD_CRASHTRACK_BEGIN_STACKTRACE) => { + StdinState::ThreadStackTrace(partial) + } + StdinState::Thread(partial) => { + // Unexpected line inside a thread block; log it and stay. + builder + .with_log_message(format!("Unexpected line inside thread block: {line}"), true)?; + StdinState::Thread(partial) + } + + // State: ThreadStackTrace — collecting frames for the current thread. + StdinState::ThreadStackTrace(mut partial) + if line.starts_with(DD_CRASHTRACK_END_STACKTRACE) => + { + partial.stack_complete = true; + StdinState::Thread(Some(partial)) + } + StdinState::ThreadStackTrace(mut partial) => { + let frame: StackFrame = serde_json::from_str(line)?; + partial.frames.push(frame); + StdinState::ThreadStackTrace(partial) + } StdinState::Waiting if line.starts_with(DD_CRASHTRACK_BEGIN_ADDITIONAL_TAGS) => { StdinState::AdditionalTags } @@ -359,6 +447,9 @@ fn process_line( StdinState::Waiting if line.starts_with(DD_CRASHTRACK_BEGIN_THREAD_NAME) => { StdinState::ThreadName(None) } + StdinState::Waiting if line.starts_with(DD_CRASHTRACK_BEGIN_THREAD) => { + StdinState::Thread(None) + } StdinState::Waiting if line.starts_with(DD_CRASHTRACK_BEGIN_UCONTEXT) => { StdinState::Ucontext } @@ -529,6 +620,17 @@ pub(crate) async fn receive_report_from_stream( } } + // Collect thread contexts if enabled - this is done here in the receiver + // because the parent process stays alive until the receiver completes + #[cfg(target_os = "linux")] + if config.collect_all_threads() { + if let Some(proc_info) = builder.proc_info.as_ref() { + let parent_pid = proc_info.pid; + let crashing_tid = proc_info.tid; + collect_and_add_thread_contexts(&mut builder, &config, parent_pid, crashing_tid)?; + } + } + let crash_info = builder.build()?; if crash_info.incomplete { @@ -544,29 +646,129 @@ pub(crate) async fn receive_report_from_stream( Ok(Some((config, crash_info))) } +#[cfg(target_os = "linux")] +fn collect_and_add_thread_contexts( + builder: &mut CrashInfoBuilder, + config: &CrashtrackerConfiguration, + parent_pid: u32, + crashing_tid: Option, +) -> anyhow::Result<()> { + use crate::crash_info::ThreadData; + use crate::receiver::ptrace_collector::stream_thread_contexts; + use std::time::Duration; + + let crashing_tid = crashing_tid.unwrap_or(0) as i32; + let parent_pid = parent_pid as i32; + + // Timeout for ptrace collection: half the overall crash timeout, capped at 200ms + let context_timeout = Duration::from_millis((config.timeout().as_millis() / 2).min(200) as u64); + + let mut collected_threads = Vec::new(); + + let _ = stream_thread_contexts( + parent_pid, + crashing_tid, + config.max_threads(), + context_timeout, + config.resolve_frames(), + |tid, captured_context| { + let name = read_thread_name(parent_pid, tid).unwrap_or_else(|| tid.to_string()); + let state = read_thread_state(parent_pid, tid); + + let stack = match captured_context { + Some(ctx) => ctx.stack_trace.clone(), + None => crate::crash_info::StackTrace::empty(), + }; + + collected_threads.push(ThreadData { + crashed: false, + name, + stack, + state, + }); + + true // Continue with next thread + }, + ); + + if !collected_threads.is_empty() { + builder.with_threads(collected_threads)?; + } + + Ok(()) +} + +// Helper functions for reading thread metadata (moved from collector/emitters.rs) +#[cfg(target_os = "linux")] +fn read_thread_name(pid: i32, tid: i32) -> Option { + use std::fs; + let path = format!("/proc/{pid}/task/{tid}/comm"); + fs::read_to_string(&path) + .ok() + .map(|s| s.trim_end_matches('\n').to_string()) +} + +#[cfg(target_os = "linux")] +fn read_thread_state(pid: i32, tid: i32) -> Option { + use std::fs; + let path = format!("/proc/{pid}/task/{tid}/stat"); + fs::read_to_string(&path).ok().and_then(|content| { + // The state is the 3rd field in /proc/pid/stat + content.split_whitespace().nth(2).map(|s| s.to_string()) + }) +} + #[cfg(target_os = "linux")] fn enrich_thread_name(builder: &mut CrashInfoBuilder) -> anyhow::Result<()> { use std::{fs, path::PathBuf}; - if builder.error.thread_name.is_some() { - return Ok(()); + // Enrich the primary crashing thread's name if not already set. + if builder.error.thread_name.is_none() { + if let Some(proc_info) = builder.proc_info.as_ref() { + if let Some(tid) = proc_info.tid { + let pid = proc_info.pid; + let path = PathBuf::from(format!("/proc/{pid}/task/{tid}/comm")); + if let Ok(comm) = fs::read_to_string(&path) { + let thread_name = comm.trim_end_matches('\n'); + if !thread_name.is_empty() { + builder.with_thread_name(thread_name.to_string())?; + } + } + } + } } - let Some(proc_info) = builder.proc_info.as_ref() else { - return Ok(()); - }; - let Some(tid) = proc_info.tid else { - return Ok(()); - }; - let pid = proc_info.pid; - let path = PathBuf::from(format!("/proc/{pid}/task/{tid}/comm")); - let Ok(comm) = fs::read_to_string(&path) else { - return Ok(()); - }; - let thread_name = comm.trim_end_matches('\n'); - if thread_name.is_empty() { - return Ok(()); + + // Enrich names for any additional threads that were emitted without a name or + // with only a TID string as name (fallback when /proc was not readable at emit + // time). We use the receiver-side /proc//task//comm read here because + // the collector child may have been unable to read it in time. + // + // Note: builder.error.threads holds ThreadData values only after they have been + // parsed from the unix socket. We iterate them and patch names that look like + // bare integers (the TID fallback). + if let Some(pid_info) = builder.proc_info.as_ref() { + let pid = pid_info.pid; + if let Some(threads) = builder.error.threads.as_mut() { + for thread in threads.iter_mut() { + // If the name is already meaningful (non-numeric), skip it. + if thread.name.parse::().is_err() { + continue; + } + // Name is a bare TID integer; try to read from /proc. + let Ok(tid) = thread.name.parse::() else { + continue; + }; + let path = PathBuf::from(format!("/proc/{pid}/task/{tid}/comm")); + if let Ok(comm) = fs::read_to_string(&path) { + let name = comm.trim_end_matches('\n'); + if !name.is_empty() { + thread.name = name.to_string(); + } + } + } + } } - builder.with_thread_name(thread_name.to_string())?; + Ok(()) } diff --git a/libdd-crashtracker/src/shared/configuration/builder.rs b/libdd-crashtracker/src/shared/configuration/builder.rs index c0ad491cfb..571d66e781 100644 --- a/libdd-crashtracker/src/shared/configuration/builder.rs +++ b/libdd-crashtracker/src/shared/configuration/builder.rs @@ -10,6 +10,7 @@ use super::{CrashtrackerConfiguration, StacktraceCollection}; #[derive(Debug, Default)] pub struct CrashtrackerConfigurationBuilder { additional_files: Vec, + collect_all_threads: bool, create_alt_stack: bool, demangle_names: bool, endpoint_url: Option, @@ -17,6 +18,7 @@ pub struct CrashtrackerConfigurationBuilder { endpoint_timeout_ms: Option, endpoint_test_token: Option, endpoint_use_system_resolver: bool, + max_threads: Option, resolve_frames: StacktraceCollection, signals: Vec, timeout: Option, @@ -30,6 +32,11 @@ impl CrashtrackerConfigurationBuilder { self } + pub fn collect_all_threads(mut self, collect: bool) -> Self { + self.collect_all_threads = collect; + self + } + pub fn create_alt_stack(mut self, create: bool) -> Self { self.create_alt_stack = create; self @@ -72,6 +79,11 @@ impl CrashtrackerConfigurationBuilder { self } + pub fn max_threads(mut self, max: usize) -> Self { + self.max_threads = Some(max); + self + } + pub fn resolve_frames(mut self, resolve: StacktraceCollection) -> Self { self.resolve_frames = resolve; self @@ -138,9 +150,11 @@ impl CrashtrackerConfigurationBuilder { // before the receiver is started when using an async-receiver. Ok(CrashtrackerConfiguration { additional_files: self.additional_files, + collect_all_threads: self.collect_all_threads, create_alt_stack: self.create_alt_stack, use_alt_stack: self.use_alt_stack, endpoint, + max_threads: self.max_threads.unwrap_or(128), resolve_frames: self.resolve_frames, signals, timeout, diff --git a/libdd-crashtracker/src/shared/configuration/mod.rs b/libdd-crashtracker/src/shared/configuration/mod.rs index a5b2971c7d..17a4bdbedf 100644 --- a/libdd-crashtracker/src/shared/configuration/mod.rs +++ b/libdd-crashtracker/src/shared/configuration/mod.rs @@ -30,10 +30,14 @@ pub enum StacktraceCollection { pub struct CrashtrackerConfiguration { // Paths to any additional files to track, if any additional_files: Vec, + #[serde(default)] + collect_all_threads: bool, create_alt_stack: bool, // Whether to demangle symbol names in stack traces demangle_names: bool, endpoint: Option, + #[serde(default = "default_max_threads")] + max_threads: usize, resolve_frames: StacktraceCollection, signals: Vec, timeout: Duration, @@ -41,6 +45,10 @@ pub struct CrashtrackerConfiguration { use_alt_stack: bool, } +fn default_max_threads() -> usize { + 128 +} + #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)] pub struct CrashtrackerReceiverConfig { pub args: Vec, @@ -84,10 +92,18 @@ impl CrashtrackerConfiguration { &self.additional_files } + pub fn collect_all_threads(&self) -> bool { + self.collect_all_threads + } + pub fn create_alt_stack(&self) -> bool { self.create_alt_stack } + pub fn max_threads(&self) -> usize { + self.max_threads + } + pub fn use_alt_stack(&self) -> bool { self.use_alt_stack } @@ -116,6 +132,14 @@ impl CrashtrackerConfiguration { self.demangle_names } + pub fn set_collect_all_threads(&mut self, collect: bool) { + self.collect_all_threads = collect; + } + + pub fn set_max_threads(&mut self, max: usize) { + self.max_threads = max; + } + pub fn set_create_alt_stack(&mut self, create_alt_stack: bool) -> anyhow::Result<()> { anyhow::ensure!( !create_alt_stack || self.use_alt_stack, diff --git a/libdd-crashtracker/src/shared/constants.rs b/libdd-crashtracker/src/shared/constants.rs index 361e9f5ced..a5c6951ae1 100644 --- a/libdd-crashtracker/src/shared/constants.rs +++ b/libdd-crashtracker/src/shared/constants.rs @@ -11,6 +11,7 @@ pub const DD_CRASHTRACK_BEGIN_FILE: &str = "DD_CRASHTRACK_BEGIN_FILE"; pub const DD_CRASHTRACK_BEGIN_KIND: &str = "DD_CRASHTRACK_BEGIN_KIND"; pub const DD_CRASHTRACK_BEGIN_METADATA: &str = "DD_CRASHTRACK_BEGIN_METADATA"; pub const DD_CRASHTRACK_BEGIN_PROCINFO: &str = "DD_CRASHTRACK_BEGIN_PROCESSINFO"; +pub const DD_CRASHTRACK_BEGIN_THREAD: &str = "DD_CRASHTRACK_BEGIN_THREAD"; pub const DD_CRASHTRACK_BEGIN_THREAD_NAME: &str = "DD_CRASHTRACK_BEGIN_THREAD_NAME"; pub const DD_CRASHTRACK_BEGIN_RUNTIME_STACK_FRAME: &str = "DD_CRASHTRACK_BEGIN_RUNTIME_STACK_FRAME"; pub const DD_CRASHTRACK_BEGIN_RUNTIME_STACK_STRING: &str = @@ -30,6 +31,7 @@ pub const DD_CRASHTRACK_END_FILE: &str = "DD_CRASHTRACK_END_FILE"; pub const DD_CRASHTRACK_END_KIND: &str = "DD_CRASHTRACK_END_KIND"; pub const DD_CRASHTRACK_END_METADATA: &str = "DD_CRASHTRACK_END_METADATA"; pub const DD_CRASHTRACK_END_PROCINFO: &str = "DD_CRASHTRACK_END_PROCESSINFO"; +pub const DD_CRASHTRACK_END_THREAD: &str = "DD_CRASHTRACK_END_THREAD"; pub const DD_CRASHTRACK_END_THREAD_NAME: &str = "DD_CRASHTRACK_END_THREAD_NAME"; pub const DD_CRASHTRACK_END_RUNTIME_STACK_FRAME: &str = "DD_CRASHTRACK_END_RUNTIME_STACK_FRAME"; pub const DD_CRASHTRACK_END_RUNTIME_STACK_STRING: &str = "DD_CRASHTRACK_END_RUNTIME_STACK_STRING";