Skip to content

Commit 73e73f6

Browse files
authored
feat(crashtracking): collect all native stacks for unhandled exception and also crashing thread (#2155)
# What does this PR do?

Previously, when a runtime reported an unhandled exception, only the runtime-provided stack trace was captured; no native stacks were collected for any thread because `PR_SET_PTRACER` was not set for unhandled exceptions. Additionally, when all-thread collection was enabled for signal-based crashes, the crashing thread was excluded from the threads array.

This change:

1. Enables native stack collection for all threads (including the crashing thread) when reporting unhandled exceptions with `collect_all_threads` enabled. The receiver now has the necessary ptrace permissions to unwind every thread in the process.
2. Includes the crashing thread in the `error.threads` array (marked with `crashed: true`) for both signal-based crashes and unhandled exceptions. Its native stack is collected by the receiver using ptrace, the same as for every other thread.

The runtime-provided stack remains in `error.stack` as the canonical crash context, while `error.threads` provides a uniform native view of all threads at the time of the crash.

This gives us two immediate benefits:

1. Unhandled exception crash reports now include native stacks for all threads, providing visibility into what the process was doing at the OS level, not just the managed runtime view.
2. The crashing thread's native stack is now collected in the receiver alongside all other threads, establishing a path toward consolidating all stack collection in the receiver process in the future.

# Motivation

What inspired you to submit this pull request?

# Additional Notes

Anything else we should know when reviewing?

# How to test the change?

Describe here in detail how the change can be validated.
1 parent 4e8e6cc commit 73e73f6

11 files changed

Lines changed: 308 additions & 80 deletions

File tree

bin_tests/src/bin/crashtracker_bin_test.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,12 +161,10 @@ mod unix {
161161
"unhandled_exception" => {
162162
let mut stacktrace = StackTrace::new_incomplete();
163163
let mut stackframe1 = StackFrame::new();
164-
stackframe1.with_ip(1234);
165164
stackframe1.with_function("test_function1".to_string());
166165
stackframe1.with_file("test_file1".to_string());
167166

168167
let mut stackframe2 = StackFrame::new();
169-
stackframe2.with_ip(5678);
170168
stackframe2.with_function("test_function2".to_string());
171169
stackframe2.with_file("test_file2".to_string());
172170

bin_tests/src/modes/behavior.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,9 @@ pub fn get_behavior(mode_str: &str) -> Box<dyn Behavior> {
144144
"sidecar_multi_thread_collection" => {
145145
Box::new(test_020_sidecar_multi_thread_collection::Test)
146146
}
147+
"unhandled_exception_multi_thread" => {
148+
Box::new(test_021_unhandled_exception_multi_thread::Test)
149+
}
147150
"runtime_preload_logger" => Box::new(test_000_donothing::Test),
148151
_ => panic!("Unknown mode: {mode_str}"),
149152
}

bin_tests/src/modes/unix/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,4 @@ pub mod test_017_multi_thread_collection;
2121
pub mod test_018_thread_limit;
2222
pub mod test_019_sidecar_donothing;
2323
pub mod test_020_sidecar_multi_thread_collection;
24+
pub mod test_021_unhandled_exception_multi_thread;

bin_tests/src/modes/unix/test_018_thread_limit.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,10 @@ impl Behavior for Test {
3434
}
3535

3636
fn post(&self, _output_dir: &Path) -> anyhow::Result<()> {
37-
let barrier = Arc::new(Barrier::new(THREAD_COUNT + 1));
37+
let barrier = Arc::new(Barrier::new(THREAD_COUNT));
3838

39-
for i in 0..THREAD_COUNT {
39+
// Make space for the crashing thread
40+
for i in 0..(THREAD_COUNT - 1) {
4041
let barrier = Arc::clone(&barrier);
4142
std::thread::Builder::new()
4243
.name(format!("worker-{i}"))
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
// Copyright 2026-Present Datadog, Inc. https://www.datadoghq.com/
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! Tests that the crashtracker collects stack information for all threads when
5+
//! reporting an unhandled exception (not just signal-based crashes).
6+
7+
use crate::modes::behavior::Behavior;
8+
use libdd_crashtracker::CrashtrackerConfiguration;
9+
use std::path::Path;
10+
use std::sync::atomic::{AtomicBool, Ordering};
11+
use std::thread;
12+
use std::time::{Duration, Instant};
13+
14+
pub struct Test;
15+
16+
impl Behavior for Test {
17+
fn setup(
18+
&self,
19+
_output_dir: &Path,
20+
config: &mut CrashtrackerConfiguration,
21+
) -> anyhow::Result<()> {
22+
config.set_collect_all_threads(true);
23+
config.set_max_threads(32);
24+
Ok(())
25+
}
26+
27+
fn pre(&self, _output_dir: &Path) -> anyhow::Result<()> {
28+
Ok(())
29+
}
30+
31+
fn post(&self, _output_dir: &Path) -> anyhow::Result<()> {
32+
static WORKER_0_READY: AtomicBool = AtomicBool::new(false);
33+
static WORKER_1_READY: AtomicBool = AtomicBool::new(false);
34+
35+
#[inline(never)]
36+
fn worker_fn_0() {
37+
WORKER_0_READY.store(true, Ordering::Relaxed);
38+
loop {
39+
std::hint::black_box(0x21_00u64);
40+
std::hint::spin_loop();
41+
}
42+
}
43+
44+
#[inline(never)]
45+
fn worker_fn_1() {
46+
WORKER_1_READY.store(true, Ordering::Relaxed);
47+
loop {
48+
std::hint::black_box(0x21_01u64);
49+
std::hint::spin_loop();
50+
}
51+
}
52+
53+
WORKER_0_READY.store(false, Ordering::Relaxed);
54+
WORKER_1_READY.store(false, Ordering::Relaxed);
55+
56+
let h0 = thread::Builder::new()
57+
.name("ct_worker_0".to_string())
58+
.spawn(worker_fn_0)?;
59+
60+
let h1 = thread::Builder::new()
61+
.name("ct_worker_1".to_string())
62+
.spawn(worker_fn_1)?;
63+
64+
let deadline = Instant::now() + Duration::from_secs(5);
65+
while !WORKER_0_READY.load(Ordering::Acquire) || !WORKER_1_READY.load(Ordering::Acquire) {
66+
if Instant::now() >= deadline {
67+
panic!("Workers did not reach spin loop within 5s");
68+
}
69+
thread::yield_now();
70+
}
71+
72+
std::mem::forget(h0);
73+
std::mem::forget(h1);
74+
Ok(())
75+
}
76+
}

bin_tests/src/test_types.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ pub enum TestMode {
2424
ThreadLimit,
2525
SidecarDoNothing,
2626
SidecarMultiThreadCollection,
27+
UnhandledExceptionMultiThread,
2728
}
2829

2930
impl TestMode {
@@ -49,6 +50,7 @@ impl TestMode {
4950
Self::ThreadLimit => "thread_limit",
5051
Self::SidecarDoNothing => "sidecar_donothing",
5152
Self::SidecarMultiThreadCollection => "sidecar_multi_thread_collection",
53+
Self::UnhandledExceptionMultiThread => "unhandled_exception_multi_thread",
5254
}
5355
}
5456

@@ -74,6 +76,7 @@ impl TestMode {
7476
Self::ThreadLimit,
7577
Self::SidecarDoNothing,
7678
Self::SidecarMultiThreadCollection,
79+
Self::UnhandledExceptionMultiThread,
7780
]
7881
}
7982
}
@@ -108,6 +111,7 @@ impl std::str::FromStr for TestMode {
108111
"thread_limit" => Ok(Self::ThreadLimit),
109112
"sidecar_donothing" => Ok(Self::SidecarDoNothing),
110113
"sidecar_multi_thread_collection" => Ok(Self::SidecarMultiThreadCollection),
114+
"unhandled_exception_multi_thread" => Ok(Self::UnhandledExceptionMultiThread),
111115
_ => Err(format!("Unknown test mode: {}", s)),
112116
}
113117
}

bin_tests/tests/crashtracker_bin_test.rs

Lines changed: 139 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,115 @@ fn test_crash_tracking_bin_unhandled_exception() {
159159
run_crash_test_with_artifacts(&config, &artifacts_map, &artifacts, validator).unwrap();
160160
}
161161

162+
/// Checks all-thread native stack collection on the unhandled-exception path.
///
/// With `collect_all_threads` enabled and the crash reported through
/// `report_unhandled_exception`, `error.threads` must contain entries for the
/// background threads, each with a valid stack trace.
///
/// This confirms that `PR_SET_PTRACER` is set on the unhandled exception path,
/// which lets the receiver ptrace the still-alive parent process.
#[test]
#[cfg(target_os = "linux")]
#[cfg_attr(miri, ignore)]
fn test_crash_tracking_bin_unhandled_exception_multi_thread() {
    let config = CrashTestConfig::new(
        BuildProfile::Release,
        TestMode::UnhandledExceptionMultiThread,
        CrashType::UnhandledException,
    );
    let artifacts = StandardArtifacts::new(config.profile);
    let artifacts_map = fetch_built_artifacts(&artifacts.as_slice()).unwrap();

    let validator: ValidatorFn = Box::new(|payload, _fixtures| {
        PayloadValidator::new(payload)
            .validate_error_kind("UnhandledException")?
            .validate_error_message_contains(
                "Process was terminated due to an unhandled exception of type 'RuntimeException'",
            )?;

        let threads = payload["error"]["threads"]
            .as_array()
            .expect("error.threads should be a JSON array of thread objects");

        // One crashing thread plus the two workers is the minimum we accept.
        let thread_count = threads.len();
        assert!(
            thread_count >= 3,
            "error.threads should contain at least 3 threads (1 crashed + 2 workers); got {} in payload: {}",
            thread_count,
            serde_json::to_string_pretty(payload).unwrap_or_default()
        );

        // Exactly one entry may carry the `crashed` marker.
        let crashed: Vec<_> = threads
            .iter()
            .filter(|entry| entry["crashed"].as_bool() == Some(true))
            .collect();
        assert_eq!(
            crashed.len(),
            1,
            "exactly one thread should have crashed=true; got: {crashed:?}"
        );

        // Every entry must be well-formed, whether or not it is the crashing thread.
        for entry in threads {
            assert!(
                entry["name"].is_string(),
                "thread entry missing 'name': {entry:?}"
            );
            assert!(
                entry["crashed"].is_boolean(),
                "thread entry missing 'crashed': {entry:?}"
            );
            assert!(
                entry["stack"].is_object(),
                "thread entry missing 'stack': {entry:?}"
            );
        }

        let names: Vec<&str> = threads
            .iter()
            .map(|entry| entry["name"].as_str().unwrap_or("<none>"))
            .collect();

        // Each worker thread is paired with the function it is expected to be running.
        let workers = [
            ("ct_worker_0", "worker_fn_0"),
            ("ct_worker_1", "worker_fn_1"),
        ];
        for (thread_name, worker_fn) in workers {
            assert!(
                names.contains(&thread_name),
                "Expected worker thread '{thread_name}' in error.threads; got: {names:?}"
            );

            let worker = threads
                .iter()
                .find(|entry| entry["name"].as_str() == Some(thread_name))
                .unwrap_or_else(|| panic!("{thread_name} should be in threads"));

            let frames = worker["stack"]["frames"]
                .as_array()
                .unwrap_or_else(|| panic!("{thread_name} stack.frames should be an array"));

            assert!(
                !frames.is_empty(),
                "{thread_name} should have non-empty stack frames (ptrace should succeed with PR_SET_PTRACER)"
            );

            let has_worker_frame = frames.iter().any(|frame| {
                matches!(frame["function"].as_str(), Some(symbol) if symbol.contains(worker_fn))
            });
            assert!(
                has_worker_frame,
                "{thread_name} stack should contain a frame for '{worker_fn}' but got: {frames:?}"
            );
        }

        Ok(())
    });

    run_crash_test_with_artifacts(&config, &artifacts_map, &artifacts, validator).unwrap();
}
270+
162271
#[test]
163272
#[cfg_attr(miri, ignore)]
164273
fn test_crash_tracking_bin_runtime_callback_frame() {
@@ -189,7 +298,7 @@ fn test_crash_tracking_bin_runtime_callback_frame() {
189298
}
190299

191300
/// Tests that when `collect_all_threads` is enabled, the crash report contains
192-
/// entries in `error.threads` for background threads beyond the crashing thread.
301+
/// entries in `error.threads` for all threads including the crashing thread.
193302
///
194303
/// The behavior (test_017_multi_thread_collection.rs) enables `collect_all_threads`,
195304
/// spawns two named sleeping worker threads in `post()`, and then crashes the main thread.
@@ -200,8 +309,8 @@ fn test_crash_tracking_bin_runtime_callback_frame() {
200309
///
201310
/// We verify:
202311
/// - `error.threads` is a non-empty array of thread objects.
203-
/// - Each thread entry is well-formed: `crashed=false`, `name`, and `stack` present.
204-
/// - The crashing thread stack is in `error.stack`, not `error.threads`.
312+
/// - Each thread entry is well-formed: `crashed`, `name`, and `stack` present.
313+
/// - Exactly one thread has `crashed=true` (the crashing thread).
205314
/// - Both worker threads are present by name (ct_worker_0, ct_worker_1).
206315
/// - Each worker has their work frame in the stack trace.
207316
#[test]
@@ -220,10 +329,9 @@ fn test_crash_tracking_multi_thread_collection() {
220329
let all_threads = payload["error"]["threads"]
221330
.as_array()
222331
.expect("error.threads should be a JSON array of thread objects");
223-
224332
assert!(
225-
all_threads.len() >= 2,
226-
"error.threads should be non-empty when collect_all_threads is enabled; got payload: {}",
333+
all_threads.len() >= 3,
334+
"error.threads should contain at least 3 threads (1 crashed + 2 workers); got payload: {}",
227335
serde_json::to_string_pretty(payload).unwrap_or_default()
228336
);
229337

@@ -232,6 +340,16 @@ fn test_crash_tracking_multi_thread_collection() {
232340
.map(|t| t["name"].as_str().unwrap_or("<none>"))
233341
.collect();
234342

343+
let crashed_threads: Vec<_> = all_threads
344+
.iter()
345+
.filter(|t| t["crashed"].as_bool() == Some(true))
346+
.collect();
347+
assert_eq!(
348+
crashed_threads.len(),
349+
1,
350+
"exactly one thread should have crashed=true; got: {crashed_threads:?}"
351+
);
352+
235353
for thread in all_threads {
236354
assert!(
237355
thread["name"].is_string(),
@@ -245,10 +363,6 @@ fn test_crash_tracking_multi_thread_collection() {
245363
thread["stack"].is_object(),
246364
"thread entry missing 'stack': {thread:?}"
247365
);
248-
assert!(
249-
!thread["crashed"].as_bool().unwrap_or(true),
250-
"threads in error.threads must have crashed=false: {thread:?}"
251-
);
252366
}
253367

254368
for expected in ["ct_worker_0", "ct_worker_1"] {
@@ -312,11 +426,22 @@ fn test_crash_tracking_thread_limit() {
312426
.expect("error.threads should be a JSON array of thread objects");
313427

314428
assert!(
315-
thread_array.len() >= THREAD_COUNT,
316-
"expected at least {THREAD_COUNT} thread entries, got {}",
429+
thread_array.len() == THREAD_COUNT,
430+
"expected {} thread entries ({THREAD_COUNT} workers), got {}",
431+
THREAD_COUNT,
317432
thread_array.len(),
318433
);
319434

435+
let crashed_threads: Vec<_> = thread_array
436+
.iter()
437+
.filter(|t| t["crashed"].as_bool() == Some(true))
438+
.collect();
439+
assert_eq!(
440+
crashed_threads.len(),
441+
1,
442+
"exactly one thread should have crashed=true; got: {crashed_threads:?}"
443+
);
444+
320445
for thread in thread_array {
321446
assert!(
322447
thread["name"].is_string(),
@@ -330,10 +455,6 @@ fn test_crash_tracking_thread_limit() {
330455
thread["stack"].is_object(),
331456
"thread entry missing 'stack': {thread:?}"
332457
);
333-
assert!(
334-
!thread["crashed"].as_bool().unwrap_or(true),
335-
"threads in error.threads must have crashed=false: {thread:?}"
336-
);
337458
}
338459

339460
Ok(())
@@ -529,8 +650,8 @@ fn test_crash_tracking_sidecar_multi_thread_collection() {
529650
let all_threads = all_threads.unwrap();
530651

531652
assert!(
532-
all_threads.len() >= 2,
533-
"error.threads should have at least 2 entries (sidecar multi-thread); got {}: {}",
653+
all_threads.len() >= 3,
654+
"error.threads should have at least 3 entries (1 crashed + 2 workers, sidecar multi-thread); got {}: {}",
534655
all_threads.len(),
535656
serde_json::to_string_pretty(&crash_payload).unwrap_or_default()
536657
);

0 commit comments

Comments
 (0)