Skip to content

Commit e7d104a

Browse files
committed
Use remote ptrace
1 parent d3f1ee9 commit e7d104a

7 files changed

Lines changed: 231 additions & 300 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bin_tests/tests/crashtracker_bin_test.rs

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -191,14 +191,21 @@ fn test_crash_tracking_bin_runtime_callback_frame() {
191191
/// Tests that when `collect_all_threads` is enabled, the crash report contains
192192
/// entries in `error.threads` for background threads beyond the crashing thread.
193193
///
194-
/// The behavior enables `collect_all_threads`, spawns two named
195-
/// sleeping worker threads in `post()`, and then crashes the main thread.
194+
/// The behavior (test_017_multi_thread_collection.rs) enables `collect_all_threads`,
195+
/// spawns two named sleeping worker threads in `post()`, and then crashes the main thread.
196+
///
197+
/// Thread collection now happens in the receiver process using libunwind remote unwinding
198+
/// via ptrace (_UPT_create / unw_init_remote / unw_step_remote). The parent process stays
199+
/// alive until the receiver completes, guaranteeing threads are valid ptrace targets.
200+
///
196201
/// We verify:
197-
/// - `error.threads` is non-empty
198-
/// - Each thread entry is well-formed: `crashed`, `name`, `stack` present.
202+
/// - `error.threads` is non-empty.
203+
/// - Each thread entry is well-formed: `crashed=false`, `name`, and `stack` present.
199204
/// - None of the additional threads are marked as crashed (the crashing thread is in
200205
/// `error.stack`, not `error.threads`).
201-
/// - The worker thread names are recognizable (ct_worker_0, ct_worker_1).
206+
/// - Both worker threads are present by name (ct_worker_0, ct_worker_1).
207+
/// - Each worker has a multi-frame stack trace including their entry function, confirming that
208+
/// libunwind remote unwinding produced a full call chain rather than a single syscall frame.
202209
#[test]
203210
#[cfg(target_os = "linux")]
204211
#[cfg_attr(miri, ignore)]
@@ -213,11 +220,11 @@ fn test_crash_tracking_multi_thread_collection() {
213220

214221
let validator: ValidatorFn = Box::new(|payload, _fixtures| {
215222
let error = &payload["error"];
216-
assert!(
217-
false,
218-
"{}",
219-
serde_json::to_string_pretty(error).unwrap_or_default()
220-
);
223+
// assert!(
224+
// false,
225+
// "{}",
226+
// serde_json::to_string_pretty(error).unwrap_or_default()
227+
// );
221228
let threads = error["threads"]
222229
.as_array()
223230
.expect("error.threads should be a JSON array");
@@ -254,8 +261,7 @@ fn test_crash_tracking_multi_thread_collection() {
254261
);
255262
}
256263

257-
// Both named workers must be present; the behavior (test_017_multi_thread_collection.rs)
258-
//spawns exactly two
264+
// Both named workers must be present; the behavior spawns exactly two
259265
for expected in ["ct_worker_0", "ct_worker_1"] {
260266
assert!(
261267
thread_names.contains(&expected),
@@ -264,9 +270,13 @@ fn test_crash_tracking_multi_thread_collection() {
264270
);
265271
}
266272

267-
// Each worker must have captured at least one stack frame
268-
// (they were sleeping in a syscall, so SIGUSR2 interrupted them and
269-
// unw_init_local2 should produce at minimum the syscall entry frame).
273+
// Each worker must have a multi-frame stack trace.
274+
//
275+
// The workers sleep in thread::sleep -> wait_for_work_N -> worker_entry_N.
276+
// With libunwind remote unwinding, we expect the full call chain rather than
277+
// a single syscall frame. We verify:
278+
// - More than one frame was captured.
279+
// - At least one frame contains the worker's entry function by name.
270280
for expected in ["ct_worker_0", "ct_worker_1"] {
271281
let worker = threads
272282
.iter()
@@ -278,8 +288,26 @@ fn test_crash_tracking_multi_thread_collection() {
278288
.unwrap_or_else(|| panic!("{expected} stack.frames should be an array"));
279289

280290
assert!(
281-
!frames.is_empty(),
282-
"{expected} should have at least one stack frame; worker: {worker:?}"
291+
frames.len() > 1,
292+
"{expected} should have a multi-frame stack trace from remote libunwind; \
293+
got {} frame(s): {frames:?}",
294+
frames.len()
295+
);
296+
297+
let entry_fn = if expected == "ct_worker_0" {
298+
"worker_entry_0"
299+
} else {
300+
"worker_entry_1"
301+
};
302+
let has_entry_frame = frames.iter().any(|f| {
303+
f["function"]
304+
.as_str()
305+
.map(|name| name.contains(entry_fn))
306+
.unwrap_or(false)
307+
});
308+
assert!(
309+
has_entry_frame,
310+
"{expected} stack should contain a frame for '{entry_fn}' but got: {frames:?}"
283311
);
284312
}
285313

libdd-crashtracker/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ cxx = ["dep:cxx", "dep:cxx-build"]
4242
blazesym = "=0.2.3"
4343

4444
[target.'cfg(target_os = "linux")'.dependencies]
45-
libdd-libunwind-sys = { version = "1.0.0" }
45+
libdd-libunwind-sys = { git = "https://github.com/DataDog/libdatadog-libunwind", branch = "gyuheon0h/remote-unwind-api" }
4646

4747
[dependencies]
4848
anyhow = "1.0"

libdd-crashtracker/src/collector/emitters.rs

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@ use std::{
2020
};
2121
use thiserror::Error;
2222

23-
#[cfg(target_os = "linux")]
24-
use std::io::BufRead;
25-
2623
#[derive(Debug, Error)]
2724
pub enum EmitterError {
2825
#[error("Failed to write to output: {0}")]
@@ -423,63 +420,6 @@ unsafe fn emit_whole_stacktrace(
423420
Ok(())
424421
}
425422

426-
/// Emit one thread block over the wire protocol.
427-
///
428-
/// Format:
429-
/// ```text
430-
/// DD_CRASHTRACK_BEGIN_THREAD
431-
/// {"tid": <tid>, "crashed": <bool>, "name": "<name>", "state": "<state>"}
432-
/// DD_CRASHTRACK_BEGIN_STACKTRACE
433-
/// ...frame JSON lines (if a ucontext was captured for this thread)...
434-
/// DD_CRASHTRACK_END_STACKTRACE
435-
/// DD_CRASHTRACK_END_THREAD
436-
/// ```
437-
#[cfg(target_os = "linux")]
438-
fn emit_thread_block(
439-
w: &mut impl Write,
440-
tid: libc::pid_t,
441-
crashed: bool,
442-
name: &str,
443-
state: Option<&str>,
444-
resolve_frames: StacktraceCollection,
445-
ucontext: Option<*const ucontext_t>,
446-
) -> Result<(), EmitterError> {
447-
writeln!(w, "{DD_CRASHTRACK_BEGIN_THREAD}")?;
448-
449-
// Header JSON line
450-
write!(w, "{{\"tid\": {tid}, \"crashed\": {crashed}")?;
451-
let safe_name = name
452-
.replace('\\', "\\\\")
453-
.replace('"', "\\\"")
454-
.replace('\n', "\\n");
455-
write!(w, ", \"name\": \"{safe_name}\"")?;
456-
if let Some(s) = state {
457-
let safe_state = s
458-
.replace('\\', "\\\\")
459-
.replace('"', "\\\"")
460-
.replace('\n', "\\n");
461-
write!(w, ", \"state\": \"{safe_state}\"")?;
462-
}
463-
writeln!(w, "}}")?;
464-
w.flush()?;
465-
466-
// Stack trace (only if we have a ucontext for this thread)
467-
writeln!(w, "{DD_CRASHTRACK_BEGIN_STACKTRACE}")?;
468-
if let Some(uc) = ucontext {
469-
if resolve_frames != StacktraceCollection::Disabled {
470-
// SAFETY: uc was written by handle_collect_context_signal and is valid for
471-
// the duration of the collector child (COW mapping from the parent).
472-
let _ = unsafe { emit_backtrace_via_libunwind(w, resolve_frames, uc) };
473-
}
474-
}
475-
writeln!(w, "{DD_CRASHTRACK_END_STACKTRACE}")?;
476-
w.flush()?;
477-
478-
writeln!(w, "{DD_CRASHTRACK_END_THREAD}")?;
479-
w.flush()?;
480-
Ok(())
481-
}
482-
483423
fn emit_config(w: &mut impl Write, config_str: &str) -> Result<(), EmitterError> {
484424
writeln!(w, "{DD_CRASHTRACK_BEGIN_CONFIG}")?;
485425
writeln!(w, "{config_str}")?;

libdd-crashtracker/src/collector/signal_handler_manager.rs

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -206,17 +206,6 @@ unsafe fn register_signal_handler(
206206
Ok((signal_type, old_handler))
207207
}
208208

209-
/// Register the SIGUSR2 handler used to capture thread contexts for multi-thread
210-
/// stack collection.
211-
///
212-
/// This must be called from a non-signal context before any crash can occur.
213-
/// The previous SIGUSR2 `sigaction` is saved so the handler can chain to it.
214-
///
215-
/// Note: if the application installs its own SIGUSR2 handler after this call
216-
/// and does not chain to the previous action, our handler will be silently lost and
217-
/// thread contexts will not be captured. Applications that own SIGUSR2 should
218-
/// install their handler first and chain correctly, or not enable `collect_all_threads`.
219-
220209
#[cfg(test)]
221210
mod tests {
222211
#[cfg(target_os = "linux")]

0 commit comments

Comments
 (0)