Skip to content

Commit e96cb1a

Browse files
committed
Add unit tests for thread context buffer
1 parent 3f70266 commit e96cb1a

3 files changed

Lines changed: 222 additions & 10 deletions

File tree

libdd-crashtracker/src/collector/emitters.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,6 @@ fn emit_thread_block(
454454

455455
// Header JSON line
456456
write!(w, "{{\"tid\": {tid}, \"crashed\": {crashed}")?;
457-
// Escape name for JSON: replace control chars and quotes
458457
let safe_name = name
459458
.replace('\\', "\\\\")
460459
.replace('"', "\\\"")
@@ -497,7 +496,7 @@ fn read_thread_name(pid: libc::pid_t, tid: libc::pid_t) -> Option<String> {
497496
}
498497

499498
/// Read a thread's scheduler state letter from /proc/<pid>/task/<tid>/status.
500-
/// Returns the single-letter state (e.g. "S", "R", "D") or None on failure.
499+
/// Returns the single-letter state ("S", "R", "D") or None on failure.
501500
#[cfg(target_os = "linux")]
502501
fn read_thread_state(pid: libc::pid_t, tid: libc::pid_t) -> Option<String> {
503502
let path = format!("/proc/{pid}/task/{tid}/status");
@@ -535,7 +534,7 @@ fn enumerate_task_tids(pid: libc::pid_t) -> Vec<libc::pid_t> {
535534
/// Called from the collector child (after fork), so std::fs is safe to use.
536535
/// For each thread:
537536
/// - Reads name and state from /proc/<ppid>/task/<tid>/
538-
/// - If a ucontext was captured (Phase 2 / collect_all_threads), emits the stack trace.
537+
/// - If a ucontext was captured (collect_all_threads), emits the stack trace.
539538
/// - Otherwise emits an empty (incomplete) stack trace.
540539
#[cfg(target_os = "linux")]
541540
fn emit_all_threads(
@@ -546,7 +545,7 @@ fn emit_all_threads(
546545
) -> Result<(), EmitterError> {
547546
use crate::collector::thread_context_buffer::iter_collected_contexts;
548547

549-
// Build a map from TID to captured ucontext pointer (may be empty if Phase 2 buffer
548+
// Build a map from TID to captured ucontext pointer (may be empty if buffer
550549
// was not initialised or the thread did not respond in time).
551550
let contexts: std::collections::HashMap<libc::pid_t, *const ucontext_t> =
552551
iter_collected_contexts()

libdd-crashtracker/src/collector/thread_context_buffer.rs

Lines changed: 216 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ static BUFFER: AtomicPtr<ThreadContextBuffer> = AtomicPtr::new(ptr::null_mut());
9999

100100
/// Allocate the thread context buffer.
101101
///
102-
/// Must be called once from a non-signal context (typically inside `init()`).
102+
/// Must be called once from a non-signal context (inside `init()`).
103103
/// Subsequent calls are a no-op.
104104
pub fn init_thread_context_buffer(capacity: usize) {
105105
let capacity = capacity.clamp(1, MAX_TRACKED_THREADS);
@@ -164,7 +164,7 @@ pub(crate) extern "C" fn handle_collect_context_signal(
164164
if slot.tid.load(Ordering::SeqCst) == tid {
165165
if !ucontext.is_null() {
166166
// SAFETY: ucontext is the kernel-saved register state, valid for the
167-
// duration of the signal handler. The slot ctx field is not accessed
167+
// duration of the signal handler. The slot ctx field is not accessed
168168
// by anyone else until ready is observed as true.
169169
unsafe {
170170
ptr::copy_nonoverlapping(
@@ -279,7 +279,7 @@ fn parse_tid(bytes: &[u8]) -> Option<libc::pid_t> {
279279
/// then wait for each contacted thread to write its ucontext into its slot.
280280
///
281281
/// Called from the primary crash signal handler before fork()-ing the collector
282-
/// child. Must be async-signal-safe.
282+
/// child. Must be async-signal-safe.
283283
///
284284
/// - `crashing_tid` TID of the thread that received the fatal signal (skipped).
285285
/// - `max_threads` cap on how many threads to sample.
@@ -380,6 +380,37 @@ pub fn iter_collected_contexts() -> impl Iterator<Item = CollectedContext> {
380380
#[cfg(test)]
381381
mod tests {
382382
use super::*;
383+
use std::sync::Arc;
384+
use std::time::Duration;
385+
386+
/// Claim the first free slot for `tid`. Returns the slot index on success.
387+
fn claim_slot(buf: &ThreadContextBuffer, tid: i32) -> Option<usize> {
388+
buf.slots.iter().enumerate().find_map(|(i, slot)| {
389+
slot.tid
390+
.compare_exchange(0, tid, Ordering::SeqCst, Ordering::Relaxed)
391+
.ok()
392+
.map(|_| i)
393+
})
394+
}
395+
396+
/// Install `handle_collect_context_signal` as the SIGUSR2 handler and return
397+
/// the previous `sigaction` so the caller can restore it.
398+
fn install_context_handler() -> libc::sigaction {
399+
let mut sa: libc::sigaction = unsafe { std::mem::zeroed() };
400+
sa.sa_sigaction = handle_collect_context_signal as *const () as usize;
401+
sa.sa_flags = libc::SA_SIGINFO | libc::SA_RESTART | libc::SA_NODEFER;
402+
let mut old: libc::sigaction = unsafe { std::mem::zeroed() };
403+
unsafe { libc::sigaction(libc::SIGUSR2, &sa, &mut old) };
404+
old
405+
}
406+
407+
fn restore_handler(old: &libc::sigaction) {
408+
unsafe { libc::sigaction(libc::SIGUSR2, old, std::ptr::null_mut()) };
409+
}
410+
411+
fn current_tid() -> i32 {
412+
unsafe { libc::syscall(libc::SYS_gettid) as i32 }
413+
}
383414

384415
#[test]
385416
fn test_parse_tid_valid() {
@@ -419,4 +450,186 @@ mod tests {
419450
assert_eq!(buf.slots[0].tid.load(Ordering::SeqCst), 0);
420451
assert!(!buf.slots[0].ready.load(Ordering::SeqCst));
421452
}
453+
454+
#[test]
455+
fn test_signal_handler_writes_context_and_sets_ready() {
456+
init_thread_context_buffer(16);
457+
reset_thread_context_buffer();
458+
459+
let tid = current_tid();
460+
let buf = get_buffer().unwrap();
461+
let idx = claim_slot(buf, tid).expect("should find a free slot");
462+
463+
let mut fake_ctx: libc::ucontext_t = unsafe { std::mem::zeroed() };
464+
fake_ctx.uc_flags = 0x1234_5678;
465+
466+
handle_collect_context_signal(
467+
libc::SIGUSR2,
468+
std::ptr::null_mut(),
469+
&mut fake_ctx as *mut libc::ucontext_t as *mut libc::c_void,
470+
);
471+
472+
let slot = &buf.slots[idx];
473+
assert!(
474+
slot.ready.load(Ordering::Acquire),
475+
"slot should be marked ready after handler runs"
476+
);
477+
// Verify the context bytes were actually copied.
478+
let copied_flags = unsafe { (*(*slot.ctx.get()).as_ptr()).uc_flags };
479+
assert_eq!(
480+
copied_flags, 0x1234_5678,
481+
"uc_flags sentinel should survive the copy"
482+
);
483+
484+
reset_thread_context_buffer();
485+
}
486+
487+
/// Even with a null ucontext (kernel gave us nothing), the slot must be
488+
/// marked ready so the spin-wait in collect_thread_contexts can complete.
489+
#[test]
490+
fn test_signal_handler_null_ucontext_still_sets_ready() {
491+
init_thread_context_buffer(16);
492+
reset_thread_context_buffer();
493+
494+
let tid = current_tid();
495+
let buf = get_buffer().unwrap();
496+
let idx = claim_slot(buf, tid).expect("should find a free slot");
497+
498+
handle_collect_context_signal(
499+
libc::SIGUSR2,
500+
std::ptr::null_mut(),
501+
std::ptr::null_mut(), // null ucontext
502+
);
503+
504+
assert!(
505+
buf.slots[idx].ready.load(Ordering::Acquire),
506+
"null ucontext should still set ready so the caller doesn't stall"
507+
);
508+
509+
reset_thread_context_buffer();
510+
}
511+
512+
/// The handler must silently do nothing when the current TID has no slot
513+
/// claimed for it
514+
#[test]
515+
fn test_signal_handler_ignores_unclaimed_tid() {
516+
init_thread_context_buffer(16);
517+
reset_thread_context_buffer(); // all slots tid=0, ready=false
518+
519+
let mut fake_ctx: libc::ucontext_t = unsafe { std::mem::zeroed() };
520+
// Call with no slot pre-claimed for the current TID.
521+
handle_collect_context_signal(
522+
libc::SIGUSR2,
523+
std::ptr::null_mut(),
524+
&mut fake_ctx as *mut libc::ucontext_t as *mut libc::c_void,
525+
);
526+
527+
let buf = get_buffer().unwrap();
528+
let tid = current_tid();
529+
for slot in &buf.slots {
530+
assert!(
531+
!slot.ready.load(Ordering::Acquire),
532+
"no slot should be ready when TID was not pre-claimed"
533+
);
534+
assert_ne!(
535+
slot.tid.load(Ordering::Acquire),
536+
tid,
537+
"handler must not claim a slot for an unclaimed TID"
538+
);
539+
}
540+
541+
reset_thread_context_buffer();
542+
}
543+
544+
/// The crashing thread shouldn't appear as one of the captured threads
545+
#[test]
546+
#[cfg_attr(miri, ignore)] // sigaction is not supported by Miri
547+
fn test_collect_never_captures_crashing_tid() {
548+
init_thread_context_buffer(16);
549+
reset_thread_context_buffer();
550+
551+
let old_sa = install_context_handler();
552+
let crashing_tid = unsafe { libc::syscall(libc::SYS_gettid) as libc::pid_t };
553+
554+
collect_thread_contexts(crashing_tid, 16, 300);
555+
556+
let buf = get_buffer().unwrap();
557+
for slot in &buf.slots {
558+
assert_ne!(
559+
slot.tid.load(Ordering::Acquire),
560+
crashing_tid as i32,
561+
"crashing TID ({crashing_tid}) must not appear in any slot"
562+
);
563+
}
564+
565+
restore_handler(&old_sa);
566+
reset_thread_context_buffer();
567+
}
568+
569+
/// Two background threads sleeping in a syscall should both respond to
570+
/// SIGUSR2 and have their contexts captured before the timeout fires.
571+
///
572+
/// We record each worker's TID before they block and verify that exactly
573+
/// those TIDs appear in the buffer afterwards.
574+
#[test]
575+
#[cfg_attr(miri, ignore)] // sigaction is not supported by Miri
576+
fn test_collect_captures_background_threads() {
577+
init_thread_context_buffer(16);
578+
reset_thread_context_buffer();
579+
580+
let old_sa = install_context_handler();
581+
582+
let (tx, rx) = std::sync::mpsc::channel::<i32>();
583+
let keep = Arc::new(std::sync::atomic::AtomicBool::new(true));
584+
585+
let handles: Vec<_> = (0..2)
586+
.map(|_| {
587+
let tx = tx.clone();
588+
let keep = Arc::clone(&keep);
589+
std::thread::spawn(move || {
590+
tx.send(current_tid()).unwrap();
591+
while keep.load(Ordering::Relaxed) {
592+
std::thread::sleep(Duration::from_millis(20));
593+
}
594+
})
595+
})
596+
.collect();
597+
598+
// Receive exactly one TID per worker
599+
drop(tx);
600+
let worker_tids: Vec<i32> = (0..2).map(|_| rx.recv().unwrap()).collect();
601+
std::thread::sleep(Duration::from_millis(50));
602+
603+
let crashing_tid = unsafe { libc::syscall(libc::SYS_gettid) as libc::pid_t };
604+
collect_thread_contexts(crashing_tid, 16, 500);
605+
606+
let buf = get_buffer().unwrap();
607+
608+
// Both workers must have responded and their contexts must be in the buffer.
609+
for &wtid in &worker_tids {
610+
let found = buf
611+
.slots
612+
.iter()
613+
.any(|s| s.tid.load(Ordering::Acquire) == wtid && s.ready.load(Ordering::Acquire));
614+
assert!(
615+
found,
616+
"worker TID {wtid} should have its context captured in the buffer"
617+
);
618+
}
619+
620+
// The crashing TID must still be absent.
621+
assert!(
622+
buf.slots
623+
.iter()
624+
.all(|s| s.tid.load(Ordering::Acquire) != crashing_tid as i32),
625+
"crashing TID ({crashing_tid}) must not be in the buffer"
626+
);
627+
628+
keep.store(false, Ordering::Relaxed);
629+
for h in handles {
630+
let _ = h.join();
631+
}
632+
restore_handler(&old_sa);
633+
reset_thread_context_buffer();
634+
}
422635
}

libdd-crashtracker/src/receiver/receive_report.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -661,8 +661,8 @@ fn enrich_thread_name(builder: &mut CrashInfoBuilder) -> anyhow::Result<()> {
661661
// the collector child may have been unable to read it in time.
662662
//
663663
// Note: builder.error.threads holds ThreadData values only after they have been
664-
// parsed from the wire protocol. We iterate them and patch names that look like
665-
// bare integers (i.e. the TID fallback).
664+
// parsed from the unix socket. We iterate them and patch names that look like
665+
// bare integers (the TID fallback).
666666
if let Some(pid_info) = builder.proc_info.as_ref() {
667667
let pid = pid_info.pid;
668668
if let Some(threads) = builder.error.threads.as_mut() {
@@ -671,7 +671,7 @@ fn enrich_thread_name(builder: &mut CrashInfoBuilder) -> anyhow::Result<()> {
671671
if thread.name.parse::<i32>().is_err() {
672672
continue;
673673
}
674-
// Name is a bare TID integer try to read from /proc.
674+
// Name is a bare TID integer; try to read from /proc.
675675
let Ok(tid) = thread.name.parse::<i32>() else {
676676
continue;
677677
};

0 commit comments

Comments
 (0)