Skip to content

Commit 70122e8

Browse files
jbachorikclaude
andauthored
Harden CodeCacheArray and profiler against signal-handler races (#413)
* Harden CodeCacheArray and profiler against signal-handler races - Guard CodeCacheArray readers against NULL during concurrent add() - Use CAS loop in add() to prevent exceeding MAX_NATIVE_LIBS - Make at() non-blocking (return NULL instead of spinning) - Cache anchor() in getJavaTraceAsync() to eliminate TOCTOU - Add NULL guards on all anchor() dereferences in signal context - Stop at first NULL in writeNativeLibraries() to avoid skipping entries - Skip patching sanitizer runtime libs to prevent heap corruption - Add NULL guard for lib->name() in patch_library_unlocked() Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Block profiling signals around RoutineInfo new/delete Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Address PR feedback: fix at() indentation, simplify writeNativeLibraries Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Harden J9+ASAN tests: forkEvery=1 and CI retry Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * ASAN test timeout, aarch64 retry parity, gdb watchdog Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Fix arm64 lockups: remove RT signal blocking, cap FP-chain and anchor recovery Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Address PR feedback: ACQUIRE ordering, assert in add(), comment fix Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Add arm64 ordering, concurrent iteration, and comment precision rules Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * Pointer-first CodeCacheArray::add() eliminates NULL window Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c3fe02e commit 70122e8

14 files changed

Lines changed: 264 additions & 86 deletions

File tree

.github/scripts/prepare_reports.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ cp ddprof-test/javacore*.txt test-reports/ || true
1212
cp ddprof-test/build/hs_err* test-reports/ || true
1313
cp -r ddprof-lib/build/tmp test-reports/native_build || true
1414
cp -r ddprof-test/build/reports/tests test-reports/tests || true
15+
cp build/logs/gdb-watchdog.log test-reports/ || true
1516
cp -r /tmp/recordings test-reports/recordings || true
1617
find ddprof-lib/build -name 'libjavaProfiler.*' -exec cp {} test-reports/ \; || true
1718

.github/workflows/test_workflow.yml

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -104,12 +104,25 @@ jobs:
104104
export LIBC=glibc
105105
export SANITIZER=${{ matrix.config }}
106106

107-
mkdir -p build/logs
108-
./gradlew -PCI -PkeepJFRs :ddprof-test:test${{ matrix.config }} --no-daemon --parallel --build-cache --no-watch-fs 2>&1 \
109-
| tee -a build/test-raw.log \
110-
| python3 -u .github/scripts/filter_gradle_log.py
111-
EXIT_CODE=${PIPESTATUS[0]}
112-
107+
MAX_ATTEMPTS=1
108+
if [[ "${{ matrix.config }}" == "asan" && "${{ matrix.java_version }}" =~ (j9|ibm) ]]; then
109+
MAX_ATTEMPTS=2
110+
fi
111+
112+
for attempt in $(seq 1 $MAX_ATTEMPTS); do
113+
mkdir -p build/logs
114+
./gradlew -PCI -PkeepJFRs :ddprof-test:test${{ matrix.config }} --no-daemon --parallel --build-cache --no-watch-fs 2>&1 \
115+
| tee -a build/test-raw.log \
116+
| python3 -u .github/scripts/filter_gradle_log.py
117+
EXIT_CODE=${PIPESTATUS[0]}
118+
119+
if [ $EXIT_CODE -eq 0 ]; then break; fi
120+
if [ $attempt -lt $MAX_ATTEMPTS ]; then
121+
echo "::warning::Attempt $attempt failed (exit $EXIT_CODE), retrying..."
122+
./gradlew --stop 2>/dev/null || true
123+
fi
124+
done
125+
113126
if [ $EXIT_CODE -ne 0 ]; then
114127
echo "glibc-${{ matrix.java_version }}-${{ matrix.config }}-amd64" >> failures_glibc-${{ matrix.java_version }}-${{ matrix.config }}-amd64.txt
115128
exit 1
@@ -327,7 +340,7 @@ jobs:
327340
sudo apt update -y
328341
sudo apt remove -y g++
329342
sudo apt autoremove -y
330-
sudo apt install -y curl zip unzip clang make build-essential binutils
343+
sudo apt install -y curl zip unzip clang make build-essential binutils gdb
331344
# Install debug symbols for system libraries
332345
sudo apt install -y libc6-dbg
333346
if [[ ${{ matrix.java_version }} =~ "-zing" ]]; then
@@ -350,11 +363,44 @@ jobs:
350363
export LIBC=glibc
351364
export SANITIZER=${{ matrix.config }}
352365
353-
mkdir -p build/logs
354-
./gradlew -PCI -PkeepJFRs :ddprof-test:test${{ matrix.config }} --no-daemon --parallel --build-cache --no-watch-fs 2>&1 \
355-
| tee -a build/test-raw.log \
356-
| python3 -u .github/scripts/filter_gradle_log.py
357-
EXIT_CODE=${PIPESTATUS[0]}
366+
# For ASAN: launch a gdb watchdog that dumps all native threads before the
367+
# 30-minute Gradle timeout kills the JVM, so we can diagnose hangs in native code.
368+
if [[ "${{ matrix.config }}" == "asan" ]]; then
369+
mkdir -p build/logs
370+
(
371+
sleep 1500 # 25 minutes — fires before the 30-min Gradle timeout
372+
echo "::warning::GDB watchdog triggered after 25 minutes"
373+
for pid in $(pgrep -f 'java.*ddprof-test'); do
374+
echo "=== Thread dump for PID $pid ===" >> build/logs/gdb-watchdog.log
375+
gdb -batch -ex 'thread apply all bt full' -p "$pid" >> build/logs/gdb-watchdog.log 2>&1 || true
376+
done
377+
) &
378+
GDB_WATCHDOG_PID=$!
379+
fi
380+
381+
MAX_ATTEMPTS=1
382+
if [[ "${{ matrix.config }}" == "asan" && "${{ matrix.java_version }}" =~ (j9|ibm) ]]; then
383+
MAX_ATTEMPTS=2
384+
fi
385+
386+
for attempt in $(seq 1 $MAX_ATTEMPTS); do
387+
mkdir -p build/logs
388+
./gradlew -PCI -PkeepJFRs :ddprof-test:test${{ matrix.config }} --no-daemon --parallel --build-cache --no-watch-fs 2>&1 \
389+
| tee -a build/test-raw.log \
390+
| python3 -u .github/scripts/filter_gradle_log.py
391+
EXIT_CODE=${PIPESTATUS[0]}
392+
393+
if [ $EXIT_CODE -eq 0 ]; then break; fi
394+
if [ $attempt -lt $MAX_ATTEMPTS ]; then
395+
echo "::warning::Attempt $attempt failed (exit $EXIT_CODE), retrying..."
396+
./gradlew --stop 2>/dev/null || true
397+
fi
398+
done
399+
400+
# Kill the watchdog if tests finished before it fired
401+
if [[ -n "${GDB_WATCHDOG_PID:-}" ]]; then
402+
kill "$GDB_WATCHDOG_PID" 2>/dev/null || true
403+
fi
358404
359405
if [ $EXIT_CODE -ne 0 ]; then
360406
echo "glibc-${{ matrix.java_version }}-${{ matrix.config }}-aarch64" >> failures_glibc-${{ matrix.java_version }}-${{ matrix.config }}-aarch64.txt

AGENTS.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,17 @@ The profiler uses a sophisticated double-buffered storage system for call traces
357357
- **Atomic Operations**: Instance ID management and counter updates use atomics
358358
- **Memory Allocation**: Minimize malloc() in hot paths, use pre-allocated containers
359359

360+
### Atomic Memory Ordering (Critical for arm64)
361+
arm64 has a weakly-ordered memory model (unlike x86 TSO). Incorrect ordering causes real lockups on arm64 that never reproduce on x86.
362+
- **Cross-thread reads**: Always use `__ATOMIC_ACQUIRE` for loads that must see stores from another thread. Never use `__ATOMIC_RELAXED` for cross-thread visibility unless you can prove no ordering dependency exists.
363+
- **Cross-thread writes**: Use `__ATOMIC_RELEASE` for stores that must be visible to other threads. Pair with `__ATOMIC_ACQUIRE` loads.
364+
- **Count + pointer patterns**: When a data structure publishes a count and a separate pointer (two-phase add), both the count load and pointer load need acquire semantics so the reader sees the pointer store that preceded the count increment.
365+
- **Default stance**: When in doubt, use acquire/release. The performance cost is negligible; the correctness cost of relaxed ordering bugs is enormous (silent arm64-only lockups).
366+
367+
### Concurrent Data Structure Iteration
368+
- **NULL gaps**: When iterating a concurrent array (e.g., `CodeCacheArray`), always NULL-check each slot — a slot may be count-allocated but pointer-not-yet-stored.
369+
- **Cursor advancement**: Never permanently advance an iteration cursor past NULL gaps. Stop at the first NULL or track the last contiguous non-NULL entry, so missing entries are retried on the next pass.
370+
360371
### 64-bit Trace ID System
361372
- **Collision Avoidance**: Instance-based IDs prevent collisions across storage swaps
362373
- **JFR Compatibility**: 64-bit IDs work with JFR constant pool indices
@@ -705,3 +716,9 @@ The CI caches JDKs via `.github/workflows/cache_java.yml`. When adding a new JDK
705716
- Always provide tests for bug fixes - test fails before the fix, passes after the fix
706717
- All code needs to strive to be lean in terms of resources consumption and easy to follow -
707718
do not shy away from factoring out self containing code to shorter functions with explicit name
719+
720+
### C/C++ Code Style
721+
- **Indentation**: Match the exact indentation style of the surrounding code in each file. Do not introduce inconsistent indentation — reviewers will flag it.
722+
- **Minimal complexity**: Do not split inline logic into separate helper functions unless the helpers are reused or the original is genuinely hard to follow. Unnecessary splits add indirection without value.
723+
- **Comment precision**: Comments explaining "why" must reference concrete mechanisms (e.g., "ASAN's allocator lock is reentrant" not "internal bookkeeping"). Vague comments get challenged in review. Every claim in a comment must be verifiable from the code or documented behavior of the referenced system (ASAN, glibc NPTL, HotSpot, etc.).
724+
- **No speculative comments**: Do not claim a system (HotSpot, glibc, ASAN) uses a specific mechanism unless you are certain. If unsure, describe the observable symptom instead of guessing the cause.

build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,4 +304,22 @@ object PlatformUtils {
304304
"or specify one with -Pnative.forceCompiler=/path/to/compiler"
305305
)
306306
}
307+
308+
/**
309+
* Returns true if the test JVM (from JAVA_TEST_HOME or JAVA_HOME) is an OpenJ9/J9 JVM.
310+
* Probes `java -version` stderr output for "J9" or "OpenJ9".
311+
*/
312+
fun isTestJvmJ9(): Boolean {
313+
val javaHome = testJavaHome()
314+
return try {
315+
val process = ProcessBuilder("$javaHome/bin/java", "-version")
316+
.redirectErrorStream(true)
317+
.start()
318+
val output = process.inputStream.bufferedReader().readText()
319+
process.waitFor(10, TimeUnit.SECONDS)
320+
output.contains("J9") || output.contains("OpenJ9")
321+
} catch (_: Exception) {
322+
false
323+
}
324+
}
307325
}

build-logic/conventions/src/main/kotlin/com/datadoghq/profiler/ProfilerTestPlugin.kt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import org.gradle.api.provider.Property
1515
import org.gradle.api.tasks.Exec
1616
import org.gradle.api.tasks.SourceSetContainer
1717
import org.gradle.api.tasks.testing.Test
18+
import java.time.Duration
1819
import javax.inject.Inject
1920

2021
/**
@@ -254,6 +255,17 @@ class ProfilerTestPlugin : Plugin<Project> {
254255
"asan" -> testTask.onlyIf { PlatformUtils.locateLibasan() != null }
255256
"tsan" -> testTask.onlyIf { PlatformUtils.locateLibtsan() != null }
256257
}
258+
259+
// J9+ASAN: isolate each test class in its own JVM to limit crash blast radius
260+
if (testConfig.configName == "asan") {
261+
testTask.doFirst {
262+
if (PlatformUtils.isTestJvmJ9()) {
263+
testTask.setForkEvery(1)
264+
}
265+
}
266+
// Kill hung ASAN test tasks after 30 minutes
267+
testTask.timeout.set(Duration.ofMinutes(30))
268+
}
257269
}
258270
}
259271

ddprof-lib/src/main/cpp/codeCache.h

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -254,34 +254,47 @@ class CodeCache {
254254
class CodeCacheArray {
255255
private:
256256
CodeCache *_libs[MAX_NATIVE_LIBS];
257-
volatile int _count;
257+
volatile int _reserved; // next slot to reserve (CAS by writers)
258+
volatile int _count; // published count (all indices < _count have non-NULL pointers)
258259
volatile size_t _used_memory;
259260

260261
public:
261-
CodeCacheArray() : _count(0), _used_memory(0) {
262+
CodeCacheArray() : _reserved(0), _count(0), _used_memory(0) {
262263
memset(_libs, 0, MAX_NATIVE_LIBS * sizeof(CodeCache *));
263264
}
264265

265266
CodeCache *operator[](int index) const { return __atomic_load_n(&_libs[index], __ATOMIC_ACQUIRE); }
266267

267-
int count() const { return __atomic_load_n(&_count, __ATOMIC_RELAXED); }
268+
// All indices < count() are guaranteed to have a non-NULL pointer.
269+
int count() const { return __atomic_load_n(&_count, __ATOMIC_ACQUIRE); }
268270

271+
// Pointer-first add: reserve a slot via CAS on _reserved, store the
272+
// pointer with RELEASE, then advance _count. Readers see count() grow
273+
// only after the pointer is visible, so indices < count() never yield NULL.
269274
void add(CodeCache *lib) {
270-
int index = __atomic_fetch_add(&_count, 1, __ATOMIC_RELAXED);
271-
if (index < MAX_NATIVE_LIBS) {
272-
__atomic_fetch_add(&_used_memory, lib->memoryUsage(), __ATOMIC_RELAXED);
273-
__atomic_store_n(&_libs[index], lib, __ATOMIC_RELEASE);
275+
int slot = __atomic_load_n(&_reserved, __ATOMIC_RELAXED);
276+
do {
277+
if (slot >= MAX_NATIVE_LIBS) return;
278+
} while (!__atomic_compare_exchange_n(&_reserved, &slot, slot + 1,
279+
true, __ATOMIC_RELAXED, __ATOMIC_RELAXED));
280+
assert(__atomic_load_n(&_libs[slot], __ATOMIC_RELAXED) == nullptr);
281+
__atomic_fetch_add(&_used_memory, lib->memoryUsage(), __ATOMIC_RELAXED);
282+
// Store pointer before publishing count. The RELEASE here pairs with
283+
// the ACQUIRE load in operator[]/at() and count().
284+
__atomic_store_n(&_libs[slot], lib, __ATOMIC_RELEASE);
285+
// Advance _count to publish the new slot. Spin until our slot is next
286+
// in line, preserving contiguous ordering when multiple adds race.
287+
while (__atomic_load_n(&_count, __ATOMIC_RELAXED) != slot) {
288+
// wait for preceding slots to publish
274289
}
290+
__atomic_store_n(&_count, slot + 1, __ATOMIC_RELEASE);
275291
}
276292

277293
CodeCache* at(int index) const {
278294
if (index >= MAX_NATIVE_LIBS) {
279-
return nullptr;
295+
return nullptr;
280296
}
281-
CodeCache* lib = nullptr;
282-
while ((lib = __atomic_load_n(&_libs[index], __ATOMIC_ACQUIRE)) == nullptr);
283-
284-
return lib;
297+
return __atomic_load_n(&_libs[index], __ATOMIC_ACQUIRE);
285298
}
286299

287300
size_t memoryUsage() const {

ddprof-lib/src/main/cpp/flightRecorder.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1110,10 +1110,12 @@ void Recording::writeNativeLibraries(Buffer *buf) {
11101110
const CodeCacheArray &native_libs = libraries->native_libs();
11111111
int native_lib_count = native_libs.count();
11121112

1113+
// Emit jdk.NativeLibrary events for newly loaded libraries.
1114+
// CodeCacheArray::add() stores the pointer before advancing count(),
1115+
// so all indices < native_lib_count are guaranteed non-NULL.
11131116
for (int i = _recorded_lib_count; i < native_lib_count; i++) {
11141117
CodeCache* lib = native_libs[i];
11151118

1116-
// Emit jdk.NativeLibrary event with extended fields (buildId and loadBias)
11171119
flushIfNeeded(buf, RECORDING_BUFFER_LIMIT - MAX_STRING_LENGTH);
11181120
int start = buf->skip(5);
11191121
buf->putVar64(T_NATIVE_LIBRARY);

ddprof-lib/src/main/cpp/guards.h

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -121,16 +121,12 @@ class SignalBlocker {
121121
sigset_t prof_signals;
122122
sigemptyset(&prof_signals);
123123

124-
// Add all profiling signals that could interrupt us
124+
// Block only the profiling signals that the profiler actually registers.
125+
// No profiler engine uses RT signals, so blocking them is unnecessary
126+
// and risks interfering with glibc NPTL internals (SIGRTMIN, SIGRTMIN+1)
127+
// or other JVM-internal signal usage.
125128
sigaddset(&prof_signals, SIGPROF); // Used by ITimer and CTimer
126129
sigaddset(&prof_signals, SIGVTALRM); // Used by WallClock
127-
#ifdef __linux__
128-
// Block RT signals (Linux-only)
129-
// This prevents any RT signal handler from interrupting TLS initialization
130-
for (int sig = SIGRTMIN; sig <= SIGRTMIN + 5 && sig <= SIGRTMAX; sig++) {
131-
sigaddset(&prof_signals, sig);
132-
}
133-
#endif
134130

135131
if (pthread_sigmask(SIG_BLOCK, &prof_signals, &_old_mask) == 0) {
136132
_active = true;

ddprof-lib/src/main/cpp/libraries.cpp

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -54,16 +54,22 @@ const void *Libraries::resolveSymbol(const char *name) {
5454
int native_lib_count = _native_libs.count();
5555
if (len > 0 && name[len - 1] == '*') {
5656
for (int i = 0; i < native_lib_count; i++) {
57-
const void *address = _native_libs[i]->findSymbolByPrefix(name, len - 1);
58-
if (address != NULL) {
59-
return address;
57+
CodeCache *lib = _native_libs[i];
58+
if (lib != NULL) {
59+
const void *address = lib->findSymbolByPrefix(name, len - 1);
60+
if (address != NULL) {
61+
return address;
62+
}
6063
}
6164
}
6265
} else {
6366
for (int i = 0; i < native_lib_count; i++) {
64-
const void *address = _native_libs[i]->findSymbol(name);
65-
if (address != NULL) {
66-
return address;
67+
CodeCache *lib = _native_libs[i];
68+
if (lib != NULL) {
69+
const void *address = lib->findSymbol(name);
70+
if (address != NULL) {
71+
return address;
72+
}
6773
}
6874
}
6975
}
@@ -79,11 +85,14 @@ CodeCache *Libraries::findLibraryByName(const char *lib_name) {
7985
const size_t lib_name_len = strlen(lib_name);
8086
const int native_lib_count = _native_libs.count();
8187
for (int i = 0; i < native_lib_count; i++) {
82-
const char *s = _native_libs[i]->name();
83-
if (s != NULL) {
84-
const char *p = strrchr(s, '/');
85-
if (p != NULL && strncmp(p + 1, lib_name, lib_name_len) == 0) {
86-
return _native_libs[i];
88+
CodeCache *lib = _native_libs[i];
89+
if (lib != NULL) {
90+
const char *s = lib->name();
91+
if (s != NULL) {
92+
const char *p = strrchr(s, '/');
93+
if (p != NULL && strncmp(p + 1, lib_name, lib_name_len) == 0) {
94+
return lib;
95+
}
8796
}
8897
}
8998
}
@@ -93,8 +102,9 @@ CodeCache *Libraries::findLibraryByName(const char *lib_name) {
93102
CodeCache *Libraries::findLibraryByAddress(const void *address) {
94103
const int native_lib_count = _native_libs.count();
95104
for (int i = 0; i < native_lib_count; i++) {
96-
if (_native_libs[i]->contains(address)) {
97-
return _native_libs[i];
105+
CodeCache *lib = _native_libs[i];
106+
if (lib != NULL && lib->contains(address)) {
107+
return lib;
98108
}
99109
}
100110
return NULL;

ddprof-lib/src/main/cpp/libraries.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class Libraries {
2424
// Note: Parameter is uint32_t to match lib_index packing (17 bits = max 131K libraries)
2525
CodeCache *getLibraryByIndex(uint32_t index) const {
2626
if (index < _native_libs.count()) {
27-
return _native_libs[index];
27+
return _native_libs[index]; // may be NULL during concurrent add()
2828
}
2929
return nullptr;
3030
}

0 commit comments

Comments
 (0)