Skip to content

Commit cb27fd6

Browse files
bm1549 authored and claude committed
Extend log-injection smoke test diagnostic to identify true root cause
PR #11075 narrowed the flake to "process alive, RC polling, captured stdout empty" but couldn't pinpoint where the JVM was wedged. This extends the failure diagnostic to capture every signal needed to discriminate the remaining hypotheses in a single failure. On the failure path of `waitForTraceCountAlive`: - SIGQUIT to the live process so its thread dump flows through the captured stdout pipe (when the OutputThreads writer is alive) - bounded tail of captured stdout via RandomAccessFile (avoids loading a now-thread-dump-inflated log into memory) - bounded tail of the application logger file (logback/log4j2/JBoss target via dd.test.logfile) — distinguishes "app never ran" from "app ran but stdout pipe lost output" - enumeration of the smoke-output ThreadGroup so we can see if the OutputThreads writer thread is dead - jcmd Thread.print fallback that drains stdout incrementally (codex review caught the pipe-buffer deadlock the naive form would hit) - previously-swallowed rcClientDecodingFailure surfaced - launch command line included so we don't have to dig through the Gradle test reports Process.pid() is guarded for Java 8, and resolveJcmdPath probes both JRE and JDK layouts. tag: no release note tag: ai generated Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3ba8d13 commit cb27fd6

1 file changed

Lines changed: 241 additions & 23 deletions

File tree

dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy

Lines changed: 241 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ import datadog.environment.JavaVirtualMachine
66
import datadog.trace.agent.test.server.http.TestHttpServer.HandlerApi.RequestApi
77
import datadog.trace.api.config.GeneralConfig
88
import datadog.trace.test.util.Flaky
9+
import java.io.ByteArrayOutputStream
10+
import java.io.InputStream
11+
import java.io.RandomAccessFile
912
import java.nio.charset.StandardCharsets
1013
import java.util.zip.GZIPInputStream
1114
import spock.lang.AutoCleanup
@@ -53,6 +56,10 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
5356
@AutoCleanup
5457
MockBackend mockBackend = new MockBackend()
5558

59+
// Captured for inclusion in failure diagnostics so we don't have to dig through Gradle reports.
60+
@Shared
61+
String launchCommand
62+
5663
def setup() {
5764
mockBackend.reset()
5865
}
@@ -114,7 +121,8 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
114121
command.addAll(additionalArguments())
115122
command.addAll((String[]) ["-jar", loggingJar])
116123

117-
println "COMMANDS: " + command.join(" ")
124+
launchCommand = command.join(" ")
125+
println "COMMANDS: " + launchCommand
118126
ProcessBuilder processBuilder = new ProcessBuilder(command)
119127
processBuilder.directory(new File(buildDirectory))
120128

@@ -349,8 +357,12 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
349357

350358
/**
351359
* Like {@link AbstractSmokeTest#waitForTraceCount} but checks process liveness on every poll
352-
* iteration and dumps diagnostic state on failure, so CI failures produce actionable output
353-
* instead of a bare "Condition not satisfied" after a 30s timeout.
360+
* iteration and dumps comprehensive diagnostic state on failure. The previous iteration of
361+
* this method narrowed the flake to "process alive, RC polling, captured stdout empty" but
362+
* could not pinpoint where the JVM was wedged — see {@link #captureFullDiagnostic} for the
363+
* extended state captured here, designed to discriminate between a class-load deadlock,
364+
* stuck @Trace advice, broken trace writer, no-op tracer, dead OutputThreads writer, and a
365+
* stalled stdout pipe in a single failure.
354366
*/
355367
int waitForTraceCountAlive(int count) {
356368
try {
@@ -364,41 +376,247 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
364376
return
365377
}
366378
if (testedProcess != null && !testedProcess.isAlive()) {
367-
def lastLines = tailProcessLog(20)
368379
// RuntimeException (not AssertionError) so PollingConditions propagates
369380
// immediately instead of retrying for the full timeout.
370381
throw new RuntimeException(
371-
"Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces " +
372-
"(received ${traceCount.get()}, RC polls: ${rcClientMessages.size()}).\n" +
373-
"Last process output:\n${lastLines}")
382+
"Process exited while waiting for ${count} traces.\n" +
383+
captureFullDiagnostic(count))
374384
}
375385
assert traceCount.get() >= count
376386
}
377387
} catch (AssertionError e) {
378-
// The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state
379-
def alive = testedProcess?.isAlive()
380-
def lastLines = tailProcessLog(30)
388+
// The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state.
381389
throw new AssertionError(
382-
"Timed out waiting for ${count} traces after ${defaultPoll.timeout}s. " +
383-
"traceCount=${traceCount.get()}, process.alive=${alive}, " +
384-
"RC polls received: ${rcClientMessages.size()}.\n" +
385-
"Last process output:\n${lastLines}", e)
390+
"Timed out waiting for ${count} traces after ${defaultPoll.timeout}s.\n" +
391+
captureFullDiagnostic(count), e)
386392
}
387393
traceCount.get()
388394
}
389395

390-
private String tailProcessLog(int lines) {
396+
/**
 * Comprehensive diagnostic capture for the failure path. Triggers a SIGQUIT thread dump on
 * the live process, waits briefly for it to flow through the captured stdout pipe, then
 * collects everything needed to discriminate the remaining hypotheses for this flake:
 * captured stdout file size + tail (with thread dump if the OutputThreads writer is alive),
 * the application logger file (logback/log4j2/JBoss target via {@code dd.test.logfile}),
 * OutputThreads writer-thread health, and a {@code jcmd Thread.print} fallback that works
 * even if the writer thread is dead.
 *
 * NOTE(review): {@code testedProcess}, {@code rcClientMessages}, {@code traceCount},
 * {@code logFilePath} and {@code outputLogFile} are presumably declared on
 * {@code AbstractSmokeTest} — they are not visible in this file.
 *
 * @param targetCount the trace count the caller was waiting for, echoed into the report
 * @return a multi-line diagnostic string; never throws (all capture steps are best-effort)
 */
private String captureFullDiagnostic(int targetCount) {
  boolean alive = testedProcess != null && testedProcess.isAlive()
  long pid = -1L
  if (testedProcess != null) {
    // Process.pid() is Java 9+; the test runner JVM may be Java 8 (zulu8). Fall back gracefully.
    try {
      pid = testedProcess.pid()
    } catch (Throwable ignored) {
      // Java 8 — SIGQUIT/jcmd paths get skipped; we still capture file state + thread groups.
    }
  }

  // Trigger SIGQUIT for an in-process thread dump. The JVM writes the dump to its stderr,
  // which redirectErrorStream(true) merges into the captured stdout pipe. Sleep briefly to
  // let the writer thread drain it before we re-read the file. No-op (and harmless) on
  // Windows; in CI this runs on Linux containers.
  if (alive && pid > 0) {
    try {
      ["kill", "-3", "${pid}".toString()].execute().waitFor(2, SECONDS)
    } catch (Throwable ignored) {
      // best-effort — if kill isn't available the jcmd fallback below still runs
    }
    try {
      Thread.sleep(1500)
    } catch (InterruptedException ignored) {
      Thread.currentThread().interrupt()
    }
  }

  def sb = new StringBuilder()
  sb << "pid=${pid} alive=${alive}"
  if (testedProcess != null && !alive) {
    try {
      sb << " exitValue=${testedProcess.exitValue()}"
    } catch (Throwable ignored) {
      // exitValue may not be available
    }
  }
  sb << "\n"
  sb << "rcPolls=${rcClientMessages.size()} traceCount=${traceCount.get()} target=${targetCount}\n"
  // Previously-swallowed decode failures — surface them so a bad payload isn't mistaken
  // for "no traffic at all".
  if (rcClientDecodingFailure != null) {
    sb << "rcDecodingFailure=${rcClientDecodingFailure}\n"
  }
  if (traceDecodingFailure != null) {
    sb << "traceDecodingFailure=${traceDecodingFailure}\n"
  }
  sb << "launchCommand=${launchCommand}\n"
  sb << "outputThreads=${describeOutputThreadGroup()}\n"

  // Captured-stdout tail is read AFTER the SIGQUIT sleep above, so a live writer thread
  // has had a chance to flush the thread dump into this file.
  def stdoutFile = new File(logFilePath)
  if (stdoutFile.exists()) {
    sb << "capturedStdout=${stdoutFile.absolutePath} size=${stdoutFile.length()} mtime=${new Date(stdoutFile.lastModified())}\n"
    sb << "--- captured stdout (last 60 lines, post-SIGQUIT) ---\n"
    sb << tailFile(stdoutFile, 60)
    sb << "\n"
  } else {
    sb << "capturedStdout=${logFilePath} (does not exist)\n"
  }

  // The app logger file distinguishes "app never ran" from "app ran but stdout pipe lost output".
  if (outputLogFile != null && outputLogFile.exists()) {
    sb << "appLogFile=${outputLogFile.absolutePath} size=${outputLogFile.length()}\n"
    sb << "--- app log (last 30 lines) ---\n"
    sb << tailFile(outputLogFile, 30)
    sb << "\n"
  } else if (outputLogFile != null) {
    sb << "appLogFile=${outputLogFile.absolutePath} (does not exist)\n"
  }

  if (pid > 0) {
    sb << "--- jcmd Thread.print (fallback, works even if OutputThreads writer is dead) ---\n"
    sb << jcmdThreadPrint(pid)
    sb << "\n"
  }
  return sb.toString()
}
480+
481+
/**
 * Summarizes the health of the OutputThreads writer threads: name, state and liveness of
 * every thread in the "smoke-output" ThreadGroup (see OutputThreads.java). A dead writer
 * thread here explains an empty captured-stdout file even while the tested JVM is alive.
 */
private String describeOutputThreadGroup() {
  // Walk up to the root group so we find the writers regardless of which group
  // the test runner placed this thread in.
  ThreadGroup top = Thread.currentThread().threadGroup
  while (top.parent != null) {
    top = top.parent
  }
  // activeCount() is only an estimate — pad the array so enumerate() is unlikely to truncate.
  Thread[] snapshot = new Thread[top.activeCount() + 16]
  int found = top.enumerate(snapshot, true)
  List<String> summaries = []
  for (int idx = 0; idx < found; idx++) {
    Thread candidate = snapshot[idx]
    if (candidate != null && candidate.threadGroup?.name == "smoke-output") {
      summaries << "${candidate.name}/${candidate.state}/alive=${candidate.alive}"
    }
  }
  return "smoke-output threads=${summaries.size()} [${summaries.join(", ")}]"
}
502+
503+
// Cap on diagnostic chunks so a wedged JVM with a giant thread dump can't OOM the Gradle
// worker or produce an unreadable test report. ~32KB is enough for typical thread dumps.
private static final int DIAG_TAIL_BYTES = 32 * 1024

/**
 * Returns the last {@code lines} lines (or up to {@link #DIAG_TAIL_BYTES} from the file end,
 * whichever is smaller) without loading the whole file into memory. Important on the failure
 * path because we have just appended a full JVM thread dump to the captured stdout file via
 * SIGQUIT — {@code readLines()} on that file could OOM the Gradle worker on a wedged JVM with
 * a large dump.
 */
private String tailFile(File f, int lines) {
  try {
    return new RandomAccessFile(f, "r").withCloseable { raf ->
      long total = raf.length()
      long offset = Math.max(0L, total - DIAG_TAIL_BYTES)
      raf.seek(offset)
      byte[] data = new byte[(int) (total - offset)]
      raf.readFully(data)
      String text = new String(data, StandardCharsets.UTF_8)
      boolean truncated = offset > 0
      if (truncated) {
        // We probably landed mid-line; discard the partial first line so the tail reads cleanly.
        int firstNewline = text.indexOf('\n')
        if (firstNewline >= 0) {
          text = text.substring(firstNewline + 1)
        }
      }
      String[] split = text.split("\\R", -1)
      int first = Math.max(0, split.length - lines)
      String tail = (split as List).subList(first, split.length).join("\n")
      truncated ? "...(truncated to last ${total - offset} bytes)...\n" + tail : tail
    }
  } catch (Throwable e) {
    return "(failed to read ${f.name}: ${e.message})"
  }
}
546+
547+
/**
 * Runs {@code jcmd <pid> Thread.print} against the tested process and returns its merged
 * stdout/stderr, bounded to roughly twice {@link #DIAG_TAIL_BYTES}. This is the thread-dump
 * fallback that works even when the OutputThreads writer thread is dead, because it does not
 * depend on the captured stdout pipe.
 *
 * <p>Fixes over the previous revision: the jcmd child is destroyed in a {@code finally} so
 * it cannot leak if the read loop throws; {@code InterruptedException} re-interrupts the
 * current thread instead of being swallowed by the broad catch; and a timeout that produced
 * partial output now marks that output as incomplete.
 *
 * @param pid pid of the tested process (callers only invoke this when {@code pid > 0})
 * @return the thread dump text, or a parenthesized explanation on any failure — never throws
 */
private String jcmdThreadPrint(long pid) {
  String jcmdPath = resolveJcmdPath()
  if (jcmdPath == null) {
    return "(jcmd not found on java.home; skipping)"
  }
  Process proc = null
  try {
    // Merge stderr into stdout so jcmd's own error messages land in the diagnostic, and
    // drain incrementally — for a wedged JVM the dump can be larger than the OS pipe buffer
    // (~64KB on Linux), and waiting for exit before reading would deadlock both jcmd and us.
    ProcessBuilder pb = new ProcessBuilder(jcmdPath, Long.toString(pid), "Thread.print")
    pb.redirectErrorStream(true)
    proc = pb.start()
    ByteArrayOutputStream baos = new ByteArrayOutputStream()
    byte[] buf = new byte[8192]
    long deadline = System.nanoTime() + SECONDS.toNanos(5)
    InputStream is = proc.getInputStream()
    boolean timedOut = false
    while (true) {
      if (is.available() > 0) {
        int n = is.read(buf)
        if (n < 0) {
          break
        }
        // Bound the in-memory buffer so a runaway dump can't OOM us.
        if (baos.size() < DIAG_TAIL_BYTES * 2) {
          baos.write(buf, 0, n)
        }
      } else if (!proc.isAlive()) {
        // Drain any remaining bytes after exit.
        int n
        while ((n = is.read(buf)) >= 0) {
          if (baos.size() < DIAG_TAIL_BYTES * 2) {
            baos.write(buf, 0, n)
          }
        }
        break
      } else if (System.nanoTime() > deadline) {
        proc.destroyForcibly()
        proc.waitFor(1, SECONDS)
        if (baos.size() == 0) {
          return "(jcmd timed out with no output)"
        }
        timedOut = true
        break
      } else {
        Thread.sleep(50)
      }
    }
    String out = new String(baos.toByteArray(), StandardCharsets.UTF_8)
    if (timedOut) {
      // Partial output is still useful, but make it obvious the dump was cut short.
      out += "\n...(jcmd timed out; output may be incomplete)..."
    }
    if (out.length() > DIAG_TAIL_BYTES) {
      // Keep the head — application/agent threads tend to be earlier in the dump than
      // generic VM/GC/JIT threads, and the head is what reveals where main is wedged.
      return out.substring(0, DIAG_TAIL_BYTES) + "\n...(truncated; full dump was ${out.length()} bytes)..."
    }
    return out
  } catch (InterruptedException e) {
    // Preserve the interrupt for the test runner instead of swallowing it.
    Thread.currentThread().interrupt()
    return "(jcmd interrupted: ${e.message})"
  } catch (Throwable e) {
    return "(jcmd unavailable: ${e.message})"
  } finally {
    // Don't leak a wedged jcmd child past the diagnostic (no-op if it already exited).
    if (proc != null && proc.isAlive()) {
      proc.destroyForcibly()
    }
  }
}
604+
605+
/**
 * Best-effort lookup of the {@code jcmd} launcher for the JVM running this test. On Java 8,
 * {@code java.home} points at the JRE subdirectory, so jcmd lives in the sibling JDK's bin;
 * on Java 9+ it is directly under {@code java.home}/bin. Falls back to the bare name so the
 * OS PATH gets a chance.
 */
private String resolveJcmdPath() {
  String javaHome = System.getProperty("java.home")
  if (javaHome != null) {
    List<File> candidates = [
      new File(javaHome, "bin/jcmd"),                       // Java 9+ layout
      new File(new File(javaHome).parentFile, "bin/jcmd"),  // Java 8 JRE-inside-JDK layout
    ]
    for (File candidate : candidates) {
      if (candidate.canExecute()) {
        return candidate.absolutePath
      }
    }
  }
  return "jcmd"
}
403621

404622
def parseTraceFromStdOut( String line ) {

0 commit comments

Comments
 (0)