@@ -6,6 +6,9 @@ import datadog.environment.JavaVirtualMachine
66import datadog.trace.agent.test.server.http.TestHttpServer.HandlerApi.RequestApi
77import datadog.trace.api.config.GeneralConfig
88import datadog.trace.test.util.Flaky
9+ import java.io.ByteArrayOutputStream
10+ import java.io.InputStream
11+ import java.io.RandomAccessFile
912import java.nio.charset.StandardCharsets
1013import java.util.zip.GZIPInputStream
1114import spock.lang.AutoCleanup
@@ -53,6 +56,10 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
  // Fake intake backend the instrumented app reports traces/RC polls to; @AutoCleanup closes
  // it when the spec finishes.
  @AutoCleanup
  MockBackend mockBackend = new MockBackend()

  // Captured for inclusion in failure diagnostics so we don't have to dig through Gradle reports.
  @Shared
  String launchCommand

  def setup() {
    // Clear counts captured by a previous test iteration so every test starts from zero.
    mockBackend.reset()
  }
@@ -114,7 +121,8 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
114121 command. addAll(additionalArguments())
115122 command. addAll((String []) [" -jar" , loggingJar])
116123
117- println " COMMANDS: " + command. join(" " )
124+ launchCommand = command. join(" " )
125+ println " COMMANDS: " + launchCommand
118126 ProcessBuilder processBuilder = new ProcessBuilder (command)
119127 processBuilder. directory(new File (buildDirectory))
120128
@@ -349,8 +357,12 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
349357
350358 /**
351359 * Like {@link AbstractSmokeTest#waitForTraceCount} but checks process liveness on every poll
352- * iteration and dumps diagnostic state on failure, so CI failures produce actionable output
353- * instead of a bare "Condition not satisfied" after a 30s timeout.
360+ * iteration and dumps comprehensive diagnostic state on failure. The previous iteration of
361+ * this method narrowed the flake to "process alive, RC polling, captured stdout empty" but
362+ * could not pinpoint where the JVM was wedged — see {@link #captureFullDiagnostic} for the
363+ * extended state captured here, designed to discriminate between a class-load deadlock,
364+ * stuck @Trace advice, broken trace writer, no-op tracer, dead OutputThreads writer, and a
365+ * stalled stdout pipe in a single failure.
354366 */
355367 int waitForTraceCountAlive (int count ) {
356368 try {
@@ -364,41 +376,247 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
364376 return
365377 }
366378 if (testedProcess != null && ! testedProcess. isAlive()) {
367- def lastLines = tailProcessLog(20 )
368379 // RuntimeException (not AssertionError) so PollingConditions propagates
369380 // immediately instead of retrying for the full timeout.
370381 throw new RuntimeException (
371- " Process exited with code ${ testedProcess.exitValue()} while waiting for ${ count} traces " +
372- " (received ${ traceCount.get()} , RC polls: ${ rcClientMessages.size()} ).\n " +
373- " Last process output:\n ${ lastLines} " )
382+ " Process exited while waiting for ${ count} traces.\n " +
383+ captureFullDiagnostic(count))
374384 }
375385 assert traceCount. get() >= count
376386 }
377387 } catch (AssertionError e) {
378- // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state
379- def alive = testedProcess?. isAlive()
380- def lastLines = tailProcessLog(30 )
388+ // The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state.
381389 throw new AssertionError (
382- " Timed out waiting for ${ count} traces after ${ defaultPoll.timeout} s. " +
383- " traceCount=${ traceCount.get()} , process.alive=${ alive} , " +
384- " RC polls received: ${ rcClientMessages.size()} .\n " +
385- " Last process output:\n ${ lastLines} " , e)
390+ " Timed out waiting for ${ count} traces after ${ defaultPoll.timeout} s.\n " +
391+ captureFullDiagnostic(count), e)
386392 }
387393 traceCount. get()
388394 }
389395
390- private String tailProcessLog (int lines ) {
396+ /**
397+ * Comprehensive diagnostic capture for the failure path. Triggers a SIGQUIT thread dump on
398+ * the live process, waits briefly for it to flow through the captured stdout pipe, then
399+ * collects everything needed to discriminate the remaining hypotheses for this flake:
400+ * captured stdout file size + tail (with thread dump if the OutputThreads writer is alive),
401+ * the application logger file (logback/log4j2/JBoss target via {@code dd.test.logfile }),
402+ * OutputThreads writer-thread health, and a {@code jcmd Thread.print } fallback that works
403+ * even if the writer thread is dead.
404+ */
405+ private String captureFullDiagnostic (int targetCount ) {
406+ boolean alive = testedProcess != null && testedProcess. isAlive()
407+ long pid = -1L
408+ if (testedProcess != null ) {
409+ // Process.pid() is Java 9+; the test runner JVM may be Java 8 (zulu8). Fall back gracefully.
410+ try {
411+ pid = testedProcess. pid()
412+ } catch (Throwable ignored) {
413+ // Java 8 — SIGQUIT/jcmd paths get skipped; we still capture file state + thread groups.
414+ }
415+ }
416+
417+ // Trigger SIGQUIT for an in-process thread dump. The JVM writes the dump to its stderr,
418+ // which redirectErrorStream(true) merges into the captured stdout pipe. Sleep briefly to
419+ // let the writer thread drain it before we re-read the file. No-op (and harmless) on
420+ // Windows; in CI this runs on Linux containers.
421+ if (alive && pid > 0 ) {
422+ try {
423+ [" kill" , " -3" , " ${ pid} " . toString()]. execute(). waitFor(2 , SECONDS )
424+ } catch (Throwable ignored) {
425+ // best-effort
426+ }
427+ try {
428+ Thread . sleep(1500 )
429+ } catch (InterruptedException ignored) {
430+ Thread . currentThread(). interrupt()
431+ }
432+ }
433+
434+ def sb = new StringBuilder ()
435+ sb << " pid=${ pid} alive=${ alive} "
436+ if (testedProcess != null && ! alive) {
437+ try {
438+ sb << " exitValue=${ testedProcess.exitValue()} "
439+ } catch (Throwable ignored) {
440+ // exitValue may not be available
441+ }
442+ }
443+ sb << " \n "
444+ sb << " rcPolls=${ rcClientMessages.size()} traceCount=${ traceCount.get()} target=${ targetCount} \n "
445+ if (rcClientDecodingFailure != null ) {
446+ sb << " rcDecodingFailure=${ rcClientDecodingFailure} \n "
447+ }
448+ if (traceDecodingFailure != null ) {
449+ sb << " traceDecodingFailure=${ traceDecodingFailure} \n "
450+ }
451+ sb << " launchCommand=${ launchCommand} \n "
452+ sb << " outputThreads=${ describeOutputThreadGroup()} \n "
453+
454+ def stdoutFile = new File (logFilePath)
455+ if (stdoutFile. exists()) {
456+ sb << " capturedStdout=${ stdoutFile.absolutePath} size=${ stdoutFile.length()} mtime=${ new Date(stdoutFile.lastModified())} \n "
457+ sb << " --- captured stdout (last 60 lines, post-SIGQUIT) ---\n "
458+ sb << tailFile(stdoutFile, 60 )
459+ sb << " \n "
460+ } else {
461+ sb << " capturedStdout=${ logFilePath} (does not exist)\n "
462+ }
463+
464+ if (outputLogFile != null && outputLogFile. exists()) {
465+ sb << " appLogFile=${ outputLogFile.absolutePath} size=${ outputLogFile.length()} \n "
466+ sb << " --- app log (last 30 lines) ---\n "
467+ sb << tailFile(outputLogFile, 30 )
468+ sb << " \n "
469+ } else if (outputLogFile != null ) {
470+ sb << " appLogFile=${ outputLogFile.absolutePath} (does not exist)\n "
471+ }
472+
473+ if (pid > 0 ) {
474+ sb << " --- jcmd Thread.print (fallback, works even if OutputThreads writer is dead) ---\n "
475+ sb << jcmdThreadPrint(pid)
476+ sb << " \n "
477+ }
478+ return sb. toString()
479+ }
480+
481+ private String describeOutputThreadGroup () {
482+ // OutputThreads creates threads in a group named "smoke-output" — see OutputThreads.java.
483+ ThreadGroup root = Thread . currentThread(). threadGroup
484+ while (root. parent != null ) {
485+ root = root. parent
486+ }
487+ Thread [] threads = new Thread [root. activeCount() + 16 ]
488+ int n = root. enumerate(threads, true )
489+ def details = []
490+ for (int i = 0 ; i < n; i++ ) {
491+ def t = threads[i]
492+ if (t == null ) {
493+ continue
494+ }
495+ ThreadGroup tg = t. threadGroup
496+ if (tg != null && tg. name == " smoke-output" ) {
497+ details << " ${ t.name} /${ t.state} /alive=${ t.alive} "
498+ }
499+ }
500+ return " smoke-output threads=${ details.size()} [${ details.join(", ")} ]"
501+ }
502+
  // Cap on diagnostic chunks so a wedged JVM with a giant thread dump can't OOM the Gradle
  // worker or produce an unreadable test report. ~32KB is enough for typical thread dumps.
  // Bounds both tailFile() reads and the jcmdThreadPrint() in-memory buffer/output.
  private static final int DIAG_TAIL_BYTES = 32 * 1024
506+
507+ /**
508+ * Returns the last {@code lines } lines (or up to {@link #DIAG_TAIL_BYTES} from the file end,
509+ * whichever is smaller) without loading the whole file into memory. Important on the failure
510+ * path because we have just appended a full JVM thread dump to the captured stdout file via
511+ * SIGQUIT — {@code readLines() } on that file could OOM the Gradle worker on a wedged JVM with
512+ * a large dump.
513+ */
514+ private String tailFile (File f , int lines ) {
515+ try (RandomAccessFile raf = new RandomAccessFile (f, " r" )) {
516+ long len = raf. length()
517+ long start = Math . max(0L , len - DIAG_TAIL_BYTES )
518+ raf. seek(start)
519+ byte [] buf = new byte [(int ) (len - start)]
520+ raf. readFully(buf)
521+ String chunk = new String (buf, StandardCharsets . UTF_8 )
522+ // If we started mid-line, drop the partial first line so the tail reads cleanly.
523+ if (start > 0 ) {
524+ int nl = chunk. indexOf(' \n ' )
525+ if (nl >= 0 ) {
526+ chunk = chunk. substring(nl + 1 )
527+ }
528+ }
529+ String [] all = chunk. split(" \\ R" , -1 )
530+ int from = Math . max(0 , all. length - lines)
531+ def out = new StringBuilder ()
532+ if (start > 0 ) {
533+ out << " ...(truncated to last ${ len - start} bytes)...\n "
534+ }
535+ for (int i = from; i < all. length; i++ ) {
536+ if (i > from) {
537+ out << " \n "
538+ }
539+ out << all[i]
540+ }
541+ return out. toString()
542+ } catch (Throwable e) {
543+ return " (failed to read ${ f.name} : ${ e.message} )"
544+ }
545+ }
546+
547+ private String jcmdThreadPrint (long pid ) {
548+ String jcmdPath = resolveJcmdPath()
549+ if (jcmdPath == null ) {
550+ return " (jcmd not found on java.home; skipping)"
551+ }
391552 try {
392- def logFile = new File (logFilePath)
393- if (! logFile. exists()) {
394- return " (log file does not exist: ${ logFilePath} )"
553+ // Merge stderr into stdout and drain incrementally — for a wedged JVM the dump can be
554+ // larger than the OS pipe buffer (~64KB on Linux), and waiting for exit before reading
555+ // would deadlock both jcmd and us. See codex review.
556+ ProcessBuilder pb = new ProcessBuilder (jcmdPath, Long . toString(pid), " Thread.print" )
557+ pb. redirectErrorStream(true )
558+ Process proc = pb. start()
559+ ByteArrayOutputStream baos = new ByteArrayOutputStream ()
560+ byte [] buf = new byte [8192 ]
561+ long deadline = System . nanoTime() + SECONDS . toNanos(5 )
562+ InputStream is = proc. getInputStream()
563+ while (true ) {
564+ if (is. available() > 0 ) {
565+ int n = is. read(buf)
566+ if (n < 0 ) {
567+ break
568+ }
569+ // Bound the in-memory buffer so a runaway dump can't OOM us.
570+ if (baos. size() < DIAG_TAIL_BYTES * 2 ) {
571+ baos. write(buf, 0 , n)
572+ }
573+ } else if (! proc. isAlive()) {
574+ // Drain any remaining bytes after exit.
575+ int n
576+ while ((n = is. read(buf)) >= 0 ) {
577+ if (baos. size() < DIAG_TAIL_BYTES * 2 ) {
578+ baos. write(buf, 0 , n)
579+ }
580+ }
581+ break
582+ } else if (System . nanoTime() > deadline) {
583+ proc. destroyForcibly()
584+ proc. waitFor(1 , SECONDS )
585+ if (baos. size() == 0 ) {
586+ return " (jcmd timed out with no output)"
587+ }
588+ break
589+ } else {
590+ Thread . sleep(50 )
591+ }
592+ }
593+ String out = new String (baos. toByteArray(), StandardCharsets . UTF_8 )
594+ if (out. size() > DIAG_TAIL_BYTES ) {
595+ // Keep the head — application/agent threads tend to be earlier in the dump than
596+ // generic VM/GC/JIT threads, and the head is what reveals where main is wedged.
597+ return out. substring(0 , DIAG_TAIL_BYTES ) + " \n ...(truncated; full dump was ${ out.size()} bytes)..."
598+ }
599+ return out
600+ } catch (Throwable e) {
601+ return " (jcmd unavailable: ${ e.message} )"
602+ }
603+ }
604+
605+ private String resolveJcmdPath () {
606+ // On Java 8, java.home points to the JRE subdirectory, so jcmd is at ../bin/jcmd; on
607+ // Java 9+ it's at bin/jcmd. Try both, fall back to PATH.
608+ String javaHome = System . getProperty(" java.home" )
609+ if (javaHome != null ) {
610+ File direct = new File (javaHome, " bin/jcmd" )
611+ if (direct. canExecute()) {
612+ return direct. absolutePath
613+ }
614+ File jdkSibling = new File (new File (javaHome). parentFile, " bin/jcmd" )
615+ if (jdkSibling. canExecute()) {
616+ return jdkSibling. absolutePath
395617 }
396- def allLines = logFile. readLines()
397- def tail = allLines. size() > lines ? allLines[- lines.. -1 ] : allLines
398- return tail. join(" \n " )
399- } catch (Exception e) {
400- return " (failed to read log: ${ e.message} )"
401618 }
619+ return " jcmd"
402620 }
403621
404622 def parseTraceFromStdOut ( String line ) {
0 commit comments