Skip to content

Commit 9a554fc

Browse files
bm1549claude
andcommitted
Diagnose log injection smoke test flakiness instead of masking it
The `check raw file injection` test flakes across 11+ logging backend variants. CI Visibility data shows the failure is bimodal — successful runs complete in 3-9s, but failures sit at exactly 30s (the PollingConditions timeout) with traceCount=0. Nothing in between. This means the process either works or is totally broken — no amount of timeout increase will help. The current test is blind during the 30s wait — it just polls traceCount with no diagnostics when the process crashes or hangs. Changes: - Add `waitForTraceCountAlive` that checks process liveness on every poll iteration. If the process dies, it fails immediately with the exit code, RC poll count, and last 20 lines of process output. - On timeout, enrich the error with diagnostic state (process alive?, traceCount, RC polls received, last 30 lines of output) so the next CI failure tells us whether it's a crash, a hang, or a connectivity issue. - Reorder `waitForTraceCount(4)` before `waitFor` to confirm all traces are delivered while the process is still alive. - Assert `waitFor` return value for a clear error if the process hangs. tag: no release notes Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
1 parent e1eebcc commit 9a554fc

File tree

1 file changed

+44
-26
lines changed

1 file changed

+44
-26
lines changed

dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy

Lines changed: 44 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import java.nio.charset.StandardCharsets
1010
import java.util.zip.GZIPInputStream
1111
import spock.lang.AutoCleanup
1212
import spock.lang.Shared
13-
import spock.util.concurrent.PollingConditions
1413

1514
import static datadog.trace.api.config.TraceInstrumentationConfig.TRACE_128_BIT_TRACEID_LOGGING_ENABLED
1615
import static datadog.trace.api.config.TracerConfig.TRACE_128_BIT_TRACEID_GENERATION_ENABLED
@@ -28,15 +27,9 @@ import static java.util.concurrent.TimeUnit.SECONDS
2827
* SECONDTRACEID TRACEID SPANID
2928
*/
3029
abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
31-
// Timeout for the process to complete after all traces have been received
30+
// Estimate for the amount of time instrumentation, plus request, plus some extra
3231
static final int TIMEOUT_SECS = 30
3332

34-
// Agent initialization (bytecode instrumentation of ~200 integrations) can take >30s on
35-
// overloaded CI machines. Use a longer timeout for the initial waitForTraceCount that covers
36-
// JVM startup + agent init + first trace flush.
37-
@Shared
38-
protected final PollingConditions startupPoll = new PollingConditions(timeout: 120, initialDelay: 0, delay: 1, factor: 1)
39-
4033
static final String LOG4J2_BACKEND = "Log4j2"
4134

4235
@Shared
@@ -355,26 +348,53 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
355348
}
356349

357350
/**
358-
* Like {@link AbstractSmokeTest#waitForTraceCount} but also checks that the process is still
359-
* alive on each poll iteration. Without this, a crashed process causes a silent 30s (or longer)
360-
* timeout instead of an immediate failure with a useful message.
351+
* Like {@link AbstractSmokeTest#waitForTraceCount} but checks process liveness on every poll
352+
* iteration and dumps diagnostic state on failure, so CI failures produce actionable output
353+
* instead of a bare "Condition not satisfied" after a 30s timeout.
361354
*/
362-
int waitForTraceCountAlive(int count, spock.util.concurrent.PollingConditions conditions) {
363-
conditions.eventually {
364-
if (traceDecodingFailure != null) {
365-
throw traceDecodingFailure
355+
int waitForTraceCountAlive(int count) {
356+
try {
357+
defaultPoll.eventually {
358+
if (traceDecodingFailure != null) {
359+
throw traceDecodingFailure
360+
}
361+
if (testedProcess != null && !testedProcess.isAlive()) {
362+
def lastLines = tailProcessLog(20)
363+
throw new AssertionError(
364+
"Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces " +
365+
"(received ${traceCount.get()}, RC polls: ${rcClientMessages.size()}).\n" +
366+
"Last process output:\n${lastLines}")
367+
}
368+
assert traceCount.get() >= count
366369
}
367-
if (testedProcess != null && !testedProcess.isAlive()) {
368-
// Process died — fail immediately instead of waiting for full timeout
369-
throw new AssertionError(
370-
"Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces (have ${traceCount.get()}). " +
371-
"Check ${logFilePath} for errors.")
370+
} catch (Throwable e) {
371+
if (e instanceof AssertionError && e.message?.startsWith("Process exited")) {
372+
throw e // Already enriched — don't wrap again
372373
}
373-
assert traceCount.get() >= count
374+
// The default error ("Condition not satisfied after 30s") is useless — enrich with diagnostic state
375+
def alive = testedProcess?.isAlive()
376+
def lastLines = tailProcessLog(30)
377+
throw new AssertionError(
378+
"Timed out waiting for ${count} traces after ${defaultPoll.timeout}s. " +
379+
"traceCount=${traceCount.get()}, process.alive=${alive}, " +
380+
"RC polls received: ${rcClientMessages.size()}.\n" +
381+
"Last process output:\n${lastLines}", e)
374382
}
375383
traceCount.get()
376384
}
377385

386+
private String tailProcessLog(int lines) {
387+
try {
388+
def logFile = new File(logFilePath)
389+
if (!logFile.exists()) return "(log file does not exist: ${logFilePath})"
390+
def allLines = logFile.readLines()
391+
def tail = allLines.size() > lines ? allLines[-lines..-1] : allLines
392+
return tail.join("\n")
393+
} catch (Exception e) {
394+
return "(failed to read log: ${e.message})"
395+
}
396+
}
397+
378398
def parseTraceFromStdOut( String line ) {
379399
if (line == null) {
380400
throw new IllegalArgumentException("Line is null")
@@ -393,9 +413,7 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
393413
@Flaky(condition = () -> JavaVirtualMachine.isIbm8() || JavaVirtualMachine.isOracleJDK8())
394414
def "check raw file injection"() {
395415
when:
396-
// Use the longer startup timeout for the initial wait — agent initialization
397-
// (bytecode instrumentation) can take >30s on overloaded CI machines
398-
def count = waitForTraceCountAlive(2, startupPoll)
416+
def count = waitForTraceCountAlive(2)
399417

400418
def newConfig = """
401419
{"lib_config":
@@ -404,12 +422,12 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
404422
""".toString()
405423
setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", newConfig)
406424

407-
count = waitForTraceCountAlive(3, defaultPoll)
425+
count = waitForTraceCountAlive(3)
408426

409427
setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", """{"lib_config":{}}""".toString())
410428

411429
// Wait for all 4 traces before waiting for process exit to ensure trace delivery is confirmed
412-
count = waitForTraceCountAlive(4, defaultPoll)
430+
count = waitForTraceCountAlive(4)
413431

414432
assert testedProcess.waitFor(TIMEOUT_SECS, SECONDS) : "Process did not exit within ${TIMEOUT_SECS}s"
415433
def exitValue = testedProcess.exitValue()

0 commit comments

Comments
 (0)