Skip to content

Commit e1eebcc

Browse files
bm1549claude
andcommitted
Fix log injection smoke test flakiness from startup timeout
The `check raw file injection` test has been flaking across 11+ logging backend variants for months. CI Visibility data shows 90% of failures are `traceCount=0` at `waitForTraceCount(2)` after exactly 30s — the JVM + agent bytecode instrumentation simply takes >30s on overloaded CI machines. Changes: - Add `startupPoll` with 120s timeout for the initial `waitForTraceCount(2)` that covers JVM startup + agent init, giving 4x headroom over the current 30s `defaultPoll` - Add `waitForTraceCountAlive` that checks process liveness on each poll iteration, turning silent 30-120s timeouts into instant, actionable errors when the process crashes - Reorder `waitForTraceCount(4)` before `waitFor` to confirm all traces are delivered while the process is still alive - Assert `waitFor` return value for a clear error if the process hangs tag: no release note Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
1 parent b266e2d commit e1eebcc

File tree

1 file changed

+37
-6
lines changed

1 file changed

+37
-6
lines changed

dd-smoke-tests/log-injection/src/test/groovy/datadog/smoketest/LogInjectionSmokeTest.groovy

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import java.nio.charset.StandardCharsets
1010
import java.util.zip.GZIPInputStream
1111
import spock.lang.AutoCleanup
1212
import spock.lang.Shared
13+
import spock.util.concurrent.PollingConditions
1314

1415
import static datadog.trace.api.config.TraceInstrumentationConfig.TRACE_128_BIT_TRACEID_LOGGING_ENABLED
1516
import static datadog.trace.api.config.TracerConfig.TRACE_128_BIT_TRACEID_GENERATION_ENABLED
@@ -27,9 +28,15 @@ import static java.util.concurrent.TimeUnit.SECONDS
2728
* SECONDTRACEID TRACEID SPANID
2829
*/
2930
abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
30-
// Estimate for the amount of time instrumentation, plus request, plus some extra
31+
// Timeout for the process to complete after all traces have been received
3132
static final int TIMEOUT_SECS = 30
3233

34+
// Agent initialization (bytecode instrumentation of ~200 integrations) can take >30s on
35+
// overloaded CI machines. Use a longer timeout for the initial waitForTraceCount that covers
36+
// JVM startup + agent init + first trace flush.
37+
@Shared
38+
protected final PollingConditions startupPoll = new PollingConditions(timeout: 120, initialDelay: 0, delay: 1, factor: 1)
39+
3340
static final String LOG4J2_BACKEND = "Log4j2"
3441

3542
@Shared
@@ -347,6 +354,27 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
347354
return logEvent[key]
348355
}
349356

357+
/**
358+
* Like {@link AbstractSmokeTest#waitForTraceCount} but also checks that the process is still
359+
* alive on each poll iteration. Without this, a crashed process causes a silent 30s (or longer)
360+
* timeout instead of an immediate failure with a useful message.
361+
*/
362+
int waitForTraceCountAlive(int count, spock.util.concurrent.PollingConditions conditions) {
363+
conditions.eventually {
364+
if (traceDecodingFailure != null) {
365+
throw traceDecodingFailure
366+
}
367+
if (testedProcess != null && !testedProcess.isAlive()) {
368+
// Process died — fail immediately instead of waiting for full timeout
369+
throw new AssertionError(
370+
"Process exited with code ${testedProcess.exitValue()} while waiting for ${count} traces (have ${traceCount.get()}). " +
371+
"Check ${logFilePath} for errors.")
372+
}
373+
assert traceCount.get() >= count
374+
}
375+
traceCount.get()
376+
}
377+
350378
def parseTraceFromStdOut( String line ) {
351379
if (line == null) {
352380
throw new IllegalArgumentException("Line is null")
@@ -365,7 +393,9 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
365393
@Flaky(condition = () -> JavaVirtualMachine.isIbm8() || JavaVirtualMachine.isOracleJDK8())
366394
def "check raw file injection"() {
367395
when:
368-
def count = waitForTraceCount(2)
396+
// Use the longer startup timeout for the initial wait — agent initialization
397+
// (bytecode instrumentation) can take >30s on overloaded CI machines
398+
def count = waitForTraceCountAlive(2, startupPoll)
369399

370400
def newConfig = """
371401
{"lib_config":
@@ -374,14 +404,15 @@ abstract class LogInjectionSmokeTest extends AbstractSmokeTest {
374404
""".toString()
375405
setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", newConfig)
376406

377-
count = waitForTraceCount(3)
407+
count = waitForTraceCountAlive(3, defaultPoll)
378408

379409
setRemoteConfig("datadog/2/APM_TRACING/config_overrides/config", """{"lib_config":{}}""".toString())
380410

381-
testedProcess.waitFor(TIMEOUT_SECS, SECONDS)
382-
def exitValue = testedProcess.exitValue()
411+
// Wait for all 4 traces before waiting for process exit to ensure trace delivery is confirmed
412+
count = waitForTraceCountAlive(4, defaultPoll)
383413

384-
count = waitForTraceCount(4)
414+
assert testedProcess.waitFor(TIMEOUT_SECS, SECONDS) : "Process did not exit within ${TIMEOUT_SECS}s"
415+
def exitValue = testedProcess.exitValue()
385416

386417
def logLines = outputLogFile.readLines()
387418
println "log lines: " + logLines

0 commit comments

Comments
 (0)