Skip to content

Commit 79829a1

Browse files
Refactored dumping hanged tests by collecting child processes too.
1 parent 005aa33 commit 79829a1

5 files changed

Lines changed: 106 additions & 85 deletions

File tree

.gitlab-ci.yml

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ variables:
3131
GRADLE_PLUGIN_PROXY: "https://depot-read-api-java.us1.ddbuild.io/magicmirror/magicmirror/@current/"
3232
BUILDER_IMAGE_VERSION_PREFIX: "v26.01-" # use either an empty string (e.g. "") for latest images or a version followed by a hyphen (e.g. "v25.05-")
3333
REPO_NOTIFICATION_CHANNEL: "#apm-java-escalations"
34-
DEFAULT_TEST_JVMS: /^(8|11|17|21|25|stable)$/ # the latest "stable" version is 26
34+
DEFAULT_TEST_JVMS: /^(8|21|ibm8)$/ # the latest "stable" version is 26
3535
PROFILE_TESTS:
3636
description: "Enable profiling of tests"
3737
value: "false"
@@ -468,17 +468,17 @@ check_build_src:
468468
variables:
469469
GRADLE_TARGET: ":buildSrc:build"
470470

471-
check_base:
472-
extends: .check_job
473-
variables:
474-
GRADLE_TARGET: ":baseCheck"
475-
476-
check_inst:
477-
extends: .check_job
478-
parallel: 4
479-
variables:
480-
GRADLE_TARGET: ":instrumentationCheck"
481-
CACHE_TYPE: "inst"
471+
# check_base:
472+
# extends: .check_job
473+
# variables:
474+
# GRADLE_TARGET: ":baseCheck"
475+
#
476+
# check_inst:
477+
# extends: .check_job
478+
# parallel: 4
479+
# variables:
480+
# GRADLE_TARGET: ":instrumentationCheck"
481+
# CACHE_TYPE: "inst"
482482

483483
check_smoke:
484484
extends: .check_job
@@ -670,36 +670,36 @@ agent_integration_tests:
670670
DD_HOSTNAME: "local-agent"
671671
DD_API_KEY: "invalid_key_but_this_is_fine"
672672

673-
test_base:
674-
extends: .test_job
675-
variables:
676-
GRADLE_TARGET: ":baseTest"
677-
CACHE_TYPE: "base"
678-
parallel:
679-
matrix: *test_matrix_4
680-
script:
681-
- if [ "$testJvm" == "8" ]; then export GRADLE_PARAMS="-PskipFlakyTests -PcheckCoverage"; fi
682-
- !reference [.test_job, script]
683-
684-
test_inst:
685-
extends: .test_job_with_test_agent
686-
variables:
687-
GRADLE_TARGET: ":instrumentationTest"
688-
CACHE_TYPE: "inst"
689-
parallel:
690-
matrix: *test_matrix_8
691-
692-
test_inst_latest:
693-
extends: .test_job_with_test_agent
694-
variables:
695-
GRADLE_TARGET: ":instrumentationLatestDepTest"
696-
CACHE_TYPE: "latestdep"
697-
parallel:
698-
matrix:
699-
- testJvm: ["8", "17", "21", "25"] # the latest "stable" version is LTS v25
700-
# Gitlab doesn't support "parallel" and "parallel:matrix" at the same time
701-
# This emulates "parallel" by including it in the matrix
702-
CI_SPLIT: [ "1/6", "2/6", "3/6", "4/6", "5/6", "6/6"]
673+
# test_base:
674+
# extends: .test_job
675+
# variables:
676+
# GRADLE_TARGET: ":baseTest"
677+
# CACHE_TYPE: "base"
678+
# parallel:
679+
# matrix: *test_matrix_4
680+
# script:
681+
# - if [ "$testJvm" == "8" ]; then export GRADLE_PARAMS="-PskipFlakyTests -PcheckCoverage"; fi
682+
# - !reference [.test_job, script]
683+
684+
# test_inst:
685+
# extends: .test_job_with_test_agent
686+
# variables:
687+
# GRADLE_TARGET: ":instrumentationTest"
688+
# CACHE_TYPE: "inst"
689+
# parallel:
690+
# matrix: *test_matrix_8
691+
#
692+
# test_inst_latest:
693+
# extends: .test_job_with_test_agent
694+
# variables:
695+
# GRADLE_TARGET: ":instrumentationLatestDepTest"
696+
# CACHE_TYPE: "latestdep"
697+
# parallel:
698+
# matrix:
699+
# - testJvm: ["8", "17", "21", "25"] # the latest "stable" version is LTS v25
700+
# # Gitlab doesn't support "parallel" and "parallel:matrix" at the same time
701+
# # This emulates "parallel" by including it in the matrix
702+
# CI_SPLIT: [ "1/6", "2/6", "3/6", "4/6", "5/6", "6/6"]
703703

704704
test_flaky:
705705
extends: .test_job_with_test_agent

buildSrc/src/main/kotlin/datadog/gradle/plugin/dump/DumpHangedTestPlugin.kt

Lines changed: 58 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -120,43 +120,27 @@ class DumpHangedTestPlugin : Plugin<Project> {
120120

121121
dumpsDir.mkdirs()
122122

123-
fun file(name: String, ext: String = "log") =
124-
File(dumpsDir, "$name-${System.currentTimeMillis()}.$ext")
123+
val allProcessesFile = file(dumpsDir, "all-processes")
124+
runCmd(Redirect.to(allProcessesFile), "ps", "-ef")
125125

126-
// Collect all JVMs pids.
127-
val allJavaProcessesFile = file("all-java-processes")
128-
runCmd(Redirect.to(allJavaProcessesFile), "jcmd", "-l")
126+
val allProcesses = extractProcesses(allProcessesFile)
129127

130-
// On IBM JDK thread dump can be collected by signaling the matching `Gradle Test Executor` process with `kill -3`.
131-
// It will be written to `/tmp/javacore.YYYYMMDD.HHMMSS.PID.SEQ.txt`.
132-
if (isIbm8(allJavaProcessesFile)) {
133-
val allProcessesFile = file("all-processes")
134-
runCmd(Redirect.to(allProcessesFile), "ps", "-ef")
135-
extractPidsIbm8(allProcessesFile).forEach { ibm8Pid ->
136-
runCmd(Redirect.INHERIT, "kill", "-3", ibm8Pid)
137-
}
138-
} else {
139-
val pids = extractPids(allJavaProcessesFile)
128+
val gradleTestExecutors = allProcesses.filter { it.command.contains("Gradle Test Executor") }
129+
val childProcesses = collectChildProcesses(allProcesses, gradleTestExecutors)
140130

141-
pids.forEach { pid ->
142-
// Collect heap dump by pid.
143-
val heapDumpPath = file("${pid}-heap-dump", "hprof").absolutePath
144-
runCmd(Redirect.INHERIT, "jcmd", pid, "GC.heap_dump", heapDumpPath)
145-
146-
// Collect thread dump by pid.
147-
val threadDumpFile = file("${pid}-thread-dump")
148-
runCmd(Redirect.to(threadDumpFile), "jcmd", pid, "Thread.print", "-l")
149-
}
131+
(gradleTestExecutors + childProcesses).forEach { process -> collectDump(dumpsDir, process) }
150132

151-
// Just in case collect all thread dumps by using special PID `0`.
152-
val allThreadsFile = file("all-thread-dumps")
153-
runCmd(Redirect.to(allThreadsFile), "jcmd", "0", "Thread.print", "-l")
154-
}
133+
// Just in case, collect all thread dumps by using the special PID `0`.
134+
val allThreadsFile = file(dumpsDir, "all-thread-dumps")
135+
runCmd(Redirect.to(allThreadsFile), "jcmd", "0", "Thread.print", "-l")
155136
} catch (e: Throwable) {
156137
t.logger.warn("Taking dumps failed with error: ${e.message ?: e.javaClass.name}, for ${t.path}")
157138
}
158139
}
159140

141+
private fun file(baseDir: File, name: String, ext: String = "log") =
142+
File(baseDir, "$name-${System.currentTimeMillis()}.$ext")
143+
160144
private fun cleanup(t: Task) {
161145
val future = t.extra
162146
.takeIf { it.has(DUMP_FUTURE_KEY) }
@@ -183,23 +167,55 @@ class DumpHangedTestPlugin : Plugin<Project> {
183167
}
184168
}
185169

186-
private fun isIbm8(file: File): Boolean =
187-
file.readLines().any { it.contains("-PtestJvm=ibm8") }
170+
private data class ProcessInfo(
171+
val pid: String,
172+
val ppid: String,
173+
val command: String,
174+
val isIbm: Boolean
175+
)
188176

189-
private fun extractPids(file: File): List<String> =
190-
file.readLines()
191-
.filter { it.contains("Gradle Test Executor") }
192-
.map { it.substringBefore(' ') }
177+
private val whitespaceRegex = Regex("\\s+")
193178

194-
private fun extractPidsIbm8(file: File): List<String> =
179+
// `ps -ef` produces output in the format: `UID PID PPID C STIME TTY TIME CMD`
180+
private fun extractProcesses(file: File): List<ProcessInfo> =
195181
file.readLines()
196-
.filter { it.contains("Gradle Test Executor") }
197-
.filter { it.contains("ibm", ignoreCase = true) }
198-
.mapNotNull(::extractPid)
182+
.filter { it.contains("/bin/java") }
183+
.map {
184+
val parts = it.trimStart().split(whitespaceRegex, limit = 8)
185+
val command = parts.getOrNull(7) ?: ""
186+
187+
ProcessInfo(
188+
pid = parts[1],
189+
ppid = parts[2],
190+
command = command,
191+
isIbm = command.contains("/ibm8")
192+
)
193+
}
199194

200-
private val whitespaceRegex = Regex("\\s+")
195+
private fun collectChildProcesses(
196+
allProcesses: List<ProcessInfo>,
197+
gradleTestExecutors: List<ProcessInfo>
198+
): List<ProcessInfo> {
199+
val parentPids = gradleTestExecutors.map { it.pid }.toSet()
200+
return allProcesses.filter { parentPids.contains(it.ppid) }
201+
}
201202

202-
// `ps -ef` produces output in the format: UID PID PPID ...
203-
private fun extractPid(line: String): String? =
204-
line.trimStart().split(whitespaceRegex, limit = 3).getOrNull(1)
203+
private fun collectDump(
204+
baseDir: File,
205+
process: ProcessInfo
206+
) {
207+
if (process.isIbm) {
208+
// On an IBM JDK, a thread dump can be collected by signaling the process with `kill -3`.
209+
// It will be written to `/tmp/javacore.YYYYMMDD.HHMMSS.PID.SEQ.txt`.
210+
runCmd(Redirect.INHERIT, "kill", "-3", process.pid)
211+
} else {
212+
// Collect heap dump by pid.
213+
val heapDumpPath = file(baseDir, "${process.pid}-heap-dump", "hprof").absolutePath
214+
runCmd(Redirect.INHERIT, "jcmd", process.pid, "GC.heap_dump", heapDumpPath)
215+
216+
// Collect thread dump by pid.
217+
val threadDumpFile = file(baseDir, "${process.pid}-thread-dump", "log")
218+
runCmd(Redirect.to(threadDumpFile), "jcmd", process.pid, "Thread.print", "-l")
219+
}
220+
}
205221
}

dd-smoke-tests/concurrent/java-21/src/test/groovy/datadog/smoketest/concurrent/AbstractConcurrentTest.groovy

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ abstract class AbstractConcurrentTest extends AbstractSmokeTest {
6565
}
6666

6767
protected void receivedCorrectTrace() {
68-
waitForTrace(defaultPoll, checkTrace())
68+
waitForTrace(freezePoll, checkTrace())
6969
assert traceCount.get() == 1
7070
assert testedProcess.waitFor(TIMEOUT_SECS, SECONDS)
7171
assert testedProcess.exitValue() == 0

dd-smoke-tests/osgi/src/main/java/datadog/smoketest/osgi/app/OSGiApplication.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ public static void main(final String[] args) throws Exception {
7070

7171
framework.waitForStop(1_000);
7272

73+
Thread.sleep(1000 * 60 * 21); // TODO emulating hanging task on CI
74+
7375
// XXX: Knopflerfish will leave some dangling non-daemon thread and prevent shutdown here.
7476
System.exit(0);
7577
}

dd-smoke-tests/src/main/groovy/datadog/smoketest/AbstractSmokeTest.groovy

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ abstract class AbstractSmokeTest extends ProcessManager {
6868
@Shared
6969
protected final PollingConditions defaultPoll = new PollingConditions(timeout: 30, initialDelay: 0, delay: 1, factor: 1)
7070

71+
@Shared
72+
protected final PollingConditions freezePoll = new PollingConditions(timeout: 1260, initialDelay: 0, delay: 5, factor: 2)
73+
7174
@Shared
7275
@AutoCleanup
7376
protected TestHttpServer server = httpServer {

0 commit comments

Comments
 (0)