From fe6fd5aa329d0e12d955f9871c4a1487865fe929 Mon Sep 17 00:00:00 2001 From: Andres Tiko Date: Tue, 24 Mar 2026 15:53:24 +0200 Subject: [PATCH 1/2] [#37930][Runners] Reap child processes in ProcessManager to prevent zombie accumulation ProcessManager.stopProcess() calls destroy()/destroyForcibly() to terminate child processes, but never calls Process.waitFor() to collect the exit status. On POSIX systems, this means the terminated child process entry remains in the kernel process table as a zombie (state Z/defunct) until the parent process itself exits. In long-running environments like Flink TaskManagers using --environment_type=PROCESS, the expansion service processes are repeatedly spawned and stopped but never reaped. Over time this leads to significant zombie accumulation (176+ observed on production hosts). The fix adds process.waitFor() calls after process termination in: - stopProcess(): after the destroy/destroyForcibly sequence - killAllProcesses(): after destroyForcibly in the shutdown hook path Fixes #37930 --- .../environment/ProcessManager.java | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/environment/ProcessManager.java b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/environment/ProcessManager.java index 3570fef00df1..86299762783c 100644 --- a/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/environment/ProcessManager.java +++ b/runners/java-fn-execution/src/main/java/org/apache/beam/runners/fnexecution/environment/ProcessManager.java @@ -203,6 +203,14 @@ private void stopProcess(String id, Process process) { } } } + // Reap the child process to prevent zombie accumulation. destroy()/destroyForcibly() send + // signals but do not call waitpid(), so the terminated process remains in the kernel process + // table as a zombie until waitFor() collects its exit status. + try { + process.waitFor(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } } /** Returns true if the process exists within maxWaitTimeMillis. */ @@ -249,8 +257,16 @@ private void stopAllProcesses() { processes.forEach((id, process) -> process.destroy()); } - /** Kill all remaining processes forcibly, i.e. upon JVM shutdown */ + /** Kill all remaining processes forcibly and reap them, i.e. upon JVM shutdown. */ private void killAllProcesses() { - processes.forEach((id, process) -> process.destroyForcibly()); + processes.forEach( + (id, process) -> { + process.destroyForcibly(); + try { + process.waitFor(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + }); } } From 9ea8bedbc3c7e359d1f3cca961dacda820ac111e Mon Sep 17 00:00:00 2001 From: Andres Tiko Date: Tue, 24 Mar 2026 16:05:26 +0200 Subject: [PATCH 2/2] Update CHANGES.md with ProcessManager zombie fix --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index e91da103c30e..dc442cc964c7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -84,6 +84,7 @@ ## Bugfixes * Fixed X (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). +* Fixed ProcessManager not reaping child processes, causing zombie process accumulation on long-running Flink deployments (Java) ([#37930](https://github.com/apache/beam/issues/37930)). ## Known Issues