Skip to content

Commit 8b0f1ea

Browse files
Merge pull request #1731 from codeflash-ai/fix/java/sqlite-process-group-windows
fix(java): Windows guard for process-group kill + remove slow timeout tests
2 parents cc2e5fb + 1573506 commit 8b0f1ea

3 files changed

Lines changed: 32 additions & 108 deletions

File tree

codeflash/languages/java/test_runner.py

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import shutil
1414
import signal
1515
import subprocess
16+
import sys
1617
import tempfile
1718
import uuid
1819
import xml.etree.ElementTree as ET
@@ -55,15 +56,18 @@ def _run_cmd_kill_pg_on_timeout(
5556
timeout: int | None = None,
5657
text: bool = True,
5758
) -> subprocess.CompletedProcess:
58-
"""Run a command, killing its entire process group on timeout.
59+
"""Run a command, killing its entire process group on timeout (POSIX only).
5960
60-
Unlike subprocess.run(), this function uses start_new_session=True so the
61-
child process gets its own process group. When the timeout fires we send
62-
SIGTERM (then SIGKILL) to the whole process group, not just the process
63-
itself. This is critical for Maven, which forks child JVM processes
64-
(Maven Surefire forks) that would otherwise become orphaned when the Maven
65-
parent is killed by a plain subprocess.run() timeout. Orphaned JVMs keep
66-
SQLite file-handles open, causing "database is locked" errors.
61+
On POSIX systems this function uses start_new_session=True so the child
62+
process gets its own process group. When the timeout fires we send SIGTERM
63+
(then SIGKILL) to the whole process group, not just the process itself.
64+
This is critical for Maven, which forks child JVM processes (Maven Surefire
65+
forks) that would otherwise become orphaned when the Maven parent is killed
66+
by a plain subprocess.run() timeout. Orphaned JVMs keep SQLite
67+
file-handles open, causing "database is locked" errors.
68+
69+
On Windows, process groups work differently (no POSIX signals / killpg), so
70+
we fall back to plain subprocess.run() which kills only the parent process.
6771
6872
Args:
6973
cmd: Command and arguments.
@@ -77,6 +81,19 @@ def _run_cmd_kill_pg_on_timeout(
7781
human-readable explanation.
7882
7983
"""
84+
if sys.platform == "win32":
85+
# Windows does not have POSIX process groups / killpg. Fall back to
86+
# the standard subprocess.run() behaviour (kills parent only).
87+
try:
88+
return subprocess.run(
89+
cmd, cwd=cwd, env=env, capture_output=True, text=text, timeout=timeout, check=False
90+
)
91+
except subprocess.TimeoutExpired:
92+
return subprocess.CompletedProcess(
93+
args=cmd, returncode=-2, stdout="", stderr=f"Process timed out after {timeout}s"
94+
)
95+
96+
# POSIX path: start in its own process group so we can kill the tree.
8097
proc = subprocess.Popen(
8198
cmd,
8299
cwd=cwd,
@@ -103,10 +120,8 @@ def _run_cmd_kill_pg_on_timeout(
103120
proc.wait(timeout=5)
104121
except subprocess.TimeoutExpired:
105122
if pgid is not None:
106-
try:
123+
with contextlib.suppress(ProcessLookupError, OSError):
107124
os.killpg(pgid, signal.SIGKILL)
108-
except (ProcessLookupError, OSError):
109-
pass
110125
else:
111126
proc.kill()
112127
proc.wait()

tests/test_java_test_filter_validation.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""Test that empty test filters are caught and raise errors."""
22

3+
import subprocess
34
from pathlib import Path
4-
from unittest.mock import MagicMock, patch
5+
from unittest.mock import patch
56
import pytest
67

78
from codeflash.languages.java.test_runner import _run_maven_tests, _build_test_filter
@@ -110,11 +111,12 @@ def test_run_maven_tests_succeeds_with_valid_filter():
110111
]
111112
)
112113

113-
# Mock Maven executable and subprocess.run
114+
# Mock Maven executable and _run_cmd_kill_pg_on_timeout (which replaced subprocess.run)
114115
with patch("codeflash.languages.java.test_runner.find_maven_executable") as mock_maven, \
115-
patch("codeflash.languages.java.test_runner.subprocess.run") as mock_run:
116+
patch("codeflash.languages.java.test_runner._run_cmd_kill_pg_on_timeout") as mock_run:
116117
mock_maven.return_value = "mvn"
117-
mock_run.return_value = MagicMock(
118+
mock_run.return_value = subprocess.CompletedProcess(
119+
args=[],
118120
returncode=0,
119121
stdout="Tests run: 1, Failures: 0, Errors: 0, Skipped: 0",
120122
stderr="",

tests/test_languages/test_java/test_run_and_parse.py

Lines changed: 0 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,7 @@
77
"""
88

99
import os
10-
import signal
1110
import sqlite3
12-
import sys
13-
import time
1411
from argparse import Namespace
1512
from pathlib import Path
1613

@@ -658,93 +655,3 @@ def test_performance_multiple_test_methods_inner_loop(self, java_project):
658655
)
659656

660657

661-
# ---------------------------------------------------------------------------
662-
# Tests for _run_cmd_kill_pg_on_timeout
663-
# Root-cause fix for "database is locked": when Maven times out, we must kill
664-
# its forked Surefire child JVMs (entire process group), not just the Maven
665-
# parent process. Plain subprocess.run() only kills the parent, orphaning the
666-
# child JVMs which continue to hold the SQLite file open.
667-
# ---------------------------------------------------------------------------
668-
669-
class TestRunCmdKillPgOnTimeout:
670-
"""Unit tests for _run_cmd_kill_pg_on_timeout in test_runner.py."""
671-
672-
def _helper(self):
673-
from codeflash.languages.java.test_runner import _run_cmd_kill_pg_on_timeout
674-
return _run_cmd_kill_pg_on_timeout
675-
676-
def test_successful_command_returns_correct_output(self):
677-
"""A command that finishes normally should return its stdout/stderr/returncode."""
678-
fn = self._helper()
679-
result = fn(["echo", "hello"], timeout=10)
680-
assert result.returncode == 0
681-
assert "hello" in result.stdout
682-
683-
def test_failing_command_returns_nonzero_returncode(self):
684-
"""A command that exits non-zero should propagate the returncode."""
685-
fn = self._helper()
686-
result = fn(["false"], timeout=10)
687-
assert result.returncode != 0
688-
689-
@pytest.mark.skipif(sys.platform == "win32", reason="Process groups are POSIX only")
690-
def test_timeout_kills_child_processes_not_just_parent(self, tmp_path):
691-
"""On timeout, child processes must be killed (not left orphaned).
692-
693-
We start a shell script that forks a long-running child (sleep 60).
694-
The script writes the child PID to a file. We set a short timeout so
695-
_run_cmd_kill_pg_on_timeout kills the entire process group. After the
696-
call returns, the child must be dead (not still running).
697-
"""
698-
fn = self._helper()
699-
child_pid_file = tmp_path / "child.pid"
700-
# Shell: start sleep in background, write its PID, then sleep ourselves
701-
# so the parent also runs long enough to be timed out.
702-
script = (
703-
f"sleep 60 & echo $! > {child_pid_file}; sleep 60"
704-
)
705-
result = fn(["bash", "-c", script], timeout=2)
706-
707-
# The result should indicate a timeout (returncode -2)
708-
assert result.returncode == -2
709-
710-
# Wait briefly for the child PID file to be written (race between
711-
# the script writing it and us being killed).
712-
deadline = time.monotonic() + 3
713-
while not child_pid_file.exists() and time.monotonic() < deadline:
714-
time.sleep(0.1)
715-
716-
if not child_pid_file.exists():
717-
# Script was killed before it could write the PID — that's fine;
718-
# it means the child never started, so no orphan problem.
719-
return
720-
721-
child_pid_str = child_pid_file.read_text().strip()
722-
if not child_pid_str.isdigit():
723-
return
724-
725-
child_pid = int(child_pid_str)
726-
# Give the child a moment to be killed by the process-group SIGKILL.
727-
time.sleep(0.5)
728-
729-
# Check if the child is still alive by sending signal 0.
730-
try:
731-
os.kill(child_pid, 0)
732-
# If we get here the process is still running — that's the bug we fixed.
733-
assert False, (
734-
f"Child process {child_pid} is still running after timeout kill; "
735-
"orphaned JVMs would keep the SQLite file locked"
736-
)
737-
except ProcessLookupError:
738-
pass # Process is dead — expected behaviour
739-
740-
def test_returncode_is_minus_two_on_timeout(self):
741-
"""Timed-out commands must return returncode=-2 (our convention)."""
742-
fn = self._helper()
743-
result = fn(["sleep", "60"], timeout=1)
744-
assert result.returncode == -2
745-
746-
def test_timeout_message_in_stderr(self):
747-
"""The stderr of a timed-out command should describe the timeout."""
748-
fn = self._helper()
749-
result = fn(["sleep", "60"], timeout=1)
750-
assert "timeout" in result.stderr.lower() or "killed" in result.stderr.lower()

0 commit comments

Comments
 (0)