Skip to content

Commit cc2e5fb

Browse files
Merge pull request #1728 from codeflash-ai/fix/java/sqlite-locked-error
fix(java): resolve SQLite 'database is locked' errors across the pipeline
2 parents 400cf06 + 16d314b commit cc2e5fb

2 files changed

Lines changed: 183 additions & 30 deletions

File tree

codeflash/languages/java/test_runner.py

Lines changed: 88 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import os
1212
import re
1313
import shutil
14+
import signal
1415
import subprocess
1516
import tempfile
1617
import uuid
@@ -46,6 +47,83 @@
4647
_VALID_JAVA_CLASS_NAME = re.compile(r"^[a-zA-Z_$][a-zA-Z0-9_$.]*$")
4748

4849

50+
def _run_cmd_kill_pg_on_timeout(
51+
cmd: list[str],
52+
*,
53+
cwd: Path | None = None,
54+
env: dict[str, str] | None = None,
55+
timeout: int | None = None,
56+
text: bool = True,
57+
) -> subprocess.CompletedProcess:
58+
"""Run a command, killing its entire process group on timeout.
59+
60+
Unlike subprocess.run(), this function uses start_new_session=True so the
61+
child process gets its own process group. When the timeout fires we send
62+
SIGTERM (then SIGKILL) to the whole process group, not just the process
63+
itself. This is critical for Maven, which forks child JVM processes
64+
(Maven Surefire forks) that would otherwise become orphaned when the Maven
65+
parent is killed by a plain subprocess.run() timeout. Orphaned JVMs keep
66+
SQLite file-handles open, causing "database is locked" errors.
67+
68+
Args:
69+
cmd: Command and arguments.
70+
cwd: Working directory.
71+
env: Environment variables.
72+
timeout: Seconds to wait before killing the process group.
73+
text: If True, decode stdout/stderr as text.
74+
75+
Returns:
76+
CompletedProcess. On timeout, returncode is -2 and stderr contains a
77+
human-readable explanation.
78+
79+
"""
80+
proc = subprocess.Popen(
81+
cmd,
82+
cwd=cwd,
83+
env=env,
84+
stdout=subprocess.PIPE,
85+
stderr=subprocess.PIPE,
86+
text=text,
87+
start_new_session=True, # puts proc in its own process group
88+
)
89+
try:
90+
stdout, stderr = proc.communicate(timeout=timeout)
91+
return subprocess.CompletedProcess(args=cmd, returncode=proc.returncode, stdout=stdout, stderr=stderr)
92+
except subprocess.TimeoutExpired:
93+
# Kill the entire process group so Maven's forked Surefire JVMs don't
94+
# become orphans that keep the SQLite database locked.
95+
pgid = None
96+
try:
97+
pgid = os.getpgid(proc.pid)
98+
os.killpg(pgid, signal.SIGTERM)
99+
except (ProcessLookupError, OSError):
100+
proc.kill()
101+
# Give processes a few seconds to shut down gracefully before SIGKILL.
102+
try:
103+
proc.wait(timeout=5)
104+
except subprocess.TimeoutExpired:
105+
if pgid is not None:
106+
try:
107+
os.killpg(pgid, signal.SIGKILL)
108+
except (ProcessLookupError, OSError):
109+
pass
110+
else:
111+
proc.kill()
112+
proc.wait()
113+
# Drain pipes so we don't leave zombie pipe buffers.
114+
try:
115+
stdout_data = proc.stdout.read() if proc.stdout else ""
116+
stderr_data = proc.stderr.read() if proc.stderr else ""
117+
except Exception:
118+
stdout_data, stderr_data = "", ""
119+
return subprocess.CompletedProcess(
120+
args=cmd,
121+
returncode=-2,
122+
stdout=stdout_data,
123+
stderr=f"Process group killed after timeout ({timeout}s): {stderr_data}",
124+
)
125+
126+
49127
def _validate_java_class_name(class_name: str) -> bool:
50128
"""Validate that a string is a valid Java class name.
51129
@@ -505,14 +583,7 @@ def _compile_tests(
505583
logger.debug("Compiling tests: %s in %s", " ".join(cmd), project_root)
506584

507585
try:
508-
return subprocess.run(
509-
cmd, check=False, cwd=project_root, env=env, capture_output=True, text=True, timeout=timeout
510-
)
511-
except subprocess.TimeoutExpired:
512-
logger.exception("Maven compilation timed out after %d seconds", timeout)
513-
return subprocess.CompletedProcess(
514-
args=cmd, returncode=-2, stdout="", stderr=f"Compilation timed out after {timeout} seconds"
515-
)
586+
return _run_cmd_kill_pg_on_timeout(cmd, cwd=project_root, env=env, timeout=timeout)
516587
except Exception as e:
517588
logger.exception("Maven compilation failed: %s", e)
518589
return subprocess.CompletedProcess(args=cmd, returncode=-1, stdout="", stderr=str(e))
@@ -548,9 +619,7 @@ def _get_test_classpath(
548619
logger.debug("Getting classpath: %s", " ".join(cmd))
549620

550621
try:
551-
result = subprocess.run(
552-
cmd, check=False, cwd=project_root, env=env, capture_output=True, text=True, timeout=timeout
553-
)
622+
result = _run_cmd_kill_pg_on_timeout(cmd, cwd=project_root, env=env, timeout=timeout)
554623

555624
if result.returncode != 0:
556625
logger.error("Failed to get classpath: %s", result.stderr)
@@ -600,9 +669,6 @@ def _get_test_classpath(
600669

601670
return os.pathsep.join(cp_parts)
602671

603-
except subprocess.TimeoutExpired:
604-
logger.exception("Getting classpath timed out")
605-
return None
606672
except Exception as e:
607673
logger.exception("Failed to get classpath: %s", e)
608674
return None
@@ -804,14 +870,7 @@ def _run_tests_direct(
804870
logger.debug("Running tests directly: java -cp ... ConsoleLauncher --select-class %s", test_classes)
805871

806872
try:
807-
return subprocess.run(
808-
cmd, check=False, cwd=working_dir, env=env, capture_output=True, text=True, timeout=timeout
809-
)
810-
except subprocess.TimeoutExpired:
811-
logger.exception("Direct test execution timed out after %d seconds", timeout)
812-
return subprocess.CompletedProcess(
813-
args=cmd, returncode=-2, stdout="", stderr=f"Test execution timed out after {timeout} seconds"
814-
)
873+
return _run_cmd_kill_pg_on_timeout(cmd, cwd=working_dir, env=env, timeout=timeout)
815874
except Exception as e:
816875
logger.exception("Direct test execution failed: %s", e)
817876
return subprocess.CompletedProcess(args=cmd, returncode=-1, stdout="", stderr=str(e))
@@ -1511,9 +1570,13 @@ def _run_maven_tests(
15111570
logger.debug("Running Maven command: %s in %s", " ".join(cmd), project_root)
15121571

15131572
try:
1514-
result = subprocess.run(
1515-
cmd, check=False, cwd=project_root, env=env, capture_output=True, text=True, timeout=timeout
1516-
)
1573+
# Use _run_cmd_kill_pg_on_timeout instead of subprocess.run so that on
1574+
# timeout we kill the entire Maven process GROUP (including forked Surefire
1575+
# JVMs). With plain subprocess.run(), only the Maven parent is killed and
1576+
# the child JVMs become orphaned, holding the SQLite result file open and
1577+
# causing "database is locked" errors when Python reads the file immediately
1578+
# after Maven is killed.
1579+
result = _run_cmd_kill_pg_on_timeout(cmd, cwd=project_root, env=env, timeout=timeout)
15171580

15181581
# Check if Maven failed due to compilation errors (not just test failures)
15191582
if result.returncode != 0:
@@ -1546,11 +1609,6 @@ def _run_maven_tests(
15461609

15471610
return result
15481611

1549-
except subprocess.TimeoutExpired:
1550-
logger.exception("Maven test execution timed out after %d seconds", timeout)
1551-
return subprocess.CompletedProcess(
1552-
args=cmd, returncode=-2, stdout="", stderr=f"Test execution timed out after {timeout} seconds"
1553-
)
15541612
except Exception as e:
15551613
logger.exception("Maven test execution failed: %s", e)
15561614
return subprocess.CompletedProcess(args=cmd, returncode=-1, stdout="", stderr=str(e))

tests/test_languages/test_java/test_run_and_parse.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,10 @@
77
"""
88

99
import os
10+
import signal
1011
import sqlite3
12+
import sys
13+
import time
1114
from argparse import Namespace
1215
from pathlib import Path
1316

@@ -653,3 +656,95 @@ def test_performance_multiple_test_methods_inner_loop(self, java_project):
653656
f"total_passed_runtime {total_runtime / 1_000_000:.3f}ms not close to expected "
654657
f"{expected_total_ns / 1_000_000:.1f}ms (2 methods × min of 4 runtimes × 10ms, ±3%)"
655658
)
659+
660+
661+
# ---------------------------------------------------------------------------
662+
# Tests for _run_cmd_kill_pg_on_timeout
663+
# Root-cause fix for "database is locked": when Maven times out, we must kill
664+
# its forked Surefire child JVMs (entire process group), not just the Maven
665+
# parent process. Plain subprocess.run() only kills the parent, orphaning the
666+
# child JVMs which continue to hold the SQLite file open.
667+
# ---------------------------------------------------------------------------
668+
669+
class TestRunCmdKillPgOnTimeout:
670+
"""Unit tests for _run_cmd_kill_pg_on_timeout in test_runner.py."""
671+
672+
def _helper(self):
673+
from codeflash.languages.java.test_runner import _run_cmd_kill_pg_on_timeout
674+
return _run_cmd_kill_pg_on_timeout
675+
676+
def test_successful_command_returns_correct_output(self):
677+
"""A command that finishes normally should return its stdout/stderr/returncode."""
678+
fn = self._helper()
679+
result = fn(["echo", "hello"], timeout=10)
680+
assert result.returncode == 0
681+
assert "hello" in result.stdout
682+
683+
def test_failing_command_returns_nonzero_returncode(self):
684+
"""A command that exits non-zero should propagate the returncode."""
685+
fn = self._helper()
686+
result = fn(["false"], timeout=10)
687+
assert result.returncode != 0
688+
689+
@pytest.mark.skipif(sys.platform == "win32", reason="Process groups are POSIX only")
690+
def test_timeout_kills_child_processes_not_just_parent(self, tmp_path):
691+
"""On timeout, child processes must be killed (not left orphaned).
692+
693+
We start a shell script that forks a long-running child (sleep 60).
694+
The script writes the child PID to a file. We set a short timeout so
695+
_run_cmd_kill_pg_on_timeout kills the entire process group. After the
696+
call returns, the child must be dead (not still running).
697+
"""
698+
fn = self._helper()
699+
child_pid_file = tmp_path / "child.pid"
700+
# Shell: start sleep in background, write its PID, then sleep ourselves
701+
# so the parent also runs long enough to be timed out.
702+
script = (
703+
f"sleep 60 & echo $! > {child_pid_file}; sleep 60"
704+
)
705+
result = fn(["bash", "-c", script], timeout=2)
706+
707+
# The result should indicate a timeout (returncode -2)
708+
assert result.returncode == -2
709+
710+
# Wait briefly for the child PID file to be written (race between
711+
# the script writing it and us being killed).
712+
deadline = time.monotonic() + 3
713+
while not child_pid_file.exists() and time.monotonic() < deadline:
714+
time.sleep(0.1)
715+
716+
if not child_pid_file.exists():
717+
# Script was killed before it could write the PID — that's fine;
718+
# it means the child never started, so no orphan problem.
719+
return
720+
721+
child_pid_str = child_pid_file.read_text().strip()
722+
if not child_pid_str.isdigit():
723+
return
724+
725+
child_pid = int(child_pid_str)
726+
# Give the child a moment to be killed by the process-group SIGKILL.
727+
time.sleep(0.5)
728+
729+
# Check if the child is still alive by sending signal 0.
730+
try:
731+
os.kill(child_pid, 0)
732+
# If we get here the process is still running — that's the bug we fixed.
733+
assert False, (
734+
f"Child process {child_pid} is still running after timeout kill; "
735+
"orphaned JVMs would keep the SQLite file locked"
736+
)
737+
except ProcessLookupError:
738+
pass # Process is dead — expected behaviour
739+
740+
def test_returncode_is_minus_two_on_timeout(self):
741+
"""Timed-out commands must return returncode=-2 (our convention)."""
742+
fn = self._helper()
743+
result = fn(["sleep", "60"], timeout=1)
744+
assert result.returncode == -2
745+
746+
def test_timeout_message_in_stderr(self):
747+
"""The stderr of a timed-out command should describe the timeout."""
748+
fn = self._helper()
749+
result = fn(["sleep", "60"], timeout=1)
750+
assert "timeout" in result.stderr.lower() or "killed" in result.stderr.lower()

0 commit comments

Comments
 (0)