Skip to content

Commit 470482e

Browse files
fix: check subprocess exit codes in Java tracer
_run_java_with_graceful_timeout() discarded the subprocess exit code in both the no-timeout and timeout paths. If Maven/Gradle failed (compilation error, OOM, etc.), the tracer silently continued with missing/stale data. Now returns the exit code. Stage 1 (JFR profiling) warns on failure but continues. Stage 2 (argument capture) raises RuntimeError since trace data is essential for replay test generation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 5ee642e commit 470482e

2 files changed

Lines changed: 170 additions & 10 deletions

File tree

codeflash/languages/java/tracer.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,19 @@
1717
GRACEFUL_SHUTDOWN_WAIT = 5 # seconds to wait after SIGTERM before SIGKILL
1818

1919

20-
def _run_java_with_graceful_timeout(
21-
java_command: list[str], env: dict[str, str], timeout: int, stage_name: str
22-
) -> None:
20+
def _run_java_with_graceful_timeout(java_command: list[str], env: dict[str, str], timeout: int, stage_name: str) -> int:
2321
"""Run a Java command with graceful timeout handling.
2422
2523
Sends SIGTERM first (allowing JFR dump and shutdown hooks to run),
2624
then SIGKILL if the process doesn't exit within GRACEFUL_SHUTDOWN_WAIT seconds.
25+
26+
Returns the process exit code, or -1 if the process was killed due to timeout.
2727
"""
2828
if not timeout:
29-
subprocess.run(java_command, env=env, check=False)
30-
return
29+
result = subprocess.run(java_command, env=env, check=False)
30+
if result.returncode != 0:
31+
logger.warning("%s exited with code %d", stage_name, result.returncode)
32+
return result.returncode
3133

3234
import signal
3335

@@ -45,6 +47,11 @@ def _run_java_with_graceful_timeout(
4547
logger.warning("%s stage did not exit after SIGTERM, sending SIGKILL", stage_name)
4648
proc.kill()
4749
proc.wait()
50+
return -1
51+
52+
if proc.returncode != 0:
53+
logger.warning("%s exited with code %d", stage_name, proc.returncode)
54+
return proc.returncode
4855

4956

5057
# --add-opens flags needed for Kryo serialization on Java 16+
@@ -78,21 +85,27 @@ def trace(
7885
jfr_file = trace_db_path.with_suffix(".jfr")
7986
trace_db_path.parent.mkdir(parents=True, exist_ok=True)
8087

81-
# Stage 1: JFR Profiling
88+
# Stage 1: JFR Profiling (non-fatal — JFR data is supplementary)
8289
logger.info("Stage 1: Running JFR profiling...")
8390
jfr_env = self.build_jfr_env(jfr_file)
84-
_run_java_with_graceful_timeout(java_command, jfr_env, timeout, "JFR profiling")
91+
jfr_exit = _run_java_with_graceful_timeout(java_command, jfr_env, timeout, "JFR profiling")
8592

86-
if not jfr_file.exists():
93+
if jfr_exit != 0:
94+
logger.warning("JFR profiling failed (exit code %d), continuing without profiling data", jfr_exit)
95+
elif not jfr_file.exists():
8796
logger.warning("JFR file was not created at %s", jfr_file)
8897

89-
# Stage 2: Argument Capture via Tracing Agent
98+
# Stage 2: Argument Capture via Tracing Agent (fatal — trace data is essential)
9099
logger.info("Stage 2: Running argument capture...")
91100
config_path = self.create_tracer_config(
92101
trace_db_path, packages, project_root=project_root, max_function_count=max_function_count, timeout=timeout
93102
)
94103
agent_env = self.build_agent_env(config_path)
95-
_run_java_with_graceful_timeout(java_command, agent_env, timeout, "Argument capture")
104+
capture_exit = _run_java_with_graceful_timeout(java_command, agent_env, timeout, "Argument capture")
105+
106+
if capture_exit != 0:
107+
msg = f"Argument capture failed with exit code {capture_exit} — cannot proceed without trace data"
108+
raise RuntimeError(msg)
96109

97110
if not trace_db_path.exists():
98111
logger.error("Trace database was not created at %s", trace_db_path)
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
from __future__ import annotations
2+
3+
from typing import TYPE_CHECKING
4+
from unittest.mock import MagicMock, patch
5+
6+
if TYPE_CHECKING:
7+
from pathlib import Path
8+
9+
import pytest
10+
11+
from codeflash.languages.java.tracer import JavaTracer, _run_java_with_graceful_timeout
12+
13+
14+
class TestRunJavaWithGracefulTimeout:
    """Exit-code contract of ``_run_java_with_graceful_timeout``.

    Two code paths are exercised, selected by the ``timeout`` argument:

    * ``timeout == 0`` -- the helper goes through ``subprocess.run``; the tests
      check that the mocked ``returncode`` is handed back unchanged.
    * ``timeout != 0`` -- the helper goes through ``subprocess.Popen``; the
      tests check that a normal exit hands back ``returncode`` and that an
      expired wait yields ``-1``.

    No real process is started: ``subprocess.run`` / ``subprocess.Popen`` are
    patched on the tracer module for the duration of each call.
    """

    _RUN_TARGET = "codeflash.languages.java.tracer.subprocess.run"
    _POPEN_TARGET = "codeflash.languages.java.tracer.subprocess.Popen"

    def _exit_code_without_timeout(self, returncode: int) -> int:
        """Call the helper with ``timeout=0`` while ``subprocess.run`` reports *returncode*."""
        completed = MagicMock()
        completed.returncode = returncode
        with patch(self._RUN_TARGET, return_value=completed):
            return _run_java_with_graceful_timeout(["java", "-version"], {}, 0, "test")

    def _exit_code_with_timeout(self, process: MagicMock) -> int:
        """Call the helper with ``timeout=60`` while ``subprocess.Popen`` yields *process*."""
        with patch(self._POPEN_TARGET, return_value=process):
            return _run_java_with_graceful_timeout(["java", "-version"], {}, 60, "test")

    def test_returns_zero_on_success(self) -> None:
        assert self._exit_code_without_timeout(0) == 0

    def test_returns_nonzero_on_failure(self) -> None:
        assert self._exit_code_without_timeout(1) == 1

    def test_returns_exit_code_137_oom_kill(self) -> None:
        assert self._exit_code_without_timeout(137) == 137

    def test_timeout_path_returns_zero_on_success(self) -> None:
        process = MagicMock()
        process.returncode = 0
        assert self._exit_code_with_timeout(process) == 0

    def test_timeout_path_returns_nonzero_on_failure(self) -> None:
        process = MagicMock()
        process.returncode = 1
        assert self._exit_code_with_timeout(process) == 1

    def test_timeout_returns_negative_one(self) -> None:
        import subprocess

        process = MagicMock()
        # The initial wait() expires; the wait that follows SIGTERM returns normally.
        process.wait.side_effect = [
            subprocess.TimeoutExpired(cmd="java", timeout=60),
            None,
        ]

        assert self._exit_code_with_timeout(process) == -1

    def test_timeout_sends_sigterm_then_sigkill(self) -> None:
        import signal
        import subprocess

        process = MagicMock()
        # Both the initial wait() and the post-SIGTERM wait() expire; only the
        # third wait() (after the kill) returns.
        process.wait.side_effect = [
            subprocess.TimeoutExpired(cmd="java", timeout=60),
            subprocess.TimeoutExpired(cmd="java", timeout=5),
            None,
        ]

        exit_code = self._exit_code_with_timeout(process)

        assert exit_code == -1
        process.send_signal.assert_called_once_with(signal.SIGTERM)
        process.kill.assert_called_once()
81+
82+
class TestJavaTracerExitCodeHandling:
83+
def test_stage1_failure_continues(self, tmp_path: Path) -> None:
84+
trace_db_path = (tmp_path / "trace.db").resolve()
85+
tracer = JavaTracer()
86+
87+
# Stage 1 fails (exit code 1), Stage 2 succeeds (exit code 0)
88+
exit_codes = iter([1, 0])
89+
90+
def mock_run_timeout(java_command: list[str], env: dict, timeout: int, stage_name: str) -> int:
91+
rc = next(exit_codes)
92+
if stage_name == "Argument capture":
93+
trace_db_path.write_bytes(b"fake-db")
94+
return rc
95+
96+
with (
97+
patch("codeflash.languages.java.tracer._run_java_with_graceful_timeout", side_effect=mock_run_timeout),
98+
patch.object(tracer, "build_jfr_env", return_value={}),
99+
patch.object(tracer, "build_agent_env", return_value={}),
100+
patch.object(tracer, "create_tracer_config", return_value=tmp_path / "config.json"),
101+
):
102+
trace_db, _jfr_file = tracer.trace(
103+
java_command=["java", "-cp", ".", "Main"], trace_db_path=trace_db_path, packages=["com.example"]
104+
)
105+
# Should complete despite Stage 1 failure
106+
assert trace_db == trace_db_path
107+
108+
def test_stage2_failure_raises(self, tmp_path: Path) -> None:
109+
trace_db_path = (tmp_path / "trace.db").resolve()
110+
tracer = JavaTracer()
111+
112+
# Stage 1 succeeds (exit code 0), Stage 2 fails (exit code 1)
113+
exit_codes = iter([0, 1])
114+
115+
def mock_run_timeout(java_command: list[str], env: dict, timeout: int, stage_name: str) -> int:
116+
return next(exit_codes)
117+
118+
with (
119+
patch("codeflash.languages.java.tracer._run_java_with_graceful_timeout", side_effect=mock_run_timeout),
120+
patch.object(tracer, "build_jfr_env", return_value={}),
121+
patch.object(tracer, "build_agent_env", return_value={}),
122+
patch.object(tracer, "create_tracer_config", return_value=tmp_path / "config.json"),
123+
pytest.raises(RuntimeError, match="Argument capture failed with exit code 1"),
124+
):
125+
tracer.trace(
126+
java_command=["java", "-cp", ".", "Main"], trace_db_path=trace_db_path, packages=["com.example"]
127+
)
128+
129+
def test_both_stages_succeed(self, tmp_path: Path) -> None:
130+
trace_db_path = (tmp_path / "trace.db").resolve()
131+
tracer = JavaTracer()
132+
133+
def mock_run_timeout(java_command: list[str], env: dict, timeout: int, stage_name: str) -> int:
134+
if stage_name == "Argument capture":
135+
trace_db_path.write_bytes(b"fake-db")
136+
return 0
137+
138+
with (
139+
patch("codeflash.languages.java.tracer._run_java_with_graceful_timeout", side_effect=mock_run_timeout),
140+
patch.object(tracer, "build_jfr_env", return_value={}),
141+
patch.object(tracer, "build_agent_env", return_value={}),
142+
patch.object(tracer, "create_tracer_config", return_value=tmp_path / "config.json"),
143+
):
144+
trace_db, _jfr_file = tracer.trace(
145+
java_command=["java", "-cp", ".", "Main"], trace_db_path=trace_db_path, packages=["com.example"]
146+
)
147+
assert trace_db == trace_db_path

0 commit comments

Comments
 (0)