Skip to content

Commit 26425be

Browse files
snimu and claude authored
Add SubLLMEmptyModelResponseError and exception hierarchy tests (#894)
* Add SubLLMEmptyModelResponseError and exception hierarchy tests

  - Add SubLLMEmptyModelResponseError (inherits EmptyModelResponseError), raised when a sub-LLM call returns an empty response, making it easy to distinguish from root-model empty responses.
  - Add tests for the full RLM exception hierarchy: class inheritance, RLMSessionError, RLMWorkerError, RLMSetupError, RLMCodeExecutionTimeout abort/recovery paths, and SubLLMEmptyModelResponseError.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Clean up mock sessions in worker error tests

  Prevent atexit teardown from trying to stop mock worker processes left in executor._sessions by the tests.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ccef044 commit 26425be

2 files changed

Lines changed: 249 additions & 1 deletion

File tree

tests/test_rlm_env.py

Lines changed: 243 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,19 @@
1515
import pytest
1616
from datasets import Dataset
1717
from verifiers.envs.experimental import rlm_env as rlm_module
18-
from verifiers.envs.experimental.rlm_env import RLMEnv, RLMWorkerPaths
18+
from verifiers.envs.experimental.rlm_env import (
19+
RLMEnv,
20+
RLMWorkerPaths,
21+
RLMCodeExecutionTimeout,
22+
RLMSessionError,
23+
RLMSetupError,
24+
RLMWorkerError,
25+
RLMWorkerRecoveryError,
26+
SubLLMEmptyModelResponseError,
27+
LocalRLMReplSession,
28+
)
29+
import subprocess
30+
import verifiers as vf
1931

2032

2133
# =============================================================================
@@ -1440,3 +1452,233 @@ def test_no_https_prefix(self):
14401452
line = "something.trycloudflare.com without https"
14411453
url = extract_tunnel_url_from_line(line)
14421454
assert url is None
1455+
1456+
1457+
# =============================================================================
1458+
# 14. RLM Exception Hierarchy
1459+
# =============================================================================
1460+
1461+
1462+
class TestExceptionHierarchy:
    """Check where each RLM exception sits in the verifiers error hierarchy."""

    def test_rlm_session_error_is_sandbox_error(self):
        """RLMSessionError must derive from vf.SandboxError."""
        assert issubclass(RLMSessionError, vf.SandboxError)

    def test_rlm_setup_error_is_sandbox_error(self):
        """RLMSetupError must derive from vf.SandboxError."""
        assert issubclass(RLMSetupError, vf.SandboxError)

    def test_rlm_worker_error_is_sandbox_error(self):
        """RLMWorkerError must derive from vf.SandboxError."""
        assert issubclass(RLMWorkerError, vf.SandboxError)

    def test_rlm_worker_recovery_error_is_worker_error(self):
        """RLMWorkerRecoveryError must specialise RLMWorkerError."""
        assert issubclass(RLMWorkerRecoveryError, RLMWorkerError)

    def test_rlm_code_execution_timeout_is_tool_call_error(self):
        """RLMCodeExecutionTimeout must derive from vf.ToolCallError."""
        assert issubclass(RLMCodeExecutionTimeout, vf.ToolCallError)

    def test_sub_llm_empty_response_is_empty_model_response_error(self):
        """SubLLMEmptyModelResponseError must derive from vf.EmptyModelResponseError."""
        assert issubclass(SubLLMEmptyModelResponseError, vf.EmptyModelResponseError)

    def test_all_are_vf_errors(self):
        """Every RLM exception must be a vf.Error.

        This is what lets a single ``except vf.Error`` handler (such as the one
        in the rollout loop) catch all of them.
        """
        rlm_exception_classes = (
            RLMSessionError,
            RLMSetupError,
            RLMWorkerError,
            RLMWorkerRecoveryError,
            RLMCodeExecutionTimeout,
            SubLLMEmptyModelResponseError,
        )
        for exc_cls in rlm_exception_classes:
            # The message names the offending class so a failure is self-explanatory.
            assert issubclass(exc_cls, vf.Error), (
                f"{exc_cls.__name__} is not a vf.Error"
            )
1496+
1497+
1498+
class TestRLMSessionErrorRaised:
    """RLMSessionError is expected whenever a local session or its venv is missing."""

    def test_local_get_session_missing_rollout_id(self, rlm_env):
        """A state dict without ``rollout_id`` cannot be mapped to a session."""
        executor = rlm_env._executor
        with pytest.raises(RLMSessionError, match="Local session not initialized"):
            executor._get_session({})

    def test_local_get_session_unknown_rollout_id(self, rlm_env):
        """A ``rollout_id`` that was never registered has no session either."""
        executor = rlm_env._executor
        with pytest.raises(RLMSessionError, match="Local session not initialized"):
            executor._get_session({"rollout_id": "nonexistent"})

    @pytest.mark.asyncio
    async def test_local_start_worker_no_venv(self, rlm_env):
        """Starting a worker for a session whose ``venv_path`` is None must fail."""
        executor = rlm_env._executor
        session_kwargs = {
            "rollout_id": "test",
            "rollout_dir": "/tmp/test",
            "paths": MagicMock(),
            "fs_root": "/tmp/test/fs",
            "control_dir": "/tmp/test/control",
            "venv_path": None,  # the condition under test
        }
        session = LocalRLMReplSession(**session_kwargs)
        with pytest.raises(RLMSessionError, match="Local venv not initialized"):
            await executor._start_worker({}, session)
1527+
1528+
1529+
class TestRLMWorkerErrorRaised:
    """RLMWorkerError is expected when ``execute`` finds no live worker process."""

    @staticmethod
    def _session_with_worker(worker_process):
        """Build a local REPL session for rollout "test" around *worker_process*."""
        return LocalRLMReplSession(
            rollout_id="test",
            rollout_dir="/tmp/test",
            paths=MagicMock(),
            fs_root="/tmp/test/fs",
            control_dir="/tmp/test/control",
            worker_process=worker_process,
        )

    @staticmethod
    async def _expect_worker_not_running(executor, session):
        """Register *session*, assert ``execute`` raises, then unregister it.

        The session is always removed from ``executor._sessions`` afterwards so
        no mock worker is left behind in the executor once the test is over.
        """
        executor._sessions["test"] = session
        try:
            with pytest.raises(RLMWorkerError, match="RLM worker process not running"):
                await executor.execute(
                    {"code": "1+1", "seq": 1}, {"rollout_id": "test"}
                )
        finally:
            executor._sessions.pop("test", None)

    @pytest.mark.asyncio
    async def test_local_execute_worker_process_none(self, rlm_env):
        """A session that never had a worker process cannot execute code."""
        session = self._session_with_worker(None)
        await self._expect_worker_not_running(rlm_env._executor, session)

    @pytest.mark.asyncio
    async def test_local_execute_worker_process_exited(self, rlm_env):
        """A worker whose ``poll()`` reports an exit code counts as not running."""
        exited_process = MagicMock()
        exited_process.poll.return_value = 1  # process exited
        session = self._session_with_worker(exited_process)
        await self._expect_worker_not_running(rlm_env._executor, session)
1571+
1572+
1573+
class TestRLMSetupErrorRaised:
    """RLMSetupError is expected for each way a ``uv`` invocation can fail.

    Every test replaces ``asyncio.to_thread`` with an ``AsyncMock`` that
    simulates one failure mode, then calls ``_run_uv_command``.
    """

    @pytest.mark.asyncio
    async def test_uv_not_found(self, rlm_env):
        """A FileNotFoundError from the call is reported as uv missing on PATH."""
        executor = rlm_env._executor
        missing_binary = AsyncMock(side_effect=FileNotFoundError("uv"))
        to_thread_patch = patch("asyncio.to_thread", new=missing_binary)
        expected_error = pytest.raises(RLMSetupError, match="uv not found on PATH")
        with to_thread_patch, expected_error:
            await executor._run_uv_command(["uv", "venv", "/tmp/test"], timeout=30)

    @pytest.mark.asyncio
    async def test_uv_command_timeout(self, rlm_env):
        """A subprocess.TimeoutExpired from the call is reported as a uv timeout."""
        executor = rlm_env._executor
        timed_out = AsyncMock(side_effect=subprocess.TimeoutExpired("uv", 30))
        to_thread_patch = patch("asyncio.to_thread", new=timed_out)
        expected_error = pytest.raises(RLMSetupError, match="uv command timed out")
        with to_thread_patch, expected_error:
            await executor._run_uv_command(["uv", "venv", "/tmp/test"], timeout=30)

    @pytest.mark.asyncio
    async def test_uv_command_nonzero_exit(self, rlm_env):
        """A completed call with a non-zero return code is reported as a failure."""
        executor = rlm_env._executor
        failed_result = MagicMock(returncode=1, stderr="some error", stdout="")
        to_thread_patch = patch(
            "asyncio.to_thread", new=AsyncMock(return_value=failed_result)
        )
        expected_error = pytest.raises(RLMSetupError, match="uv command failed")
        with to_thread_patch, expected_error:
            await executor._run_uv_command(
                ["uv", "pip", "install", "foo"], timeout=30
            )
1607+
1608+
1609+
class TestRLMCodeExecutionTimeoutHandling:
    """Exercise what ``_execute_code`` does after a code execution timeout."""

    @staticmethod
    def _arm_timeout(rlm_env, abort):
        """Make every ``execute`` call time out and return a ready-worker state.

        Sets ``abort_on_code_timeout`` to *abort*, stubs the executor's
        ``execute`` to raise RLMCodeExecutionTimeout, and replaces
        ``prepare_filesystem`` and ``setup`` with plain AsyncMocks.
        """
        rlm_env.abort_on_code_timeout = abort
        executor = rlm_env._executor
        executor.execute = AsyncMock(side_effect=RLMCodeExecutionTimeout("timed out"))
        executor.prepare_filesystem = AsyncMock()
        executor.setup = AsyncMock()
        return {"rlm_worker_ready": True, "_exec_seq": 0}

    @pytest.mark.asyncio
    async def test_abort_on_timeout_raises_timeout_directly(self, rlm_env):
        """With abort enabled, the timeout exception reaches the caller."""
        state = self._arm_timeout(rlm_env, abort=True)
        with pytest.raises(RLMCodeExecutionTimeout):
            await rlm_env._execute_code("import time; time.sleep(999)", state)

    @pytest.mark.asyncio
    async def test_recovery_failure_raises_worker_recovery_error(self, rlm_env):
        """With abort disabled, a failed recovery raises RLMWorkerRecoveryError."""
        state = self._arm_timeout(rlm_env, abort=False)
        rlm_env._recover_from_code_timeout = AsyncMock(return_value=False)
        with pytest.raises(RLMWorkerRecoveryError, match="could not be restarted"):
            await rlm_env._execute_code("import time; time.sleep(999)", state)

    @pytest.mark.asyncio
    async def test_recovery_success_returns_error_result(self, rlm_env):
        """With abort disabled, a successful recovery returns an error result."""
        state = self._arm_timeout(rlm_env, abort=False)
        rlm_env._recover_from_code_timeout = AsyncMock(return_value=True)
        outcome = await rlm_env._execute_code("slow_code()", state)
        assert outcome["status"] == "error"
        assert "timed out" in outcome["result"]
1653+
1654+
1655+
class TestSubLLMEmptyModelResponseErrorRaised:
    """``_call_sub_llm_api`` must surface empty responses as the sub-LLM error type."""

    @pytest.mark.asyncio
    async def test_empty_response_from_sub_llm(self, rlm_env):
        """An empty-response error from the model call is re-raised as the subclass."""
        empty_reply = AsyncMock(
            side_effect=vf.EmptyModelResponseError("Model returned no response")
        )
        with patch.object(rlm_env, "get_model_response", new=empty_reply):
            state = {"sampling_args": {}}
            messages = [{"role": "user", "content": "hello"}]
            # The match confirms the original message text is carried over.
            with pytest.raises(SubLLMEmptyModelResponseError, match="no response"):
                await rlm_env._call_sub_llm_api(state, MagicMock(), "gpt-4", messages)

    @pytest.mark.asyncio
    async def test_sub_llm_empty_response_chains_cause(self, rlm_env):
        """The re-raised error keeps the original exception as ``__cause__``."""
        original = vf.EmptyModelResponseError("original error")
        failing_call = AsyncMock(side_effect=original)
        with patch.object(rlm_env, "get_model_response", new=failing_call):
            state = {"sampling_args": {}}
            messages = [{"role": "user", "content": "hello"}]
            with pytest.raises(SubLLMEmptyModelResponseError) as exc_info:
                await rlm_env._call_sub_llm_api(state, MagicMock(), "gpt-4", messages)
            # Identity, not equality: it must be the very same exception object.
            assert exc_info.value.__cause__ is original

verifiers/envs/experimental/rlm_env.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,10 @@ def _merge_tool_lists(
146146
return deduped_all, deduped_map
147147

148148

149+
class SubLLMEmptyModelResponseError(vf.EmptyModelResponseError):
    """Empty model response that came from a sub-LLM call.

    Subclasses ``vf.EmptyModelResponseError`` so handlers written for the base
    class still catch it, while giving sub-LLM failures a distinct type.
    """
151+
152+
149153
class RLMCodeExecutionTimeout(vf.ToolCallError):
150154
"""Raised when code execution exceeds the configured timeout."""
151155

@@ -3151,6 +3155,8 @@ async def _call_sub_llm_api(
31513155
f"Sub-LLM API call timed out after {self.sub_llm_api_timeout}s"
31523156
)
31533157
return None
3158+
except vf.EmptyModelResponseError as e:
3159+
raise SubLLMEmptyModelResponseError(str(e)) from e
31543160
except Exception as e:
31553161
raise e
31563162

0 commit comments

Comments
 (0)