|
15 | 15 | import pytest |
16 | 16 | from datasets import Dataset |
17 | 17 | from verifiers.envs.experimental import rlm_env as rlm_module |
18 | | -from verifiers.envs.experimental.rlm_env import RLMEnv, RLMWorkerPaths |
| 18 | +from verifiers.envs.experimental.rlm_env import ( |
| 19 | + RLMEnv, |
| 20 | + RLMWorkerPaths, |
| 21 | + RLMCodeExecutionTimeout, |
| 22 | + RLMSessionError, |
| 23 | + RLMSetupError, |
| 24 | + RLMWorkerError, |
| 25 | + RLMWorkerRecoveryError, |
| 26 | + SubLLMEmptyModelResponseError, |
| 27 | + LocalRLMReplSession, |
| 28 | +) |
| 29 | +import subprocess |
| 30 | +import verifiers as vf |
19 | 31 |
|
20 | 32 |
|
21 | 33 | # ============================================================================= |
@@ -1440,3 +1452,233 @@ def test_no_https_prefix(self): |
1440 | 1452 | line = "something.trycloudflare.com without https" |
1441 | 1453 | url = extract_tunnel_url_from_line(line) |
1442 | 1454 | assert url is None |
| 1455 | + |
| 1456 | + |
| 1457 | +# ============================================================================= |
| 1458 | +# 14. RLM Exception Hierarchy |
| 1459 | +# ============================================================================= |
| 1460 | + |
| 1461 | + |
class TestExceptionHierarchy:
    """Check where each RLM exception sits in the verifiers error hierarchy."""

    def test_rlm_session_error_is_sandbox_error(self):
        """RLMSessionError derives from vf.SandboxError."""
        assert issubclass(RLMSessionError, vf.SandboxError)

    def test_rlm_setup_error_is_sandbox_error(self):
        """RLMSetupError derives from vf.SandboxError."""
        assert issubclass(RLMSetupError, vf.SandboxError)

    def test_rlm_worker_error_is_sandbox_error(self):
        """RLMWorkerError derives from vf.SandboxError."""
        assert issubclass(RLMWorkerError, vf.SandboxError)

    def test_rlm_worker_recovery_error_is_worker_error(self):
        """RLMWorkerRecoveryError is a specialization of RLMWorkerError."""
        assert issubclass(RLMWorkerRecoveryError, RLMWorkerError)

    def test_rlm_code_execution_timeout_is_tool_call_error(self):
        """RLMCodeExecutionTimeout derives from vf.ToolCallError."""
        assert issubclass(RLMCodeExecutionTimeout, vf.ToolCallError)

    def test_sub_llm_empty_response_is_empty_model_response_error(self):
        """SubLLMEmptyModelResponseError derives from vf.EmptyModelResponseError."""
        assert issubclass(SubLLMEmptyModelResponseError, vf.EmptyModelResponseError)

    def test_all_are_vf_errors(self):
        """All RLM exceptions should be caught by the rollout loop's except vf.Error."""
        rlm_exception_types = (
            RLMSessionError,
            RLMSetupError,
            RLMWorkerError,
            RLMWorkerRecoveryError,
            RLMCodeExecutionTimeout,
            SubLLMEmptyModelResponseError,
        )
        # Check one class at a time so a failure names the offending type.
        for exc_cls in rlm_exception_types:
            assert issubclass(exc_cls, vf.Error), (
                f"{exc_cls.__name__} is not a vf.Error"
            )
| 1496 | + |
| 1497 | + |
class TestRLMSessionErrorRaised:
    """RLMSessionError must surface when a session or its venv is not initialized."""

    def test_local_get_session_missing_rollout_id(self, rlm_env):
        """A state dict without a ``rollout_id`` key is rejected by ``_get_session``."""
        local_executor = rlm_env._executor
        empty_state = {}
        with pytest.raises(RLMSessionError, match="Local session not initialized"):
            local_executor._get_session(empty_state)

    def test_local_get_session_unknown_rollout_id(self, rlm_env):
        """A ``rollout_id`` with no registered session is rejected by ``_get_session``."""
        local_executor = rlm_env._executor
        unknown_state = {"rollout_id": "nonexistent"}
        with pytest.raises(RLMSessionError, match="Local session not initialized"):
            local_executor._get_session(unknown_state)

    @pytest.mark.asyncio
    async def test_local_start_worker_no_venv(self, rlm_env):
        """Starting a worker for a session whose ``venv_path`` is None must fail."""
        local_executor = rlm_env._executor
        session_kwargs = {
            "rollout_id": "test",
            "rollout_dir": "/tmp/test",
            "paths": MagicMock(),
            "fs_root": "/tmp/test/fs",
            "control_dir": "/tmp/test/control",
            "venv_path": None,  # the condition under test
        }
        venvless_session = LocalRLMReplSession(**session_kwargs)
        empty_state = {}
        with pytest.raises(RLMSessionError, match="Local venv not initialized"):
            await local_executor._start_worker(empty_state, venvless_session)
| 1527 | + |
| 1528 | + |
class TestRLMWorkerErrorRaised:
    """Executing code without a live worker process must raise RLMWorkerError."""

    @staticmethod
    def _session_with_worker(worker_process):
        """Build a local session with rollout id "test" around the given worker handle."""
        return LocalRLMReplSession(
            rollout_id="test",
            rollout_dir="/tmp/test",
            paths=MagicMock(),
            fs_root="/tmp/test/fs",
            control_dir="/tmp/test/control",
            worker_process=worker_process,
        )

    @staticmethod
    async def _assert_execute_rejected(local_executor, repl_session):
        """Register the session, expect ``execute`` to fail, then unregister it."""
        local_executor._sessions["test"] = repl_session
        rollout_state = {"rollout_id": "test"}
        payload = {"code": "1+1", "seq": 1}
        try:
            with pytest.raises(RLMWorkerError, match="RLM worker process not running"):
                await local_executor.execute(payload, rollout_state)
        finally:
            # Drop the entry again even when the assertion above fails.
            local_executor._sessions.pop("test", None)

    @pytest.mark.asyncio
    async def test_local_execute_worker_process_none(self, rlm_env):
        """A session whose ``worker_process`` is None cannot execute code."""
        local_executor = rlm_env._executor
        workerless_session = self._session_with_worker(None)
        await self._assert_execute_rejected(local_executor, workerless_session)

    @pytest.mark.asyncio
    async def test_local_execute_worker_process_exited(self, rlm_env):
        """A session whose worker reports an exit code from ``poll`` cannot execute code."""
        local_executor = rlm_env._executor
        exited_worker = MagicMock()
        exited_worker.poll.return_value = 1  # process exited
        dead_session = self._session_with_worker(exited_worker)
        await self._assert_execute_rejected(local_executor, dead_session)
| 1571 | + |
| 1572 | + |
class TestRLMSetupErrorRaised:
    """Failures of the ``uv`` helper command must be reported as RLMSetupError."""

    @pytest.mark.asyncio
    async def test_uv_not_found(self, rlm_env):
        """A FileNotFoundError from ``asyncio.to_thread`` maps to "uv not found on PATH"."""
        local_executor = rlm_env._executor
        missing_uv = patch(
            "asyncio.to_thread", new=AsyncMock(side_effect=FileNotFoundError("uv"))
        )
        with missing_uv, pytest.raises(RLMSetupError, match="uv not found on PATH"):
            await local_executor._run_uv_command(
                ["uv", "venv", "/tmp/test"], timeout=30
            )

    @pytest.mark.asyncio
    async def test_uv_command_timeout(self, rlm_env):
        """A subprocess.TimeoutExpired from ``asyncio.to_thread`` maps to a timeout error."""
        local_executor = rlm_env._executor
        slow_uv = patch(
            "asyncio.to_thread",
            new=AsyncMock(side_effect=subprocess.TimeoutExpired("uv", 30)),
        )
        with slow_uv, pytest.raises(RLMSetupError, match="uv command timed out"):
            await local_executor._run_uv_command(
                ["uv", "venv", "/tmp/test"], timeout=30
            )

    @pytest.mark.asyncio
    async def test_uv_command_nonzero_exit(self, rlm_env):
        """A completed command with a non-zero return code maps to "uv command failed"."""
        local_executor = rlm_env._executor
        # Stand-in for the completed-process result handed back by to_thread.
        failed_run = MagicMock()
        failed_run.returncode = 1
        failed_run.stderr = "some error"
        failed_run.stdout = ""
        failing_uv = patch("asyncio.to_thread", new=AsyncMock(return_value=failed_run))
        with failing_uv, pytest.raises(RLMSetupError, match="uv command failed"):
            await local_executor._run_uv_command(
                ["uv", "pip", "install", "foo"], timeout=30
            )
| 1607 | + |
| 1608 | + |
class TestRLMCodeExecutionTimeoutHandling:
    """Cover the abort and the recovery branches taken after a code timeout."""

    @staticmethod
    def _stub_timed_out_executor(rlm_env, *, abort_on_code_timeout):
        """Make every ``execute`` call time out and return a ready rollout state.

        The executor's ``prepare_filesystem`` and ``setup`` are replaced with
        no-op async mocks as well.
        """
        rlm_env.abort_on_code_timeout = abort_on_code_timeout
        stubbed_executor = rlm_env._executor
        stubbed_executor.execute = AsyncMock(
            side_effect=RLMCodeExecutionTimeout("timed out")
        )
        stubbed_executor.prepare_filesystem = AsyncMock()
        stubbed_executor.setup = AsyncMock()
        return {"rlm_worker_ready": True, "_exec_seq": 0}

    @pytest.mark.asyncio
    async def test_abort_on_timeout_raises_timeout_directly(self, rlm_env):
        """With abort enabled the timeout exception propagates unchanged."""
        ready_state = self._stub_timed_out_executor(
            rlm_env, abort_on_code_timeout=True
        )
        with pytest.raises(RLMCodeExecutionTimeout):
            await rlm_env._execute_code("import time; time.sleep(999)", ready_state)

    @pytest.mark.asyncio
    async def test_recovery_failure_raises_worker_recovery_error(self, rlm_env):
        """With abort disabled, a failed recovery raises RLMWorkerRecoveryError."""
        ready_state = self._stub_timed_out_executor(
            rlm_env, abort_on_code_timeout=False
        )
        rlm_env._recover_from_code_timeout = AsyncMock(return_value=False)
        with pytest.raises(RLMWorkerRecoveryError, match="could not be restarted"):
            await rlm_env._execute_code("import time; time.sleep(999)", ready_state)

    @pytest.mark.asyncio
    async def test_recovery_success_returns_error_result(self, rlm_env):
        """With abort disabled, a successful recovery yields an error result dict."""
        ready_state = self._stub_timed_out_executor(
            rlm_env, abort_on_code_timeout=False
        )
        rlm_env._recover_from_code_timeout = AsyncMock(return_value=True)
        outcome = await rlm_env._execute_code("slow_code()", ready_state)
        assert outcome["status"] == "error"
        assert "timed out" in outcome["result"]
| 1653 | + |
| 1654 | + |
class TestSubLLMEmptyModelResponseErrorRaised:
    """Empty sub-LLM responses must surface as SubLLMEmptyModelResponseError."""

    @pytest.mark.asyncio
    async def test_empty_response_from_sub_llm(self, rlm_env):
        """An EmptyModelResponseError from ``get_model_response`` is re-raised as the sub-LLM type."""
        empty_reply = AsyncMock(
            side_effect=vf.EmptyModelResponseError("Model returned no response")
        )
        stubbed_model = patch.object(rlm_env, "get_model_response", new=empty_reply)
        with stubbed_model:
            rollout_state = {"sampling_args": {}}
            prompt = [{"role": "user", "content": "hello"}]
            with pytest.raises(SubLLMEmptyModelResponseError, match="no response"):
                await rlm_env._call_sub_llm_api(
                    rollout_state, MagicMock(), "gpt-4", prompt
                )

    @pytest.mark.asyncio
    async def test_sub_llm_empty_response_chains_cause(self, rlm_env):
        """The raised sub-LLM error keeps the original exception as ``__cause__``."""
        original = vf.EmptyModelResponseError("original error")
        stubbed_model = patch.object(
            rlm_env, "get_model_response", new=AsyncMock(side_effect=original)
        )
        with stubbed_model:
            rollout_state = {"sampling_args": {}}
            prompt = [{"role": "user", "content": "hello"}]
            with pytest.raises(SubLLMEmptyModelResponseError) as raised:
                await rlm_env._call_sub_llm_api(
                    rollout_state, MagicMock(), "gpt-4", prompt
                )
            # Identity check: the very same exception object must be chained.
            assert raised.value.__cause__ is original
0 commit comments