Skip to content

Commit 2f5122e

Browse files
committed
fix unittests
1 parent ced0e4b commit 2f5122e

4 files changed

Lines changed: 19 additions & 5 deletions

File tree

tests/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
import sys
22
from pathlib import Path
3+
import pytest
34

45
# Add the project root to the Python path
56
project_root = Path(__file__).resolve().parent.parent
67
sys.path.insert(0, str(project_root))
8+
9+
# Import _HAS_E2B (falling back to False if the import fails) to build the skip marker below
10+
try:
11+
from eval_protocol.rewards.code_execution import _HAS_E2B
12+
except ImportError:
13+
_HAS_E2B = False
14+
15+
# Marker that skips E2B tests when E2B is not installed
16+
skip_e2b = pytest.mark.skipif(not _HAS_E2B, reason="E2B not installed")

tests/test_adapters_e2e.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ def math_transform(row: Dict[str, Any]) -> Dict[str, Any]:
276276

277277
# Create adapter
278278
adapter = create_huggingface_adapter(
279-
dataset_id="hendrycks/competition_math",
279+
dataset_id="SuperSecureHuman/competition_math_hf_dataset",
280280
transform_fn=math_transform,
281281
)
282282

tests/test_cli_agent.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import argparse
66
import asyncio # Added import
77
import json
8+
import logging
89
from unittest.mock import ( # Added AsyncMock and Mock
910
AsyncMock,
1011
MagicMock,
@@ -38,6 +39,9 @@ class TestAgentEvalCommand:
3839
@patch("eval_protocol.cli_commands.agent_eval_cmd.TaskManager")
3940
@patch("eval_protocol.cli_commands.agent_eval_cmd.Path")
4041
def test_agent_eval_success_yaml(self, MockPath, MockTaskManager, caplog):
42+
# Configure caplog to capture logs from the agent_eval logger
43+
caplog.set_level(logging.INFO, logger="agent_eval")
44+
4145
# Setup Path mock
4246
mock_path_instance = Mock()
4347
MockPath.return_value = mock_path_instance

tests/test_examples_end_to_end.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
144144
Message(role="user", content="What is 2+2?"),
145145
Message(
146146
role="assistant",
147-
content="<think>The user is asking for the sum of 2 and 2.</think><answer>The final answer is \\boxed{4}</answer>",
147+
content="<think>I need to solve this arithmetic problem.</think><answer>The final answer is \\boxed{4}</answer>",
148148
),
149149
]
150150
ground_truth_correct = "The final answer is \\boxed{4}"
@@ -165,7 +165,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
165165
Message(role="user", content="What is 2+2?"),
166166
Message(
167167
role="assistant",
168-
content="<think>The user is asking for the sum of 2 and 2.</think><answer>The final answer is \\boxed{5}</answer>",
168+
content="<think>I need to solve this arithmetic problem.</think><answer>The final answer is \\boxed{5}</answer>",
169169
),
170170
]
171171
# Ground truth is still 4
@@ -186,7 +186,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
186186
]
187187
result_incorrect_fmt = math_module.evaluate(messages=messages_incorrect_fmt, ground_truth=ground_truth_correct)
188188

189-
assert result_incorrect_fmt["score"] == 1.0 # Accuracy is 1.0
189+
assert result_incorrect_fmt["score"] == 0.8 # Combined score: (1.0 * 0.8) + (0.0 * 0.2) = 0.8
190190
assert result_incorrect_fmt["is_score_valid"] is True
191191
# Asserting extracted answers from the result object directly might fail
192192
assert result_incorrect_fmt["metrics"]["accuracy_reward"]["score"] == 1.0
@@ -269,7 +269,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
269269
messages=messages_only_answer_tag, ground_truth=ground_truth_simple_gt
270270
)
271271

272-
assert result_only_answer_tag["score"] == 1.0 # Accuracy is fine
272+
assert result_only_answer_tag["score"] == 0.8 # Combined score: (1.0 * 0.8) + (0.0 * 0.2) = 0.8
273273
assert result_only_answer_tag["is_score_valid"] is True
274274
# Asserting extracted answers from the result object directly might fail
275275
assert result_only_answer_tag["metrics"]["accuracy_reward"]["score"] == 1.0

0 commit comments

Comments
 (0)