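Fix unit tests: skip E2B tests when E2B is not installed, switch the math adapter test to the "SuperSecureHuman/competition_math_hf_dataset" dataset, capture "agent_eval" logs in the CLI agent test, and update expected combined math scores to 0.8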

benjibc · benjibc · commit 2f5122ed013c · 2025-08-04T17:55:20.000Z
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,6 +1,16 @@
 import sys
 from pathlib import Path
+import pytest
 
 # Add the project root to the Python path
 project_root = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(project_root))
+
+# Import _HAS_E2B to create skip decorator
+try:
+    from eval_protocol.rewards.code_execution import _HAS_E2B
+except ImportError:
+    _HAS_E2B = False
+
+# Decorator to skip E2B tests when E2B is not available
+skip_e2b = pytest.mark.skipif(not _HAS_E2B, reason="E2B not installed")
diff --git a/tests/test_adapters_e2e.py b/tests/test_adapters_e2e.py
@@ -276,7 +276,7 @@ def math_transform(row: Dict[str, Any]) -> Dict[str, Any]:
         
         # Create adapter
         adapter = create_huggingface_adapter(
-            dataset_id="hendrycks/competition_math",
+            dataset_id="SuperSecureHuman/competition_math_hf_dataset",
             transform_fn=math_transform,
         )
         
diff --git a/tests/test_cli_agent.py b/tests/test_cli_agent.py
@@ -5,6 +5,7 @@
 import argparse
 import asyncio  # Added import
 import json
+import logging
 from unittest.mock import (  # Added AsyncMock and Mock
     AsyncMock,
     MagicMock,
@@ -38,6 +39,9 @@ class TestAgentEvalCommand:
     @patch("eval_protocol.cli_commands.agent_eval_cmd.TaskManager")
     @patch("eval_protocol.cli_commands.agent_eval_cmd.Path")
     def test_agent_eval_success_yaml(self, MockPath, MockTaskManager, caplog):
+        # Configure caplog to capture logs from the agent_eval logger
+        caplog.set_level(logging.INFO, logger="agent_eval")
+        
         # Setup Path mock
         mock_path_instance = Mock()
         MockPath.return_value = mock_path_instance
diff --git a/tests/test_examples_end_to_end.py b/tests/test_examples_end_to_end.py
@@ -144,7 +144,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
         Message(role="user", content="What is 2+2?"),
         Message(
             role="assistant",
-            content="<think>The user is asking for the sum of 2 and 2.</think><answer>The final answer is \\boxed{4}</answer>",
+            content="<think>I need to solve this arithmetic problem.</think><answer>The final answer is \\boxed{4}</answer>",
         ),
     ]
     ground_truth_correct = "The final answer is \\boxed{4}"
@@ -165,7 +165,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
         Message(role="user", content="What is 2+2?"),
         Message(
             role="assistant",
-            content="<think>The user is asking for the sum of 2 and 2.</think><answer>The final answer is \\boxed{5}</answer>",
+            content="<think>I need to solve this arithmetic problem.</think><answer>The final answer is \\boxed{5}</answer>",
         ),
     ]
     # Ground truth is still 4
@@ -186,7 +186,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
     ]
     result_incorrect_fmt = math_module.evaluate(messages=messages_incorrect_fmt, ground_truth=ground_truth_correct)
 
-    assert result_incorrect_fmt["score"] == 1.0  # Accuracy is 1.0
+    assert result_incorrect_fmt["score"] == 0.8  # Combined score: (1.0 * 0.8) + (0.0 * 0.2) = 0.8
     assert result_incorrect_fmt["is_score_valid"] is True
     # Asserting extracted answers from the result object directly might fail
     assert result_incorrect_fmt["metrics"]["accuracy_reward"]["score"] == 1.0
@@ -269,7 +269,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
         messages=messages_only_answer_tag, ground_truth=ground_truth_simple_gt
     )
 
-    assert result_only_answer_tag["score"] == 1.0  # Accuracy is fine
+    assert result_only_answer_tag["score"] == 0.8  # Combined score: (1.0 * 0.8) + (0.0 * 0.2) = 0.8
     assert result_only_answer_tag["is_score_valid"] is True
     # Asserting extracted answers from the result object directly might fail
     assert result_only_answer_tag["metrics"]["accuracy_reward"]["score"] == 1.0

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -276,7 +276,7 @@ def math_transform(row: Dict[str, Any]) -> Dict[str, Any]:` |
| 276 | 276 |  |
| 277 | 277 | `# Create adapter` |
| 278 | 278 | `adapter = create_huggingface_adapter(` |
| 279 |  | `- dataset_id="hendrycks/competition_math",` |
|  | 279 | `+ dataset_id="SuperSecureHuman/competition_math_hf_dataset",` |
| 280 | 280 | `transform_fn=math_transform,` |
| 281 | 281 | `)` |
| 282 | 282 |  |