@@ -144,7 +144,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
144144 Message (role = "user" , content = "What is 2+2?" ),
145145 Message (
146146 role = "assistant" ,
147- content = "<think>The user is asking for the sum of 2 and 2 .</think><answer>The final answer is \\ boxed{4}</answer>" ,
147+ content = "<think>I need to solve this arithmetic problem .</think><answer>The final answer is \\ boxed{4}</answer>" ,
148148 ),
149149 ]
150150 ground_truth_correct = "The final answer is \\ boxed{4}"
@@ -165,7 +165,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
165165 Message (role = "user" , content = "What is 2+2?" ),
166166 Message (
167167 role = "assistant" ,
168- content = "<think>The user is asking for the sum of 2 and 2 .</think><answer>The final answer is \\ boxed{5}</answer>" ,
168+ content = "<think>I need to solve this arithmetic problem .</think><answer>The final answer is \\ boxed{5}</answer>" ,
169169 ),
170170 ]
171171 # Ground truth is still 4
@@ -186,7 +186,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
186186 ]
187187 result_incorrect_fmt = math_module .evaluate (messages = messages_incorrect_fmt , ground_truth = ground_truth_correct )
188188
189- assert result_incorrect_fmt ["score" ] == 1.0 # Accuracy is 1.0
189+ assert result_incorrect_fmt ["score" ] == 0.8 # Combined score: (1.0 * 0.8) + (0.0 * 0.2) = 0.8
190190 assert result_incorrect_fmt ["is_score_valid" ] is True
191191 # Asserting extracted answers from the result object directly might fail
192192 assert result_incorrect_fmt ["metrics" ]["accuracy_reward" ]["score" ] == 1.0
@@ -269,7 +269,7 @@ def test_math_example(temp_examples_dir, mock_env_variables):
269269 messages = messages_only_answer_tag , ground_truth = ground_truth_simple_gt
270270 )
271271
272- assert result_only_answer_tag ["score" ] == 1.0 # Accuracy is fine
272+ assert result_only_answer_tag ["score" ] == 0.8 # Combined score: (1.0 * 0.8) + (0.0 * 0.2) = 0.8
273273 assert result_only_answer_tag ["is_score_valid" ] is True
274274 # Asserting extracted answers from the result object directly might fail
275275 assert result_only_answer_tag ["metrics" ]["accuracy_reward" ]["score" ] == 1.0
0 commit comments