@@ -111,25 +111,33 @@ class TestNormalizeFinalAnswer(unittest.TestCase):
111111 """Tests for utils_rl.normalize_final_answer."""
112112
@pytest.mark.cpu_only
def test_comma_boxed_and_currency(self):
    """Comma-grouped, dollar-prefixed and boxed inputs all normalize to "1000"."""
    # The expected value is the digit *string* "1000" (not an int): every
    # assertion compares the normalized result against a str.
    # The boxed case is spelled "\\boxed{...}" with no space after the
    # backslash, i.e. the LaTeX \boxed{} wrapper this test is about.
    inputs = (
        "1,000",  # thousands separator only
        "$1,000",  # leading currency sign plus separator
        "\\boxed{1,000}",  # LaTeX \boxed{} wrapper around the number
    )
    for raw in inputs:
        self.assertEqual(utils_rl.normalize_final_answer(raw), "1000")
119119
@pytest.mark.cpu_only
def test_equation_splitting_and_unit_removal(self):
    """Inputs containing '=' or a trailing unit word normalize to the bare number."""
    # Maps each raw answer to the normalized string the test expects.
    # Dict order is insertion order, so the cases run in the order listed.
    expectations = {
        "x = 10": "10",  # '=' present, no unit
        "total = 100 meters": "100",  # '=' present and a trailing unit word
        "42 mph": "42",  # trailing unit word only
    }
    for raw, wanted in expectations.items():
        self.assertEqual(utils_rl.normalize_final_answer(raw), wanted)
124126
@pytest.mark.cpu_only
def test_latex_wrappers(self):
    """LaTeX text, textbf and overline wrappers are removed, leaving the inner content."""
    # Each command is written with no space after the backslash
    # ("\\text{", "\\textbf{", "\\overline{") so the inputs really are the
    # LaTeX wrappers this test targets.
    wrapped_to_inner = (
        ("\\text{hello}", "hello"),
        ("\\textbf{42}", "42"),
        ("\\overline{AB}", "AB"),
    )
    for wrapped, inner in wrapped_to_inner:
        self.assertEqual(utils_rl.normalize_final_answer(wrapped), inner)
129133
@pytest.mark.cpu_only
def test_dollar_math_extraction(self):
    """The content between a pair of '$' delimiters is what gets returned."""
    # The fraction is written "\\frac{1}{2}" (no space after the backslash)
    # in both the input and the expected value, so the test exercises a real
    # LaTeX \frac command embedded in surrounding prose.
    sentence = "The answer is $\\frac{1}{2}$"
    expected = "\\frac{1}{2}"
    self.assertEqual(utils_rl.normalize_final_answer(sentence), expected)
132138
@pytest.mark.cpu_only
def test_shorthand_frac_and_sqrt(self):
    """Brace-less frac and sqrt shorthands are expanded to their fully braced forms."""
    # Inputs are the shorthands "\\fracab" and "\\sqrta"; a space after the
    # backslash would no longer be the shorthand command this test is about,
    # so both inputs and expected outputs are written without one.
    shorthand_to_full = (
        ("\\fracab", "\\frac{a}{b}"),
        ("\\sqrta", "\\sqrt{a}"),
    )
    for shorthand, full in shorthand_to_full:
        self.assertEqual(utils_rl.normalize_final_answer(shorthand), full)
@@ -150,22 +158,32 @@ def _score(self, completion):
150158 return utils_rl .match_format_approximately (None , completion , self .config )
151159
@pytest.mark.cpu_only
def test_score_all_tags_present_exactly_once(self):
    """A completion containing each of the four tags exactly once scores 2.0."""
    # Expected total: 4 tags * 0.5 = 2.0
    completion = "<reasoning>think</reasoning><answer>42</answer>"
    first_score = self._score([completion])[0]
    self.assertEqual(first_score, 2.0)
164+
@pytest.mark.cpu_only
def test_score_no_tags_present(self):
    """A completion with none of the four tags scores -2.0."""
    # Expected total: 4 tags * -0.5 = -2.0
    completion = "The answer is 42."
    first_score = self._score([completion])[0]
    self.assertEqual(first_score, -2.0)
169+
@pytest.mark.cpu_only
def test_score_only_answer_tags_present(self):
    """Only the answer tag pair present: the two halves cancel out to 0.0."""
    # Expected total: 2 * 0.5 (answer tags) + 2 * -0.5 (reasoning tags) = 0.0
    completion = "<answer>42</answer>"
    first_score = self._score([completion])[0]
    self.assertEqual(first_score, 0.0)
174+
@pytest.mark.cpu_only
def test_score_duplicate_reasoning_start_tag(self):
    """A repeated opening reasoning tag costs that one tag its credit, giving 1.0."""
    # Expected total: 3 * 0.5 (tags seen once) + 1 * -0.5 (duplicated tag) = 1.0
    completion = "<reasoning><reasoning>think</reasoning><answer>42</answer>"
    first_score = self._score([completion])[0]
    self.assertEqual(first_score, 1.0)
179+
@pytest.mark.cpu_only
def test_score_multiple_completions(self):
    """Scoring a batch returns one score per completion, in input order."""
    batch = [
        "<reasoning>think</reasoning><answer>42</answer>",  # expected 2.0
        "no tags here",  # expected -2.0
    ]
    scores = self._score(batch)
    # Check the count first so the per-index lookups below cannot run past the end.
    self.assertEqual(len(scores), 2)
    for position, wanted in enumerate((2.0, -2.0)):
        self.assertEqual(scores[position], wanted)
0 commit comments