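Remove the dataset-name conditional in preprocess_math_string (always normalize); add eval_mode ("pass" for pass@K, "maj" for maj@K) to RL evaluation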

A9isha · A9isha · commit 18c26ada03c0 · 2026-04-07T20:24:52.000Z
diff --git a/src/maxtext/configs/post_train/rl.yml b/src/maxtext/configs/post_train/rl.yml
@@ -147,6 +147,7 @@ generation_configs:
 num_eval_passes: 1 # Number of generation passes during evaluation
 eval_corr_lst: False # If True, only include correct responses in the list during evaluation
 eval_make_lst: False # If True, return a list of (question, answer, responses) during evaluation
+eval_mode: "pass" # Evaluation mode ("pass" for pass@K, "maj" for majority voting maj@K)
 
 # ====== Inference ======
 # === Generation during GRPO training ===
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -1775,6 +1775,10 @@ class RLEvaluation(BaseModel):
       False,
       description="If True, return a list of (question, answer, responses) during evaluation.",
   )
+  eval_mode: Literal["pass", "maj"] = Field(
+      "pass",
+      description="Evaluation mode to use ('pass' for pass@K, 'maj' for maj@K).",
+  )
 
 
 class Reward(BaseModel):
diff --git a/src/maxtext/trainers/post_train/rl/evaluate_rl.py b/src/maxtext/trainers/post_train/rl/evaluate_rl.py
@@ -16,6 +16,7 @@
 """
 RL Evaluation Module.
 """
+import collections
 import json
 
 from tqdm.auto import tqdm
@@ -87,7 +88,7 @@ def generate_responses(
   return multiple_call_responses
 
 
-def score_responses(tmvp_config, question, responses, answers):
+def score_responses(tmvp_config, question, responses, answers, eval_mode="pass"):
   """
   Score a set of responses for a single question.
 
@@ -96,6 +97,7 @@ def score_responses(tmvp_config, question, responses, answers):
       question: The evaluation question
       responses: List of generated responses for this question
       answers: List of acceptable answers for this question
+      eval_mode: The evaluation mode to use ("pass" for pass@K, "maj" for maj@K)
 
   Returns:
       Tuple of (is_correct, is_partially_correct, has_correct_format)
@@ -112,6 +114,35 @@ def score_responses(tmvp_config, question, responses, answers):
   is_partially_correct = False
   has_correct_format = False
 
+  if eval_mode == "maj":
+    extracted_answers = []
+    for response in responses:
+      match_format = utils_rl.get_match_format_regex(tmvp_config)
+      if match_format.search(response) is not None:
+        has_correct_format = True
+
+      extracted_response = utils_rl.extract_answer(response, tmvp_config)
+      extracted_answers.append(extracted_response)
+
+    if not extracted_answers:
+      return False, False, False
+
+    counter = collections.Counter(extracted_answers)
+    majority_answer = counter.most_common(1)[0][0]
+
+    try:
+      is_correct, is_partially_correct = utils_rl.check_correctness(majority_answer, answers, tmvp_config)
+      if tmvp_config.debug.rl:
+        max_logging.log(f"Majority Answer: {majority_answer} (Count: {counter[majority_answer]})")
+        max_logging.log(f"Result is_correct: {is_correct}")
+        max_logging.log(f"Result is_partially_correct: {is_partially_correct}")
+    except Exception as e:
+      if tmvp_config.debug.rl:
+        max_logging.log(f"Evaluation Exception on majority answer: {e}")
+        max_logging.log("SKIPPED")
+
+    return is_correct, is_partially_correct, has_correct_format
+
   for response in responses:
     match_format = utils_rl.get_match_format_regex(tmvp_config)
     if match_format.search(response) is not None:
@@ -144,6 +175,7 @@ def evaluate(
     num_passes=1,
     corr_lst=False,
     make_lst=False,
+    eval_mode=None,
 ):
   """
   Computes accuracy and percentage of outputs matching the format.
@@ -155,10 +187,14 @@ def evaluate(
       num_passes: Number of generation passes
       corr_lst: If True, only include correct responses in the list
       make_lst: If True, return a list of (question, answer, responses)
+      eval_mode: Override for the evaluation mode ("pass" or "maj").
 
   Returns:
       Tuple of statistics and optionally the response list
   """
+  if eval_mode is None:
+    eval_mode = getattr(tmvp_config, "eval_mode", "pass")
+
   response_lst = []
   corr = 0
   partially_corr = 0
@@ -187,6 +223,7 @@ def evaluate(
           question=question,
           responses=responses,
           answers=answer,
+          eval_mode=eval_mode,
       )
 
       # Update counters
diff --git a/src/maxtext/trainers/post_train/rl/train_rl.py b/src/maxtext/trainers/post_train/rl/train_rl.py
@@ -726,6 +726,7 @@ def rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices):
       num_passes=trainer_config.num_eval_passes,
       corr_lst=trainer_config.eval_corr_lst,
       make_lst=trainer_config.eval_make_lst,
+      eval_mode=getattr(trainer_config, "eval_mode", "pass"),
   )
   max_logging.warning(f"Pre RL Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%")
 
@@ -755,6 +756,7 @@ def rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices):
       num_passes=trainer_config.num_eval_passes,
       corr_lst=trainer_config.eval_corr_lst,
       make_lst=trainer_config.eval_make_lst,
+      eval_mode=getattr(trainer_config, "eval_mode", "pass"),
   )
   max_logging.warning(f"Post RL Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%")
 
diff --git a/src/maxtext/trainers/post_train/rl/utils_rl.py b/src/maxtext/trainers/post_train/rl/utils_rl.py
@@ -275,11 +275,7 @@ def normalize_final_answer(final_answer: str) -> str:
 def preprocess_math_string(dataset_name, text) -> str:
   """Fix common formatting issues in text."""
   # Normalize for certain datasets and parse
-  if any(
-      name in dataset_name
-      for name in ["DAPO", "OpenMathInstruct", "OpenMathReasoning", "OpenR1-Math-220k", "CuratedThoughts", "MATH-500"]
-  ):
-    text = normalize_final_answer(text).strip()
+  text = normalize_final_answer(text).strip()
   # Fix LaTeX escaping issues
   text = fix_latex_escaping(text)
   return text

Original file line number	Diff line number	Diff line change
`@@ -1775,6 +1775,10 @@ class RLEvaluation(BaseModel):`
`1775`	`1775`	`False,`
`1776`	`1776`	`description="If True, return a list of (question, answer, responses) during evaluation.",`
`1777`	`1777`	`)`
	`1778`	`+ eval_mode: Literal["pass", "maj"] = Field(`
	`1779`	`+ "pass",`
	`1780`	`+ description="Evaluation mode to use ('pass' for pass@K, 'maj' for maj@K).",`
	`1781`	`+ )`
`1778`	`1782`
`1779`	`1783`
`1780`	`1784`	`class Reward(BaseModel):`
Original file line number	Diff line number	Diff line change
`@@ -726,6 +726,7 @@ def rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices):`
`726`	`726`	`num_passes=trainer_config.num_eval_passes,`
`727`	`727`	`corr_lst=trainer_config.eval_corr_lst,`
`728`	`728`	`make_lst=trainer_config.eval_make_lst,`
	`729`	`+ eval_mode=getattr(trainer_config, "eval_mode", "pass"),`
`729`	`730`	`)`
`730`	`731`	`max_logging.warning(f"Pre RL Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%")`
`731`	`732`
`@@ -755,6 +756,7 @@ def rl_train(trainer_config, sampler_config, trainer_devices, sampler_devices):`
`755`	`756`	`num_passes=trainer_config.num_eval_passes,`
`756`	`757`	`corr_lst=trainer_config.eval_corr_lst,`
`757`	`758`	`make_lst=trainer_config.eval_make_lst,`
	`759`	`+ eval_mode=getattr(trainer_config, "eval_mode", "pass"),`
`758`	`760`	`)`
`759`	`761`	`max_logging.warning(f"Post RL Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%")`
`760`	`762`