AI-Hypercomputer
diff --git a/‎.github/workflows/build_and_test_maxtext.yml‎
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/build_and_test_maxtext.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/maxtext/configs/post_train/rl.yml‎
Lines changed: 9 additions & 2 deletions b/‎src/maxtext/configs/post_train/rl.yml‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎src/maxtext/configs/types.py‎
Lines changed: 9 additions & 0 deletions b/‎src/maxtext/configs/types.py‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎src/maxtext/trainers/post_train/rl/evaluate_rl.py‎
Lines changed: 109 additions & 63 deletions b/‎src/maxtext/trainers/post_train/rl/evaluate_rl.py‎
Lines changed: 109 additions & 63 deletions
@@ -199,6 +199,7 @@ jobs:
       base_image: maxtext-unit-test-tpu:py312
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and not integration_test and not post_training'
+      pytest_addopts: '--ignore=tests/post_training'
       xla_python_client_mem_fraction: 0.75
       tf_force_gpu_allow_growth: false
       container_resource_option: "--privileged"
@@ -217,6 +218,7 @@ jobs:
       base_image: maxtext-unit-test-tpu:py312
       cloud_runner: linux-x86-ct6e-180-4tpu
       pytest_marker: 'not cpu_only and not gpu_only and integration_test and not post_training'
+      pytest_addopts: '--ignore=tests/post_training'
       xla_python_client_mem_fraction: 0.75
       tf_force_gpu_allow_growth: false
       container_resource_option: "--privileged"
 
@@ -148,6 +148,7 @@ generation_configs:
 num_eval_passes: 1 # Number of generation passes during evaluation
 eval_corr_lst: False # If True, only include correct responses in the list during evaluation
 eval_make_lst: False # If True, return a list of (question, answer, responses) during evaluation
+eval_mode: "pass" # Evaluation mode ("pass" for pass@K, "maj" for majority voting maj@K, "pass_at_1" for pass@1 estimation)
 
 # ====== Inference ======
 # === Generation during GRPO training ===
@@ -190,6 +191,12 @@ reward_ratio_guess_to_answer_low: 0.0
 penalty_incorrect_format: 0.0
 penalty_incorrect_answer: 0.0
 
+# ====== Configuration for math_verify Pool ======
+# Timeout (seconds) for each batched math_verify call
+math_verify_timeout: 300
+# Max worker processes for the math_verify pool. null ⇒ min(batch_size, cpu_count())
+math_verify_num_procs: null
+
 # ====== Special tokens/templates for GSM8K reasoning ======
 reasoning_start_token: '<reasoning>'
 reasoning_end_token: '</reasoning>'
@@ -200,8 +207,8 @@ skip_jax_distributed_system: True
 
 # # TODO(@mazumdera): fix this
 # Dataset Configuration
-dataset_name: 'gsm8k' # huggingface:open-r1/DAPO-Math-17k-Processed
-eval_dataset_name: 'gsm8k' # huggingface:BytedTsinghua-SIA/AIME-2024
+dataset_name: 'gsm8k' # open-r1/DAPO-Math-17k-Processed
+eval_dataset_name: 'gsm8k' # BytedTsinghua-SIA/AIME-2024
 train_split: 'train'
 eval_split: 'test'
 tokenizer_type: 'huggingface'
@@ -1817,6 +1817,10 @@ class RLEvaluation(BaseModel):
       False,
       description="If True, return a list of (question, answer, responses) during evaluation.",
   )
+  eval_mode: Literal["pass", "maj", "pass_at_1"] = Field(
+      "pass",
+      description="Evaluation mode to use ('pass' for pass@K, 'maj' for maj@K, 'pass_at_1' for pass@1 estimation).",
+  )
 
 
 class Reward(BaseModel):
@@ -1834,6 +1838,11 @@ class Reward(BaseModel):
   )
   penalty_incorrect_format: float = Field(-0.5, description="Penalty for an incorrect format.")
   penalty_incorrect_answer: float = Field(-1.0, description="Penalty for an incorrect answer.")
+  math_verify_timeout: int = Field(300, description="Timeout (seconds) for math_verify call per batch.")
+  math_verify_num_procs: int | None = Field(
+      None,
+      description=("Max worker processes for the math_verify pool. None ⇒ " "min(batch_size, cpu_count())."),
+  )
 
 
 class SpecialTokens(BaseModel):
 
@@ -16,7 +16,9 @@
 """
 RL Evaluation Module.
 """
-from math_verify import parse
+import collections
+import json
+
 from tqdm.auto import tqdm
 from tunix.rl.rollout.base_rollout import RolloutConfig
 
@@ -86,85 +88,128 @@ def generate_responses(
   return multiple_call_responses
 
 
-def score_responses(tmvp_config, question, responses, answer):
+def score_responses(tmvp_config, question, responses, answers):
   """
   Score a set of responses for a single question.
 
+  The scoring strategy is selected by ``tmvp_config.eval_mode`` (treated as
+  "pass" when the attribute is absent):
+    - "pass": pass@K; each criterion holds if ANY response satisfies it.
+    - "maj": maj@K; only the most frequent extracted answer is scored.
+    - "pass_at_1": fraction of the responses that satisfy each criterion.
+
   Args:
       tmvp_config: Configuration object
       question: The evaluation question
       responses: List of generated responses for this question
-      answer: The correct answer
+      answers: List of acceptable answers for this question
 
   Returns:
-      Tuple of (is_correct, is_partially_correct, has_correct_format)
+      Tuple of (is_correct, is_partially_correct, has_correct_format).
+      Booleans for "pass" and "maj"; floats in [0, 1] for "pass_at_1".
+      (False, False, False) when there are no responses or the mode is unknown.
   """
-  match_format = utils_rl.get_match_format_regex(tmvp_config)
-  answer_fallback = utils_rl.get_answer_fallback_regex(tmvp_config)
-
   if tmvp_config.debug.rl:
     max_logging.log("========================================")
     max_logging.log(f"Evaluation Question: {question}")
-    max_logging.log(f"Evaluation Answer: {answer}")
+    max_logging.log(f"Evaluation Answer: {answers}")
     max_logging.log(f"Evaluation Responses: {responses}")
     max_logging.log("========================================")
 
-  is_correct = False
-  is_partially_correct = False
-  has_correct_format = False
+  eval_mode = getattr(tmvp_config, "eval_mode", "pass")
+  match_format = utils_rl.get_match_format_regex(tmvp_config)
 
+  extracted_responses = []
   for response in responses:
-    # Extract answer: prefer the full format match; fall back to the last
-    # <answer>...</answer> tag if full format match is not found, so result
-    # scoring is decoupled from format.
-    full_match = match_format.search(response)
-    if full_match is not None:
-      extracted_response = full_match.group(1)
-    else:
-      # Find the *last* occurrence of the answer tag (most likely the final answer).
-      fallback_matches = answer_fallback.findall(response)
-      extracted_response = fallback_matches[-1].strip() if fallback_matches else "-1000000"
+    extracted_response = utils_rl.extract_answer(response, tmvp_config)
+    extracted_responses.append(extracted_response)
+
+  # Nothing to score (also protects the pass_at_1 division below).
+  if not extracted_responses:
+    return False, False, False
+
+  if eval_mode == "maj":
+    # extract the single-most frequent response
+    counter = collections.Counter(extracted_responses)
+    majority_response = counter.most_common(1)[0][0]
     if tmvp_config.debug.rl:
-      used = "full format" if full_match is not None else "answer-tag fallback"
-      max_logging.log(f"Evaluation extracted_response ({used}): {extracted_response}")
+      max_logging.log(f"Majority Response: {majority_response} (Count: {counter[majority_response]})")
+
+    # Check the format for the majority_response: it counts as well-formatted
+    # if at least one response that produced it matches the full format.
+    has_correct_format = False
+    for idx, extracted_response in enumerate(extracted_responses):
+      if extracted_response == majority_response:
+        if match_format.search(responses[idx]) is not None:
+          has_correct_format = True
+          break
 
-    # Check exact correctness
     try:
-      # Fix LaTeX escaping issues for both ground truth and extracted answer
-      norm_answer = utils_rl.fix_latex_escaping(answer)
-      norm_extracted = utils_rl.fix_latex_escaping(extracted_response)
-      # Normalize Normalize for certain datasets and parse
-      if "DAPO" in tmvp_config.dataset_name or "OpenMathInstruct" in tmvp_config.dataset_name:
-        norm_extracted = utils_rl.normalize_final_answer(norm_extracted).strip()
-        norm_answer = utils_rl.normalize_final_answer(answer).strip()
-      is_correct = utils_rl.math_verify_func([utils_rl.boxed(norm_answer)], [utils_rl.boxed(norm_extracted)])[0] > 0.1
+      is_correct, is_partially_correct = utils_rl.check_correctness(majority_response, answers, tmvp_config)
       if tmvp_config.debug.rl:
-        # is_correct is a tuple, if first value is 1.0 means it's a match;
-        # 0.0 means a mismatch. e.g. (0.0, (['3', '3'], ['3/5', '\\frac{3}{5}']))
+        max_logging.log(f"Result has_correct_format: {has_correct_format}")
         max_logging.log(f"Result is_correct: {is_correct}")
-
-      val_extracted = parse(utils_rl.boxed(norm_extracted))
-      val_answer = parse(utils_rl.boxed(norm_answer))
-
-      # Check partial correctness if values can be extracted (within 10%)
-      if val_extracted and val_answer:
-        ratio = (val_extracted[0] + utils_rl.EPSILON) / (val_answer[0] + utils_rl.EPSILON)
-        is_partially_correct = 0.9 <= ratio <= 1.1
-
+        max_logging.log(f"Result is_partially_correct: {is_partially_correct}")
     except Exception as e:
+      is_correct, is_partially_correct = False, False
       if tmvp_config.debug.rl:
-        max_logging.log(f"Evaluation Exception: {e}")
+        max_logging.log(f"Evaluation Exception on majority answer: {e}")
         max_logging.log("SKIPPED")
+    return is_correct, is_partially_correct, has_correct_format
 
-    # Check format correctness (requires the full <reasoning>...</reasoning><answer>...</answer> structure)
-    if full_match is not None:
-      has_correct_format = True
+  if eval_mode == "pass":
+    # pass@K: a criterion is met when ANY response meets it, so the flags are
+    # initialised once and only ever switched on. Resetting them per response
+    # would make the result depend solely on the last response scored.
+    is_correct, is_partially_correct, has_correct_format = False, False, False
+    for idx, response in enumerate(responses):
+      if match_format.search(response) is not None:
+        has_correct_format = True
+
+      # Check exact and partial correctness (within 10%)
+      try:
+        sample_correct, sample_partial = utils_rl.check_correctness(extracted_responses[idx], answers, tmvp_config)
+        is_correct = is_correct or bool(sample_correct)
+        is_partially_correct = is_partially_correct or bool(sample_partial)
+        if tmvp_config.debug.rl:
+          max_logging.log(f"Result is_correct: {sample_correct}")
+          max_logging.log(f"Result is_partially_correct: {sample_partial}")
+      except Exception as e:
+        # A sample that cannot be scored is skipped; earlier hits are kept.
+        if tmvp_config.debug.rl:
+          max_logging.log(f"Evaluation Exception: {e}")
+          max_logging.log("SKIPPED")
+
+      # Early exit if all criteria are met
+      if is_correct and is_partially_correct and has_correct_format:
+        break
+    return is_correct, is_partially_correct, has_correct_format
+
+  if eval_mode == "pass_at_1":
+    # Estimate pass@1: fraction of N samples that are correct per problem.
+    # Returns floats in [0, 1] instead of booleans.
+    n_samples = len(responses)
+    n_correct = 0
+    n_partially_correct = 0
+    n_correct_format = 0
+
+    for idx, response in enumerate(responses):
+      if match_format.search(response) is not None:
+        n_correct_format += 1
+
+      try:
+        sample_correct, sample_partial = utils_rl.check_correctness(extracted_responses[idx], answers, tmvp_config)
+        if sample_correct:
+          n_correct += 1
+        if sample_partial:
+          n_partially_correct += 1
+        if tmvp_config.debug.rl:
+          max_logging.log(f"Sample {idx}: correct={sample_correct}, partial={sample_partial}")
+      except Exception as e:
+        if tmvp_config.debug.rl:
+          max_logging.log(f"Evaluation Exception on sample {idx}: {e}")
+          max_logging.log("SKIPPED")
+
+    frac_correct = n_correct / n_samples
+    frac_partially_correct = n_partially_correct / n_samples
+    frac_correct_format = n_correct_format / n_samples
+
+    if tmvp_config.debug.rl:
+      max_logging.log(
+          f"pass@1: {n_correct}/{n_samples} correct, "
+          f"{n_partially_correct}/{n_samples} partial, "
+          f"{n_correct_format}/{n_samples} format"
+      )
 
-    # Early exit if all criteria are met
-    if is_correct and is_partially_correct and has_correct_format:
-      break
+    return frac_correct, frac_partially_correct, frac_correct_format
 
-  return is_correct, is_partially_correct, has_correct_format
+  # Unrecognised mode: score the question as a miss rather than raising.
+  if tmvp_config.debug.rl:
+    max_logging.log(f"Unknown eval mode: {eval_mode}")
+  return False, False, False
 
 
 def evaluate(
@@ -210,28 +255,29 @@ def evaluate(
 
     # Score each question-answer pair
     for question, responses, answer in zip(questions, multiple_call_responses, answers):
+      # decode the json-encoded list of acceptable answers
+      answer = list(dict.fromkeys(json.loads(answer)))
       is_correct, is_partially_correct, has_correct_format = score_responses(
           tmvp_config=tmvp_config,
           question=question,
           responses=responses,
-          answer=answer,
+          answers=answer,
       )
 
-      # Update counters
-      if is_correct:
-        corr += 1
-        if corr_lst and make_lst:
+      # Update counters. For "pass" and "maj" modes, scores are booleans
+      # (True=1, False=0). For "pass_at_1" mode, scores are floats in [0, 1]
+      # representing the fraction of samples correct. Using += works for both:
+      # bool is a subtype of int in Python, so `+= True` is the same as `+= 1`.
+      corr += is_correct
+      partially_corr += is_partially_correct
+      corr_format += has_correct_format
+
+      if make_lst:
+        if corr_lst and is_correct:
           response_lst.append((question, answer, responses))
-      else:
-        if not corr_lst and make_lst:
+        elif not corr_lst and not is_correct:
           response_lst.append((question, answer, responses))
 
-      if is_partially_correct:
-        partially_corr += 1
-
-      if has_correct_format:
-        corr_format += 1
-
       total += 1
 
       # Print progress every 10 items