Add mean_reward to evaluate() by reusing user-provided reward functions

Pooya Moradi · Pooya Moradi · commit b06b8b619c10 · 2026-06-05T19:25:02.000Z
evaluate() previously returned only correctness metrics (corr, total,
accuracy, partial_accuracy, format_accuracy). For RL training the most
important metric is the actual reward signal — but that was only
available at training time, not during PRE / POST / intermediate eval.

Extend evaluate() to accept an optional `reward_fns` list, sum all
reward functions for each sampled response, and return the mean over
all scored (prompt, response) pairs as a 6th tuple element,
`mean_reward`. The reward functions used are the same ones
that drive training (per `reward_functions_path` + `reward_functions`
CLI knobs from commit 50eb2ca) — so eval-time mean_reward is exactly
what training optimizes for. No task-specific code is added to maxtext;
whatever scoring scheme the user plugs in becomes both the training
signal AND the eval-time reward metric.

Plumbed through to all three eval call sites in train_rl.py:
  * Pre RL Training log line now includes mean_reward=...
  * Post RL Training log line now includes mean_reward=...
  * Intermediate Eval (step=N) log line now includes mean_reward=...

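When no reward_fns are configured, mean_reward is reported as 0.0 and
the rest of evaluate() works exactly as before (the new parameter is
optional). Note that the returned stats tuple now always has six
elements, so callers that unpack it must be updated; the call sites
touched by this commit are.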

create_rl_components() return signature extended to also return
reward_fns so the eval call sites can pass them along.
diff --git a/src/maxtext/trainers/post_train/rl/evaluate_rl.py b/src/maxtext/trainers/post_train/rl/evaluate_rl.py
@@ -19,7 +19,7 @@
 import collections
 import json
 import re
-from typing import Any
+from typing import Any, Callable, Optional
 
 from tqdm.auto import tqdm
 from tunix.rl.rollout.base_rollout import RolloutConfig
@@ -183,13 +183,43 @@ def score_responses(tmvp_config, question, responses, answers):
   raise ValueError(f"Unknown eval_mode: {eval_mode!r}")
 
 
+def _compute_row_reward(reward_fns, prompt, responses, answer, row_idx):
+  """Sum every reward function's score over all sampled responses of one prompt.
+
+  Each function in `reward_fns` is called once per response with
+  single-element keyword lists (`prompts=[prompt]`, `completions=[resp]`,
+  `answer=[answer]`) and only element 0 of its result is added; an empty
+  result adds nothing, yet the response still counts. `evaluate()` divides
+  the accumulated sums by the accumulated counts to get the per-sample
+  mean reward. `row_idx` is used only in the failure log message.
+
+  Returns `(score_sum, n_responses)`, or `(0.0, 0)` for empty `responses`.
+  If any call raises, the failure is logged and the whole row is dropped
+  as `(0.0, 0)` (partial sums are discarded), keeping it out of that mean.
+  """
+  if not responses:
+    return 0.0, 0
+  try:
+    score_sum = 0.0
+    for resp in responses:
+      for fn in reward_fns:
+        scores = fn(prompts=[prompt], completions=[resp], answer=[answer])
+        if scores:
+          score_sum += float(scores[0])
+    return score_sum, len(responses)
+  except Exception as e:  # pylint: disable=broad-exception-caught
+    max_logging.log(f"[eval-reward] reward_fn failed on row {row_idx}: {e!r}")
+    return 0.0, 0
+
+
 def evaluate(
     tmvp_config,
     dataset,
     rl_cluster,
     num_passes=1,
     corr_lst=False,
     make_lst=False,
+    reward_fns: Optional[list[Callable[..., Any]]] = None,
 ):
   """
   Computes accuracy and percentage of outputs matching the format.
@@ -201,15 +231,27 @@ def evaluate(
       num_passes: Number of generation passes
       corr_lst: If True, only include correct responses in the list
       make_lst: If True, return a list of (question, answer, responses)
+      reward_fns: Optional list of reward functions to also evaluate against
+          the sampled responses (using whichever `eval_sampling_strategy`
+          is configured). Each function must accept `prompts`,
+          `completions`, `answer`, and return a list of floats (same signature
+          as the training-time reward stack). When provided, each sampled
+          response's score is the SUM across all reward functions (matching
+          tunix's GRPO aggregation), and the mean over all scored (prompt,
+          response) pairs is returned as `mean_reward`; 0.0 when None or empty.
 
   Returns:
-      Tuple of statistics and optionally the response list
+      Tuple (corr, total, accuracy, partial_accuracy, format_accuracy,
+      mean_reward), response_lst
   """
   response_lst = []
   corr = 0
   partially_corr = 0
   corr_format = 0
   total = 0
+  reward_sum = 0.0
+  reward_count = 0  # number of (prompt, sampled response) pairs scored
+  use_reward = bool(reward_fns)
 
   for batch in tqdm(dataset):
     answers = batch["answer"]
@@ -225,16 +267,26 @@ def evaluate(
     )
 
     # Score each question-answer pair
-    for question, responses, answer in zip(questions, multiple_call_responses, answers):
+    for question, responses, answer, prompt in zip(questions, multiple_call_responses, answers, prompts):
       # decode the json-encoded list of acceptable answers
-      answer = list(dict.fromkeys(json.loads(answer)))
+      answer_list = list(dict.fromkeys(json.loads(answer)))
       is_correct, is_partially_correct, has_correct_format = score_responses(
           tmvp_config=tmvp_config,
           question=question,
           responses=responses,
-          answers=answer,
+          answers=answer_list,
       )
 
+      # Per-example reward (eval-time mirror of the training reward). The
+      # total is accumulated across all sampled responses (across num_passes
+      # and across the eval_sampling_strategy distribution) and divided by
+      # the actual per-(prompt, response) count at the end. See
+      # `_compute_row_reward` for details.
+      if use_reward:
+        row_sum, row_count = _compute_row_reward(reward_fns, prompt, responses, answer, total)
+        reward_sum += row_sum
+        reward_count += row_count
+
       # Update counters. For "pass" and "maj" modes, scores are booleans
       # (True=1, False=0). For "pass_at_1" mode, scores are floats in [0, 1]
       # representing the fraction of samples correct. Using += works for both:
@@ -245,9 +297,9 @@ def evaluate(
 
       if make_lst:
         if corr_lst and is_correct:
-          response_lst.append((question, answer, responses))
+          response_lst.append((question, answer_list, responses))
         elif not corr_lst and not is_correct:
-          response_lst.append((question, answer, responses))
+          response_lst.append((question, answer_list, responses))
 
       total += 1
 
@@ -265,6 +317,7 @@ def evaluate(
       corr / total * 100 if total > 0 else 0,
       partially_corr / total * 100 if total > 0 else 0,
       corr_format / total * 100 if total > 0 else 0,
+      reward_sum / reward_count if (use_reward and reward_count > 0) else 0.0,
   )
 
   return to_return, response_lst
diff --git a/src/maxtext/trainers/post_train/rl/hooks.py b/src/maxtext/trainers/post_train/rl/hooks.py
@@ -14,7 +14,7 @@
 
 """Training hooks for post-train RL."""
 
-from typing import Any
+from typing import Any, Callable, Optional
 
 from tunix.sft import hooks as _tunix_hooks
 
@@ -33,7 +33,8 @@ class RLTrainingHooks(_tunix_hooks.TrainingHooks):
 
   This hook hooks `on_train_step_end`, checks
   `rl_cluster.global_steps % eval_interval`, and calls maxtext's
-  `evaluate(...)` — greedy decode + the configured scoring pipeline —
+  `evaluate(...)` (using whichever `eval_sampling_strategy` is configured
+  in `generation_configs`) plus the configured scoring pipeline,
   logging the result. Gives matched-step PRE/INTERMEDIATE/POST curves
   without any change to tunix.
   """
@@ -44,11 +45,13 @@ def __init__(
       trainer_config: Any,
       test_dataset: Any,
       eval_interval: int,
+      reward_fns: Optional[list[Callable[..., Any]]] = None,
   ):
     self._rl_cluster = rl_cluster
     self._trainer_config = trainer_config
     self._test_dataset = test_dataset
     self._eval_interval = eval_interval
+    self._reward_fns = reward_fns
     self._last_step_evaluated = -1
 
   # The five lifecycle methods below are abstract in `tunix.sft.hooks.TrainingHooks`,
@@ -83,17 +86,19 @@ def on_train_step_end(self, trainer, step, loss):  # noqa: ARG002
     self._last_step_evaluated = outer_step
     try:
       tc = self._trainer_config
-      (corr, total, accuracy, partial_accuracy, format_accuracy), _ = evaluate(
+      (corr, total, accuracy, partial_accuracy, format_accuracy, mean_reward), _ = evaluate(
           tc,
           self._test_dataset,
           rl_cluster=self._rl_cluster,
           num_passes=tc.num_eval_passes,
           corr_lst=tc.eval_corr_lst,
           make_lst=tc.eval_make_lst,
+          reward_fns=self._reward_fns,
       )
       max_logging.warning(
           f"Intermediate Eval (step={outer_step}): {corr=}, {total=},"
-          f" {accuracy=}%, {partial_accuracy=}%, {format_accuracy=}%"
+          f" {accuracy=}%, {partial_accuracy=}%, {format_accuracy=}%,"
+          f" {mean_reward=:.4f}"
       )
     except Exception as e:  # pylint: disable=broad-exception-caught
       max_logging.warning(f"[intermediate-eval] step={outer_step} failed: {e!r}")
diff --git a/src/maxtext/trainers/post_train/rl/train_rl.py b/src/maxtext/trainers/post_train/rl/train_rl.py
@@ -579,7 +579,7 @@ def _reward_fn(**kwargs):
         algo_config=grpo_config,
     )
 
-  return rl_cluster, rl_trainer, optimizer
+  return rl_cluster, rl_trainer, optimizer, reward_fns
 
 
 def rl_train(argv: Sequence[str], kwargs: dict):
@@ -664,7 +664,7 @@ def _rl_train_impl(argv: Sequence[str], kwargs: dict):
     max_logging.log(f"Policy mesh shape: {actor_mesh.shape}")
     max_logging.log(f"Rollout_mesh shape: {rollout_mesh.shape}")
 
-  rl_cluster, rl_trainer, _ = create_rl_components(
+  rl_cluster, rl_trainer, _, reward_fns = create_rl_components(
       trainer_config,
       sampler_config,
       sampler_devices,
@@ -682,16 +682,18 @@ def _rl_train_impl(argv: Sequence[str], kwargs: dict):
     # Update vllm with model parameters from checkpoint
     rl_cluster.rollout.update_params(nnx.state(actor_model))
 
-    (corr, total, accuracy, partial_accuracy, format_accuracy), _ = evaluate(
+    (corr, total, accuracy, partial_accuracy, format_accuracy, mean_reward), _ = evaluate(
         trainer_config,
         test_dataset,
         rl_cluster=rl_cluster,
         num_passes=trainer_config.num_eval_passes,
         corr_lst=trainer_config.eval_corr_lst,
         make_lst=trainer_config.eval_make_lst,
+        reward_fns=reward_fns,
     )
     max_logging.warning(
-        f"Pre RL Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%"
+        f"Pre RL Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%,"
+        f" {format_accuracy=}%, {mean_reward=:.4f}"
     )
 
   # Start training
@@ -701,7 +703,7 @@ def _rl_train_impl(argv: Sequence[str], kwargs: dict):
 
   # Wire intermediate eval: fire greedy `evaluate(...)` every `eval_interval`
   # outer steps. No-op when eval_interval <= 0 or num_test_batches <= 0.
-  utils_rl.install_training_hooks(rl_cluster, trainer_config, test_dataset)
+  utils_rl.install_training_hooks(rl_cluster, trainer_config, test_dataset, reward_fns)
 
   max_logging.warning("Starting RL training...")
   rl_trainer.train(train_dataset)
@@ -718,16 +720,18 @@ def _rl_train_impl(argv: Sequence[str], kwargs: dict):
 
   # Run evaluation after training
   if trainer_config.num_test_batches > 0:
-    (corr, total, accuracy, partial_accuracy, format_accuracy), _ = evaluate(
+    (corr, total, accuracy, partial_accuracy, format_accuracy, mean_reward), _ = evaluate(
         trainer_config,
         test_dataset,
         rl_cluster=rl_cluster,
         num_passes=trainer_config.num_eval_passes,
         corr_lst=trainer_config.eval_corr_lst,
         make_lst=trainer_config.eval_make_lst,
+        reward_fns=reward_fns,
     )
     max_logging.warning(
-        f"Post RL Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%," f" {format_accuracy=}%"
+        f"Post RL Training: {corr=}, {total=}, {accuracy=}%, {partial_accuracy=}%,"
+        f" {format_accuracy=}%, {mean_reward=:.4f}"
     )
 
 
diff --git a/src/maxtext/trainers/post_train/rl/utils_rl.py b/src/maxtext/trainers/post_train/rl/utils_rl.py
@@ -779,9 +779,14 @@ def install_training_hooks(
     rl_cluster: Any,
     trainer_config: Any,
     test_dataset: Any,
+    reward_fns: Optional[list[Callable[..., Any]]] = None,
 ) -> None:
   """Install maxtext's `RLTrainingHooks` on the actor trainer.
 
+  When `reward_fns` is provided, intermediate eval logs the per-example
+  `mean_reward` alongside the correctness metrics, mirroring the training-time
+  reward stack.
+
   No-op if `eval_interval <= 0` or `num_test_batches <= 0` or tunix's hooks
   module is unavailable.
   """
@@ -803,7 +808,7 @@ def install_training_hooks(
   try:
     actor = rl_cluster.actor_trainer
     if getattr(actor, "training_hooks", None) is None:
-      actor.training_hooks = RLTrainingHooks(rl_cluster, trainer_config, test_dataset, eval_interval)
+      actor.training_hooks = RLTrainingHooks(rl_cluster, trainer_config, test_dataset, eval_interval, reward_fns)
       max_logging.warning(
           f"[intermediate-eval] hook installed: evaluate(...) will fire every {eval_interval} outer steps."
       )
diff --git a/tests/post_training/unit/evaluate_rl_test.py b/tests/post_training/unit/evaluate_rl_test.py
@@ -14,6 +14,8 @@
 
 """Unit tests for evaluate_rl.py (CPU-only)."""
 
+# pylint: disable=protected-access
+
 import unittest
 import pytest
 from types import SimpleNamespace
@@ -208,5 +210,96 @@ def test_pass_at_1_all_correct(self):
     self.assertAlmostEqual(has_correct_format, 1.0)
 
 
+class TestComputeRowReward(unittest.TestCase):
+  """Tests for _compute_row_reward (per-prompt eval-time reward aggregation)."""
+
+  def _two_fns(self):
+    """Return two reward fns: a constant 1.0 scorer and a completion-length scorer."""
+
+    # Each fn must accept prompts, completions, answer as keyword args and
+    # return a list of per-completion scores. The helper calls fn once per
+    # response with single-element lists, so the returned list has length 1.
+    def fn1(prompts, completions, answer):  # pylint: disable=unused-argument
+      return [1.0 for _ in completions]
+
+    def fn2(prompts, completions, answer):  # pylint: disable=unused-argument
+      return [float(len(c)) for c in completions]
+
+    return [fn1, fn2]
+
+  @pytest.mark.cpu_only
+  def test_single_response_single_fn(self):
+    def fn(prompts, completions, answer):  # pylint: disable=unused-argument
+      return [2.5 for _ in completions]
+
+    score_sum, count = evaluate_rl._compute_row_reward(
+        reward_fns=[fn],
+        prompt="p",
+        responses=["abc"],
+        answer="gold",
+        row_idx=0,
+    )
+    self.assertAlmostEqual(score_sum, 2.5)
+    self.assertEqual(count, 1)
+
+  @pytest.mark.cpu_only
+  def test_sums_across_reward_fns_for_single_response(self):
+    score_sum, count = evaluate_rl._compute_row_reward(
+        reward_fns=self._two_fns(),
+        prompt="p",
+        responses=["abcd"],
+        answer="gold",
+        row_idx=0,
+    )
+    # fn1 = 1.0, fn2 = len("abcd") = 4.0  ->  summed score for the one response = 5.0
+    self.assertAlmostEqual(score_sum, 5.0)
+    self.assertEqual(count, 1)
+
+  @pytest.mark.cpu_only
+  def test_sums_across_passes_for_multiple_responses(self):
+    """Multi-pass: helper must aggregate across ALL sampled responses, not just [0]."""
+    score_sum, count = evaluate_rl._compute_row_reward(
+        reward_fns=self._two_fns(),
+        prompt="p",
+        responses=["a", "bcd", "ef"],
+        answer="gold",
+        row_idx=0,
+    )
+    # Per pass: pass0 = 1 + 1 = 2,  pass1 = 1 + 3 = 4,  pass2 = 1 + 2 = 3
+    # Sum across 3 passes = 9, count = 3
+    self.assertAlmostEqual(score_sum, 9.0)
+    self.assertEqual(count, 3)
+
+  @pytest.mark.cpu_only
+  def test_empty_responses_returns_zero_and_zero_count(self):
+    """An empty responses list must contribute nothing to the running mean."""
+    score_sum, count = evaluate_rl._compute_row_reward(
+        reward_fns=self._two_fns(),
+        prompt="p",
+        responses=[],
+        answer="gold",
+        row_idx=0,
+    )
+    self.assertEqual(score_sum, 0.0)
+    self.assertEqual(count, 0)
+
+  @pytest.mark.cpu_only
+  def test_exception_in_reward_fn_swallowed_and_returns_zero_count(self):
+    """A raising reward_fn must not propagate and must not corrupt the mean denominator."""
+
+    def _boom(**kwargs):  # pylint: disable=unused-argument
+      raise RuntimeError("reward failure")
+
+    score_sum, count = evaluate_rl._compute_row_reward(
+        reward_fns=[_boom],
+        prompt="p",
+        responses=["abc"],
+        answer="gold",
+        row_idx=0,
+    )
+    self.assertEqual(score_sum, 0.0)
+    self.assertEqual(count, 0)  # zero count so the caller's mean isn't biased
+
+
 if __name__ == "__main__":
   unittest.main()
diff --git a/tests/post_training/unit/rl_hooks_test.py b/tests/post_training/unit/rl_hooks_test.py