Add intermediate eval hook: fire evaluate() every eval_interval outer steps

Pooya Moradi · Pooya Moradi · commit 0f6177ee81d9 · 2026-06-04T20:40:09.000Z
`eval_interval` was a silently-dead config: even though it's plumbed into
tunix's `RLTrainingConfig.eval_every_n_steps`, tunix's `_run_eval` is a
no-op unless an `eval_ds` is passed to `trainer.train()`. And even if you
do pass one, tunix's default GRPO eval re-runs the full sampled rollout
(num_generations responses per prompt), which is ~3hr/eval and impractical
for trajectory monitoring.

Install a `tunix.sft.hooks.TrainingHooks` subclass that hooks
`on_train_step_end`, checks `rl_cluster.global_steps % eval_interval`,
and calls maxtext's own `evaluate(...)` (greedy decode + the configured
scoring pipeline). Gives matched-step PRE / step_N / POST trajectory
logging at near-zero cost beyond the eval itself (which is already fast
when `eval_batch_size` is set per commit d536d13).

No-op when eval_interval <= 0 or num_test_batches <= 0. Soft-skips with
a warning if tunix.sft.hooks isn't importable, so the launcher still
works against a stock-only tunix.
diff --git a/src/maxtext/trainers/post_train/rl/hooks.py b/src/maxtext/trainers/post_train/rl/hooks.py
@@ -0,0 +1,131 @@
+# Copyright 2023–2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Training hooks for post-train RL."""
+
+from typing import Any
+
+from tunix.sft import hooks as _tunix_hooks
+
+from maxtext.trainers.post_train.rl.evaluate_rl import evaluate
+from maxtext.utils import max_logging
+
+
+class RLTrainingHooks(_tunix_hooks.TrainingHooks):
+  """Run maxtext's `evaluate(...)` every `eval_interval` outer RL steps.
+
+  tunix's `eval_every_n_steps` in `RLTrainingConfig` is silently dead unless
+  an `eval_ds` is passed to `trainer.train()`, and even then tunix's default
+  `_run_eval` re-runs the full GRPO rollout (`num_generations` sampled per
+  prompt), which is ~3hr/eval and impractical for trajectory monitoring.
+
+  This class overrides `on_train_step_end`, checks
+  `rl_cluster.global_steps % eval_interval`, and calls maxtext's
+  `evaluate(...)` — greedy decode + the configured scoring pipeline —
+  logging the result. Gives matched-step PRE/INTERMEDIATE/POST curves
+  without any change to tunix.
+
+  Evaluation is best-effort: a failing eval is logged and training continues.
+  A non-positive `eval_interval` disables the hook; all other hook methods
+  are intentional no-ops.
+  """
+
+  def __init__(
+      self,
+      rl_cluster: Any,
+      trainer_config: Any,
+      test_dataset: Any,
+      eval_interval: int,
+  ):
+    """Store what is needed to run an intermediate evaluation.
+
+    Args:
+      rl_cluster: Source of `global_steps` (the outer step counter); also
+        forwarded to `evaluate(...)`.
+      trainer_config: Forwarded to `evaluate(...)`; must expose
+        `num_eval_passes`, `eval_corr_lst` and `eval_make_lst`.
+      test_dataset: Dataset forwarded to `evaluate(...)`.
+      eval_interval: Evaluate whenever the outer step is a positive multiple
+        of this value. A value <= 0 disables the hook.
+    """
+    self._rl_cluster = rl_cluster
+    self._trainer_config = trainer_config
+    self._test_dataset = test_dataset
+    self._eval_interval = eval_interval
+    # Last outer step an evaluation was attempted for; -1 means "none yet".
+    self._last_step_evaluated = -1
+
+  def on_train_start(self, train_ctx):  # noqa: ARG002
+    """Intentionally a no-op."""
+    del train_ctx
+
+  def on_train_end(self, train_ctx):  # noqa: ARG002
+    """Intentionally a no-op."""
+    del train_ctx
+
+  def on_train_step_start(self, train_ctx):  # noqa: ARG002
+    """Intentionally a no-op."""
+    del train_ctx
+
+  def on_eval_step_start(self, train_ctx):  # noqa: ARG002
+    """Intentionally a no-op."""
+    del train_ctx
+
+  def on_eval_step_end(self, train_ctx, *args, **kwargs):  # noqa: ARG002
+    """Intentionally a no-op."""
+    del train_ctx, args, kwargs
+
+  def on_train_step_end(self, trainer, step, loss):  # noqa: ARG002
+    """Fire `evaluate(...)` once per `eval_interval` outer steps.
+
+    Args:
+      trainer: Unused.
+      step: Fallback step counter, used only if `rl_cluster.global_steps`
+        cannot be read.
+      loss: Unused.
+    """
+    del trainer, loss
+    # A non-positive interval means "disabled". Checking here keeps the `%`
+    # below from raising ZeroDivisionError (interval 0) or matching on a
+    # negative interval; that line sits outside the try/except around evaluate.
+    if self._eval_interval <= 0:
+      return
+    try:
+      outer_step = int(self._rl_cluster.global_steps)
+    except Exception:  # pylint: disable=broad-exception-caught
+      outer_step = int(step) if step is not None else -1
+    if outer_step <= 0 or outer_step == self._last_step_evaluated:
+      return
+    if outer_step % self._eval_interval != 0:
+      return
+    # Mark the step before evaluating so a failed eval is not retried if the
+    # hook fires again for the same outer step.
+    self._last_step_evaluated = outer_step
+    try:
+      tc = self._trainer_config
+      (corr, total, accuracy, partial_accuracy, format_accuracy), _ = evaluate(
+          tc,
+          self._test_dataset,
+          rl_cluster=self._rl_cluster,
+          num_passes=tc.num_eval_passes,
+          corr_lst=tc.eval_corr_lst,
+          make_lst=tc.eval_make_lst,
+      )
+      max_logging.warning(
+          f"Intermediate Eval (step={outer_step}): {corr=}, {total=},"
+          f" {accuracy=}%, {partial_accuracy=}%, {format_accuracy=}%"
+      )
+    except Exception as e:  # pylint: disable=broad-exception-caught
+      # Monitoring must never abort training: log and carry on.
+      max_logging.warning(f"[intermediate-eval] step={outer_step} failed: {e!r}")
diff --git a/src/maxtext/trainers/post_train/rl/train_rl.py b/src/maxtext/trainers/post_train/rl/train_rl.py
@@ -693,6 +693,10 @@ def _rl_train_impl(argv: Sequence[str], kwargs: dict):
     max_logging.log("Capturing reference model state before training.")
     ref_state_before = nnx.to_pure_dict(nnx.state(reference_model.base, nnx.Param))
 
+  # Wire intermediate eval: fire greedy `evaluate(...)` every `eval_interval`
+  # outer steps. No-op when eval_interval <= 0 or num_test_batches <= 0.
+  utils_rl.install_training_hooks(rl_cluster, trainer_config, test_dataset)
+
   max_logging.warning("Starting RL training...")
   rl_trainer.train(train_dataset)
 
diff --git a/src/maxtext/trainers/post_train/rl/utils_rl.py b/src/maxtext/trainers/post_train/rl/utils_rl.py
@@ -760,3 +760,57 @@ def parse(
     return super().parse(
         messages=formatted_messages, add_generation_prompt=add_generation_prompt, is_first_msg=is_first_msg
     )
+
+
+def install_training_hooks(
+    rl_cluster: Any,
+    trainer_config: Any,
+    test_dataset: Any,
+) -> None:
+  """Install maxtext's `RLTrainingHooks` on the actor trainer.
+
+  Best-effort: every failure is logged as a warning and training proceeds
+  without intermediate eval.
+
+  No-op if `eval_interval <= 0` (or unset/None), if `num_test_batches <= 0`,
+  if the hooks module cannot be imported, or if the actor trainer already has
+  `training_hooks` set.
+
+  Args:
+    rl_cluster: Cluster whose `actor_trainer` receives the hook.
+    trainer_config: Config providing `num_test_batches` and, optionally,
+      `eval_interval`.
+    test_dataset: Dataset handed to `RLTrainingHooks` for evaluation.
+  """
+  if trainer_config.num_test_batches <= 0:
+    return
+  # `or 0` makes an explicit `eval_interval=None` behave like a missing
+  # attribute instead of letting `int(None)` raise TypeError.
+  eval_interval = int(getattr(trainer_config, "eval_interval", 0) or 0)
+  if eval_interval <= 0:
+    return
+  try:
+    # Soft-import keeps the launcher usable against a stock-only tunix. The
+    # hooks module hard-imports `tunix.sft.hooks`, so the ImportError surfaces
+    # here when tunix doesn't have it.
+    from maxtext.trainers.post_train.rl.hooks import RLTrainingHooks  # pylint: disable=import-outside-toplevel
+  except ImportError as e:
+    # Log the actual error: the hooks module has other imports too, so the
+    # failure is not necessarily `tunix.sft.hooks` itself.
+    max_logging.warning(f"[intermediate-eval] RLTrainingHooks not importable ({e!r}); skipping hook install.")
+    return
+
+  # PeftTrainer composes a single training_hooks; install if free, else warn.
+  try:
+    actor = rl_cluster.actor_trainer
+    if getattr(actor, "training_hooks", None) is None:
+      actor.training_hooks = RLTrainingHooks(rl_cluster, trainer_config, test_dataset, eval_interval)
+      max_logging.warning(
+          f"[intermediate-eval] hook installed: evaluate(...) will fire every {eval_interval} outer steps."
+      )
+    else:
+      max_logging.warning(
+          "[intermediate-eval] actor.training_hooks already set; skipping install (chain manually if you need both)."
+      )
+  except Exception as e:  # pylint: disable=broad-exception-caught
+    max_logging.warning(f"[intermediate-eval] install failed: {e!r}")