Add intermediate eval hook: fire evaluate() every eval_interval outer steps

Pooya Moradi · Pooya Moradi · commit f2c32acf86f4 · 2026-06-04T20:32:26.000Z
`eval_interval` was a silently-dead config: even though it's plumbed into
tunix's `RLTrainingConfig.eval_every_n_steps`, tunix's `_run_eval` is a
no-op unless an `eval_ds` is passed to `trainer.train()`. And even if you
do pass one, tunix's default GRPO eval re-runs the full sampled rollout
(num_generations responses per prompt), which is ~3hr/eval and impractical
for trajectory monitoring.

Install a `tunix.sft.hooks.TrainingHooks` subclass that hooks
`on_train_step_end`, checks `rl_cluster.global_steps % eval_interval`,
and calls maxtext's own `evaluate(...)` (greedy decode + the configured
scoring pipeline). Gives matched-step PRE / step_N / POST trajectory
logging at near-zero cost beyond the eval itself (which is already fast
when `eval_batch_size` is set per commit d536d13).

No-op when eval_interval <= 0 or num_test_batches <= 0. Soft-skips with
a warning if tunix.sft.hooks isn't importable, so the launcher still
works against a stock-only tunix.
diff --git a/src/maxtext/trainers/post_train/rl/hooks.py b/src/maxtext/trainers/post_train/rl/hooks.py
@@ -0,0 +1,117 @@
+# Copyright 2023–2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Training hooks for post-train RL."""
+
+from typing import Any
+
+from maxtext.trainers.post_train.rl.evaluate_rl import evaluate
+from maxtext.utils import max_logging
+
+
+def install_intermediate_eval_hook(
+    rl_cluster: Any,
+    trainer_config: Any,
+    test_dataset: Any,
+) -> None:
+  """Fire `evaluate(...)` every `eval_interval` outer steps during training.
+
+  tunix's `eval_every_n_steps` in `RLTrainingConfig` is silently dead unless
+  an `eval_ds` is passed to `trainer.train()`, and even then tunix's default
+  `_run_eval` re-runs the full GRPO rollout (`num_generations` sampled per
+  prompt), which is ~3hr/eval and impractical for trajectory monitoring.
+
+  This hook subclasses `tunix.sft.hooks.TrainingHooks` and at every
+  `eval_interval` outer step (matched against `rl_cluster.global_steps`)
+  calls maxtext's `evaluate(...)` — greedy decode + the configured scoring
+  pipeline — and logs the result. Gives matched-step PRE/INTERMEDIATE/POST
+  curves without any change to tunix.
+
+  No-op if `eval_interval <= 0` or `num_test_batches <= 0` or tunix's hooks
+  module is unavailable.
+  """
+  if trainer_config.num_test_batches <= 0:
+    return
+  eval_interval = int(getattr(trainer_config, "eval_interval", 0))
+  if eval_interval <= 0:
+    return
+  try:
+    # Soft-import: keeps the launcher usable against a stock-only tunix.
+    from tunix.sft import hooks as _hk  # pylint: disable=import-outside-toplevel
+  except ImportError:
+    max_logging.warning("[intermediate-eval] tunix.sft.hooks not importable; skipping hook install.")
+    return
+
+  state: dict = {"last_step_evaluated": -1}
+
+  class _IntermediateEvalHook(_hk.TrainingHooks):  # type: ignore[name-defined]
+    """Fires `evaluate(...)` every `eval_interval` outer steps."""
+
+    def on_train_start(self, train_ctx):  # noqa: ARG002
+      del train_ctx
+
+    def on_train_end(self, train_ctx):  # noqa: ARG002
+      del train_ctx
+
+    def on_train_step_start(self, train_ctx):  # noqa: ARG002
+      del train_ctx
+
+    def on_eval_step_start(self, train_ctx):  # noqa: ARG002
+      del train_ctx
+
+    def on_eval_step_end(self, train_ctx, *args, **kwargs):  # noqa: ARG002
+      del train_ctx, args, kwargs
+
+    def on_train_step_end(self, trainer, step, loss):  # noqa: ARG002
+      """Fire `evaluate(...)` once per `eval_interval` outer steps."""
+      del trainer, loss
+      try:
+        outer_step = int(rl_cluster.global_steps)
+      except Exception:  # pylint: disable=broad-exception-caught
+        outer_step = int(step) if step is not None else -1
+      if outer_step <= 0 or outer_step == state["last_step_evaluated"]:
+        return
+      if outer_step % eval_interval != 0:
+        return
+      state["last_step_evaluated"] = outer_step
+      try:
+        (corr, total, accuracy, partial_accuracy, format_accuracy), _ = evaluate(
+            trainer_config,
+            test_dataset,
+            rl_cluster=rl_cluster,
+            num_passes=trainer_config.num_eval_passes,
+            corr_lst=trainer_config.eval_corr_lst,
+            make_lst=trainer_config.eval_make_lst,
+        )
+        max_logging.warning(
+            f"Intermediate Eval (step={outer_step}): {corr=}, {total=},"
+            f" {accuracy=}%, {partial_accuracy=}%, {format_accuracy=}%"
+        )
+      except Exception as e:  # pylint: disable=broad-exception-caught
+        max_logging.warning(f"[intermediate-eval] step={outer_step} failed: {e!r}")
+
+  # PeftTrainer composes a single training_hooks; install if free, else warn.
+  try:
+    actor = rl_cluster.actor_trainer
+    if getattr(actor, "training_hooks", None) is None:
+      actor.training_hooks = _IntermediateEvalHook()
+      max_logging.warning(
+          f"[intermediate-eval] hook installed: evaluate(...) will fire every {eval_interval} outer steps."
+      )
+    else:
+      max_logging.warning(
+          "[intermediate-eval] actor.training_hooks already set; skipping install (chain manually if you need both)."
+      )
+  except Exception as e:  # pylint: disable=broad-exception-caught
+    max_logging.warning(f"[intermediate-eval] install failed: {e!r}")
diff --git a/src/maxtext/trainers/post_train/rl/train_rl.py b/src/maxtext/trainers/post_train/rl/train_rl.py
@@ -118,6 +118,7 @@ def _no_bf16_to_f32_cast(val, tgt_dtype, src_key):
 from maxtext.utils.globals import MAXTEXT_CONFIGS_DIR
 from maxtext.integration.vllm.maxtext_vllm_rollout import MaxTextVllmRollout
 from maxtext.trainers.post_train.rl.evaluate_rl import evaluate
+from maxtext.trainers.post_train.rl import hooks as rl_hooks
 from maxtext.trainers.post_train.rl import utils_rl
 from maxtext.input_pipeline.instruction_data_processing import load_data_template_from_file
 from maxtext.utils import max_logging, max_utils, model_creation_utils
@@ -693,6 +694,10 @@ def _rl_train_impl(argv: Sequence[str], kwargs: dict):
     max_logging.log("Capturing reference model state before training.")
     ref_state_before = nnx.to_pure_dict(nnx.state(reference_model.base, nnx.Param))
 
+  # Wire intermediate eval: fire greedy `evaluate(...)` every `eval_interval`
+  # outer steps. No-op when eval_interval <= 0 or num_test_batches <= 0.
+  rl_hooks.install_intermediate_eval_hook(rl_cluster, trainer_config, test_dataset)
+
   max_logging.warning("Starting RL training...")
   rl_trainer.train(train_dataset)