Add intermediate eval hook: fire evaluate() every eval_interval outer steps

Pooya Moradi · Pooya Moradi · commit c1b42ca5f520 · 2026-06-02T21:25:34.000Z
`eval_interval` was a silently-dead config: even though it's plumbed into
tunix's `RLTrainingConfig.eval_every_n_steps`, tunix's `_run_eval` is a
no-op unless an `eval_ds` is passed to `trainer.train()`. And even if you
do pass one, tunix's default GRPO eval re-runs the full sampled rollout
(num_generations responses per prompt), which is ~3hr/eval and impractical
for trajectory monitoring.

Install a `tunix.sft.hooks.TrainingHooks` subclass that hooks
`on_train_step_end`, checks `rl_cluster.global_steps % eval_interval`,
and calls maxtext's own `evaluate(...)` (greedy decode + the configured
scoring pipeline). Gives matched-step PRE / step_N / POST trajectory
logging at near-zero cost beyond the eval itself (which is already fast
when `eval_batch_size` is set per commit d536d13).

No-op when eval_interval <= 0 or num_test_batches <= 0. Soft-skips with
a warning if tunix.sft.hooks isn't importable, so the launcher still
works against a stock-only tunix.
diff --git a/src/maxtext/trainers/post_train/rl/train_rl.py b/src/maxtext/trainers/post_train/rl/train_rl.py
@@ -352,6 +352,103 @@ def _use_raw_prompt(x):
   return train_dataset, test_dataset
 
 
+def _install_intermediate_eval_hook(
+    rl_cluster: Any,
+    trainer_config: Any,
+    test_dataset: Any,
+) -> None:
+  """Install a hook that runs `evaluate(...)` every `eval_interval` outer steps.
+
+  Motivation (per the change author): tunix's `eval_every_n_steps` does
+  nothing unless an `eval_ds` is passed to `trainer.train()`, and its default
+  eval re-runs the full sampled GRPO rollout, which is too slow for monitoring.
+
+  Args:
+    rl_cluster: Supplies `global_steps` and `actor_trainer`; passed to `evaluate`.
+    trainer_config: Read for `num_test_batches`, `eval_interval`,
+      `num_eval_passes`, `eval_corr_lst` and `eval_make_lst`.
+    test_dataset: Dataset handed to `evaluate(...)` each time the hook fires.
+
+  Returns early without installing when `num_test_batches <= 0`,
+  `eval_interval <= 0`, or `tunix.sft.hooks` cannot be imported. An existing
+  `actor_trainer.training_hooks` is never overwritten; a warning is logged.
+  """
+  if trainer_config.num_test_batches <= 0:
+    return
+  eval_interval = int(getattr(trainer_config, "eval_interval", 0))
+  if eval_interval <= 0:
+    return
+  try:
+    # Soft-import: if `tunix.sft.hooks` is missing we warn and skip instead of crashing the launcher.
+    from tunix.sft import hooks as _hk  # pylint: disable=import-outside-toplevel
+  except ImportError:
+    max_logging.warning("[intermediate-eval] tunix.sft.hooks not importable; skipping hook" " install.")
+    return
+
+  state: dict = {"last_step_evaluated": -1}  # last outer step that triggered an eval; -1 means none yet
+
+  class _IntermediateEvalHook(_hk.TrainingHooks):  # type: ignore[name-defined]
+    """Training hook where only `on_train_step_end` acts; every other callback is a no-op."""
+
+    def on_train_start(self, train_ctx):  # noqa: ARG002
+      del train_ctx
+
+    def on_train_end(self, train_ctx):  # noqa: ARG002
+      del train_ctx
+
+    def on_train_step_start(self, train_ctx):  # noqa: ARG002
+      del train_ctx
+
+    def on_eval_step_start(self, train_ctx):  # noqa: ARG002
+      del train_ctx
+
+    def on_eval_step_end(self, train_ctx, *args, **kwargs):  # noqa: ARG002
+      del train_ctx, args, kwargs
+
+    def on_train_step_end(self, trainer, step, loss):  # noqa: ARG002
+      """Run `evaluate(...)` at most once per outer step that is a positive multiple of `eval_interval`."""
+      del trainer, loss
+      try:
+        outer_step = int(rl_cluster.global_steps)
+      except Exception:  # pylint: disable=broad-exception-caught
+        outer_step = int(step) if step is not None else -1  # fallback: hook-supplied step; -1 never fires
+      if outer_step <= 0 or outer_step == state["last_step_evaluated"]:
+        return  # step <= 0, or this outer step has already been evaluated
+      if outer_step % eval_interval != 0:
+        return  # not on an eval boundary
+      state["last_step_evaluated"] = outer_step  # recorded before evaluating, so a failed eval is not retried
+      try:
+        (corr, total, accuracy, partial_accuracy, format_accuracy), _ = evaluate(
+            trainer_config,
+            test_dataset,
+            rl_cluster=rl_cluster,
+            num_passes=trainer_config.num_eval_passes,
+            corr_lst=trainer_config.eval_corr_lst,
+            make_lst=trainer_config.eval_make_lst,
+        )
+        max_logging.warning(
+            f"Intermediate Eval (step={outer_step}): {corr=}, {total=},"
+            f" {accuracy=}%, {partial_accuracy=}%, {format_accuracy=}%"
+        )
+      except Exception as e:  # pylint: disable=broad-exception-caught
+        max_logging.warning(f"[intermediate-eval] step={outer_step} failed: {e!r}")
+
+  # `training_hooks` is treated as a single slot on the actor trainer: fill it only when empty, else warn.
+  try:
+    actor = rl_cluster.actor_trainer
+    if getattr(actor, "training_hooks", None) is None:
+      actor.training_hooks = _IntermediateEvalHook()
+      max_logging.warning(
+          "[intermediate-eval] hook installed: evaluate(...) will fire every" f" {eval_interval} outer steps."
+      )
+    else:
+      max_logging.warning(
+          "[intermediate-eval] actor.training_hooks already set; skipping" " install (chain manually if you need both)."
+      )
+  except Exception as e:  # pylint: disable=broad-exception-caught
+    max_logging.warning(f"[intermediate-eval] install failed: {e!r}")
+
+
 def create_rl_components(
     trainer_config,
     sampler_config,
@@ -693,6 +790,10 @@ def _rl_train_impl(argv: Sequence[str], kwargs: dict):
     max_logging.log("Capturing reference model state before training.")
     ref_state_before = nnx.to_pure_dict(nnx.state(reference_model.base, nnx.Param))
 
+  # Wire intermediate eval: fire greedy `evaluate(...)` every `eval_interval`
+  # outer steps. No-op when eval_interval <= 0 or num_test_batches <= 0.
+  _install_intermediate_eval_hook(rl_cluster, trainer_config, test_dataset)
+
   max_logging.warning("Starting RL training...")
   rl_trainer.train(train_dataset)