Add eval_batch_size knob for faster post-train RL evaluation

Pooya Moradi · Pooya Moradi · commit 87c9f3f29717 · 2026-06-08T18:12:03.000Z
Post-train RL evaluation batched at trainer_config.batch_size, which
for GRPO is intentionally small (e.g. 4 prompts per training step ×
8 generations = 32 trajectories per step — set by the GRPO recipe to
keep trainer HBM workable for the backward pass). At eval time this
is wasteful: eval is greedy decode only (no backward), so the trainer
budget doesn't apply, and vLLM rollout has many DP replicas sitting
idle when only 4 prompts are dispatched per batch.

Add an `eval_batch_size` knob (default -1; any value <= 0 means "use
batch_size", preserving old behavior) that overrides the batch size
used to slice and batch the test split during dataset preparation.
Setting it to e.g. 128 on a sampler with 8 DP replicas gives a ~32x
eval throughput improvement on TPU without affecting training behavior.

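The test split is sliced from test_batch_start_index up to
num_test_batches * eval_batch_size, so with the default start index of
0 the total number of eval examples is num_test_batches *
eval_batch_size. Users should adjust num_test_batches when increasing
eval_batch_size to keep total eval set size constant.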
diff --git a/src/maxtext/configs/post_train/rl.yml b/src/maxtext/configs/post_train/rl.yml
@@ -123,6 +123,12 @@ rollout_micro_batch_size: -1
 # Keep `num_test_batches` low so that evaluation runs quickly. It can be
 # increased to a max. of 330 (if batch size is 4).
 num_test_batches: 5  # 200
+# Optional override: batch size for post-train RL evaluation. Values <= 0 (default
+# -1) use `batch_size`. Set higher (e.g. 32-128) to feed vLLM bigger batches during
+# greedy eval — otherwise eval is bottlenecked by training batch_size, which is small
+# for GRPO (e.g. 4 prompts × 8 generations per step). With test_batch_start_index 0,
+# eval examples = num_test_batches * eval_batch_size; adjust num_test_batches to match.
+eval_batch_size: -1
 test_batch_start_index: 0
 train_fraction: 1.0
 
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -2020,6 +2020,7 @@ class RLDataset(BaseModel):
   batch_size: int = Field(1, description="Global batch size for the dataset loader in RL.")
   num_batches: int = Field(4, description="Number of batches for RL training.")
   num_test_batches: int = Field(5, description="Number of batches for RL evaluation.")
+  eval_batch_size: int = Field(-1, description="Batch size for RL evaluation.")
   test_batch_start_index: int = Field(0, description="Start index for the test dataset")
   train_fraction: float = Field(1.0, description="Fraction of the dataset to be used for training.")
   train_micro_batch_size: int = Field(-1, description="Micro batch size for training.")
diff --git a/src/maxtext/trainers/post_train/rl/train_rl.py b/src/maxtext/trainers/post_train/rl/train_rl.py
@@ -343,11 +343,23 @@ def _use_raw_prompt(x):
   train_dataset = train_dataset.to_iter_dataset().batch(trainer_config.batch_size)
 
   if trainer_config.num_test_batches > 0:
+    # eval_batch_size <= 0 (default -1) → fall back to trainer_config.batch_size
+    # (legacy behavior). Otherwise use the override so vLLM rollout during greedy
+    # eval can pack more prompts per call — important when training batch_size
+    # is small (e.g. 4 for GRPO) but the sampler has enough DP replicas to absorb
+    # a much larger eval batch. The slice below ends at num_test_batches *
+    # eval_batch_size_for_eval (the eval example count when test_batch_start_index
+    # is 0); adjust num_test_batches with eval_batch_size to keep that constant.
+    eval_batch_size_for_eval = (
+        trainer_config.batch_size
+        if getattr(trainer_config, "eval_batch_size", -1) <= 0
+        else trainer_config.eval_batch_size
+    )
     test_dataset = test_dataset.filter(_filter_long_prompts)
     test_dataset = test_dataset[
-        trainer_config.test_batch_start_index : trainer_config.num_test_batches * trainer_config.batch_size
+        trainer_config.test_batch_start_index : trainer_config.num_test_batches * eval_batch_size_for_eval
     ]
-    test_dataset = test_dataset.to_iter_dataset().batch(trainer_config.batch_size)
+    test_dataset = test_dataset.to_iter_dataset().batch(eval_batch_size_for_eval)
 
   return train_dataset, test_dataset