Merge pull request #4030 from AI-Hypercomputer:pr/eval-batch-size

Google-ML-Automation · Google-ML-Automation · commit eded896dd6e5 · 2026-06-09T15:51:32.000-07:00
PiperOrigin-RevId: 929448807
diff --git a/src/maxtext/configs/post_train/rl.yml b/src/maxtext/configs/post_train/rl.yml
@@ -123,6 +123,12 @@ rollout_micro_batch_size: -1
 # Keep `num_test_batches` low so that evaluation runs quickly. It can be
 # increased to a max. of 330 (if batch size is 4).
 num_test_batches: 5  # 200
+# Optional override: batch size used during post-train RL evaluation. Any value <= 0
+# (default: -1) uses `batch_size`. Set higher (e.g. 32-128) to feed vLLM bigger batches
+# during greedy eval — otherwise eval is bottlenecked by training batch_size, which is
+# small for GRPO (e.g. 4 prompts × 8 generations per step). Eval examples are at most
+# num_test_batches * eval_batch_size - test_batch_start_index, so adjust num_test_batches.
+eval_batch_size: -1
 test_batch_start_index: 0
 train_fraction: 1.0
 
diff --git a/src/maxtext/configs/types.py b/src/maxtext/configs/types.py
@@ -2039,6 +2039,7 @@ class RLDataset(BaseModel):
   batch_size: int = Field(1, description="Global batch size for the dataset loader in RL.")
   num_batches: int = Field(4, description="Number of batches for RL training.")
   num_test_batches: int = Field(5, description="Number of batches for RL evaluation.")
+  eval_batch_size: int = Field(-1, description="Batch size for RL evaluation.")
   test_batch_start_index: int = Field(0, description="Start index for the test dataset")
   train_fraction: float = Field(1.0, description="Fraction of the dataset to be used for training.")
   train_micro_batch_size: int = Field(-1, description="Micro batch size for training.")
diff --git a/src/maxtext/trainers/post_train/rl/train_rl.py b/src/maxtext/trainers/post_train/rl/train_rl.py
@@ -342,11 +342,23 @@ def _use_raw_prompt(x):
   train_dataset = train_dataset.to_iter_dataset().batch(trainer_config.batch_size)
 
   if trainer_config.num_test_batches > 0:
+    # eval_batch_size <= 0 (default -1) → use trainer_config.batch_size (legacy
+    # behavior). Otherwise use the override so vLLM rollout during greedy eval
+    # can pack more prompts per call — important when training batch_size is
+    # small (e.g. 4 for GRPO) but the sampler has enough DP replicas to absorb
+    # a much larger eval batch. The eval slice starts at test_batch_start_index
+    # and ends at num_test_batches * eval_batch_size_for_eval; adjust
+    # num_test_batches when changing eval_batch_size to keep eval set size constant.
+    eval_batch_size_for_eval = (
+        trainer_config.batch_size
+        if getattr(trainer_config, "eval_batch_size", -1) <= 0
+        else trainer_config.eval_batch_size
+    )
     test_dataset = test_dataset.filter(_filter_long_prompts)
     test_dataset = test_dataset[
-        trainer_config.test_batch_start_index : trainer_config.num_test_batches * trainer_config.batch_size
+        trainer_config.test_batch_start_index : trainer_config.num_test_batches * eval_batch_size_for_eval
     ]
-    test_dataset = test_dataset.to_iter_dataset().batch(trainer_config.batch_size)
+    test_dataset = test_dataset.to_iter_dataset().batch(eval_batch_size_for_eval)
 
   return train_dataset, test_dataset