Merge pull request #3761 from AI-Hypercomputer:chengnuojin-fix-eval

Google-ML-Automation · Google-ML-Automation · commit 8c1a4d217f26 · 2026-04-28T17:10:21.000-07:00
PiperOrigin-RevId: 907255273
diff --git a/src/maxtext/trainers/pre_train/train.py b/src/maxtext/trainers/pre_train/train.py
@@ -87,9 +87,7 @@ def get_first_step(model, state):
 # -----------------------------------------------------------------------------
 
 
-def loss_fn(
-    model, config, data, dropout_rng, params, sparsity_state=None, is_train=True
-):
+def loss_fn(model, config, data, dropout_rng, params, sparsity_state=None, is_train=True):
   """loss_fn for both train and eval.
 
   Args:
@@ -121,9 +119,7 @@ def loss_fn(
   # make its specific collection mutable so the MTPBlock can sow into it.
   if config.mtp_eval_target_module > 0 and not is_train:
     mutable_collections.append("mtp_acceptance")
-  sparsity_enabled = (
-      is_train and config.weight_sparsity_n and config.weight_sparsity_m
-  )
+  sparsity_enabled = is_train and config.weight_sparsity_n and config.weight_sparsity_m
   if sparsity_enabled:
     mutable_collections.append("batch_stats")
   if isinstance(model, nn.Module):
@@ -143,9 +139,7 @@ def loss_fn(
         data["inputs_position"],
         decoder_segment_ids=data["inputs_segmentation"],
         encoder_images=data["images"] if config.use_multimodal else None,
-        encoder_image_masks=data["image_masks"]
-        if config.use_multimodal and "image_masks" in data
-        else None,
+        encoder_image_masks=data["image_masks"] if config.use_multimodal and "image_masks" in data else None,
         enable_dropout=config.enable_dropout if is_train else False,
         rngs={"dropout": rng1, "params": aqt_rng},
         mutable=mutable_collections,
@@ -286,11 +280,7 @@ def loss_fn(
       "indexer_loss": indexer_loss,
       "moe_bias_updates": moe_bias_updates,
       "mtp_loss": mtp_loss,
-      "batch_stats": (
-          intermediate_outputs.get("batch_stats", None)
-          if hasattr(intermediate_outputs, "get")
-          else None
-      ),
+      "batch_stats": (intermediate_outputs.get("batch_stats", None) if hasattr(intermediate_outputs, "get") else None),
   }
   return loss, aux
 
@@ -416,9 +406,7 @@ def move(path, value):
   if sparsity_enabled:
     full_grads = {"params": grads}
     if sparsity_enabled and "batch_stats" in state.params:
-      batch_stats_grads = jax.tree_util.tree_map(
-          jnp.zeros_like, state.params.get("batch_stats", {})
-      )
+      batch_stats_grads = jax.tree_util.tree_map(jnp.zeros_like, state.params.get("batch_stats", {}))
       full_grads["batch_stats"] = batch_stats_grads
     full_grads = max_utils.unbox_logicallypartioned(full_grads)
   else:
@@ -501,9 +489,7 @@ def eval_step(model, config, state, data, dropout_rng):
   batch_stats = state.params.get("batch_stats", {})
 
   eval_loss_fn = functools.partial(_loss_fn, model, config, data, dropout_rng, is_train=False)
-  loss, aux = eval_loss_fn(
-      pure_params, *extra_dpo_args, sparsity_state=batch_stats
-  )
+  loss, aux = eval_loss_fn(pure_params, *extra_dpo_args, sparsity_state=batch_stats)
 
   mtp_acceptance_rate = 0.0
   if config.mtp_eval_target_module > 0:
@@ -630,6 +616,8 @@ def train_loop(config, recorder, state=None):
         eval_step_count = 0
         # pylint: disable=not-callable
         for eval_batch in eval_data_iterator:
+          # Shard input eval data
+          eval_batch = jax.device_put(eval_batch, sharding.get_input_data_sharding(config, mesh))
           if config.eval_steps > 0 and eval_step_count >= config.eval_steps:
             break
           with jax.set_mesh(mesh), nn_partitioning.axis_rules(config.logical_axis_rules):
diff --git a/tests/integration/smoke/train_smoke_test.py b/tests/integration/smoke/train_smoke_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-""" Smoke test """
+"""Smoke test"""
 import os
 import unittest
 
@@ -94,6 +94,36 @@ def test_tiny_config_no_scan(self):
         ]
     )
 
+  def test_tiny_eval(self):  # Runs train_main once on a tiny synthetic-data config with eval flags set.
+    test_tmpdir = os.environ.get("TEST_TMPDIR")  # pylint: disable=unused-variable
+    train_main(
+        [
+            None,
+            get_test_config_path(),
+            # pylint: disable=f-string-without-interpolation
+            f"base_output_directory={self.base_output_directory}",
+            "run_name=runner_test",
+            r"dataset_path={self.dataset_path}",  # raw (not f-) string: the braces are passed through literally
+            "base_emb_dim=8",
+            "base_num_query_heads=4",
+            "base_num_kv_heads=4",
+            "base_mlp_dim=32",
+            "base_num_decoder_layers=1",
+            "head_dim=128",
+            "per_device_batch_size=2",
+            "max_target_length=128",
+            "dataset_type=synthetic",
+            "steps=5",
+            "eval_steps=2",
+            "eval_interval=10",  # NOTE(review): larger than steps=5 -- confirm the eval loop is actually reached
+            "enable_checkpointing=False",
+            rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
+            "enable_goodput_recording=False",
+            "enable_checkpoint_cloud_logger=False",
+            "monitor_goodput=False",
+        ]
+    )
+
+
   def test_qwen3_custom_moe_config(self):
     test_tmpdir = os.environ.get("TEST_TMPDIR")  # pylint: disable=unused-variable
     train_main(
@@ -159,6 +189,38 @@ def test_tiny_config_explicit_shardmode(self):
         ]
     )
 
+  def test_eval_explicit_shardmode(self):  # Same tiny synthetic-data run as test_tiny_eval, in explicit shard mode.
+    test_tmpdir = os.environ.get("TEST_TMPDIR")  # pylint: disable=unused-variable
+    train_main(
+        [
+            None,
+            get_test_config_path(),
+            # pylint: disable=f-string-without-interpolation
+            f"base_output_directory={self.base_output_directory}",
+            "run_name=runner_test",
+            r"dataset_path={self.dataset_path}",  # raw (not f-) string: the braces are passed through literally
+            "base_emb_dim=8",
+            "base_num_query_heads=4",
+            "base_num_kv_heads=4",
+            "base_mlp_dim=32",
+            "base_num_decoder_layers=1",
+            "head_dim=128",
+            "per_device_batch_size=2",
+            "max_target_length=128",
+            "dataset_type=synthetic",
+            "steps=5",
+            "eval_steps=2",
+            "eval_interval=10",  # NOTE(review): larger than steps=5 -- confirm the eval loop is actually reached
+            "shard_mode=explicit",  # this and the next override are the only differences from test_tiny_eval
+            "remove_size_one_mesh_axis_from_type=false",
+            "enable_checkpointing=False",
+            rf"tokenizer_path={os.path.join(MAXTEXT_ASSETS_ROOT, 'tokenizers', 'tokenizer.llama2')}",
+            "enable_goodput_recording=False",
+            "enable_checkpoint_cloud_logger=False",
+            "monitor_goodput=False",
+        ]
+    )
+
+
 
 if __name__ == "__main__":
   absltest.main()