Merge pull request #1673 from AI-Hypercomputer:llama4_benchmark

maxtext authors · maxtext authors · commit 62f0f1732f17 · 2025-05-02T16:40:04.000-07:00
PiperOrigin-RevId: 754175426
diff --git a/MaxText/configs/models/llama4-17b-128e.yml b/MaxText/configs/models/llama4-17b-128e.yml
@@ -31,17 +31,12 @@ rope_max_timescale: 500000
 rope_type: "llama3.1"
 rope_use_scale: False
 num_experts: 128
-capacity_factor: -1.0 # TODO: this will be removed once we support dropless with megablox/ragged_dot
 shared_experts: 1
 num_experts_per_tok: 1
 use_qk_norm: False
 nope_layer_interval: 4 # Every fourth layer should NOT use RoPE
 interleave_moe_layer_step: 2 # Every 2nd layer is MoE layer, and 1st layer is dense layer
 
-# TODO: delete the following variables once we add support for dropless with megablox/ragged_dot
-sparse_matmul: False
-megablox: False
-
 temperature_tuning: True
 # Chunk attention is used on all RoPE layers
 # otherwise, on NoPE layers, use global attention
diff --git a/MaxText/configs/models/llama4-17b-16e.yml b/MaxText/configs/models/llama4-17b-16e.yml
@@ -31,7 +31,6 @@ normalization_layer_epsilon: 1e-05
 rope_max_timescale: 500000
 rope_type: "llama3.1"
 num_experts: 16
-capacity_factor: -1.0 # TODO: this will be removed once we support dropless with megablox/ragged_dot
 shared_experts: 1
 num_experts_per_tok: 1
 use_qk_norm: True # Llama4 models apply an L2Norm to the Query and Keys after RoPE
diff --git a/MaxText/pyconfig.py b/MaxText/pyconfig.py
@@ -262,6 +262,8 @@ def validate_llama4_config(keys: dict):
     raise ValueError("Llama4 decoder has not been tested with capacity_factor >= 0 -- please set that value to -1 for now!")
   if keys["num_experts_per_tok"] > 1:
     raise ValueError("Only top-1 routing is supported for Llama4 for now!")
+  if keys["scan_layers"]:
+    raise ValueError("Only unscanned layer is supported for Llama4, and please set scan_layers=False for now!")
   if keys["base_num_decoder_layers"] % keys["interleave_moe_layer_step"] != 0:
     raise ValueError(
         f"The number of decoder layers ({keys['base_num_decoder_layers']}) must be divisible by interleave moe layer step ({keys['interleave_moe_layer_step']})"
diff --git a/MaxText/tests/train_compile_test.py b/MaxText/tests/train_compile_test.py
@@ -567,12 +567,15 @@ def test_moe_llama4_17b_16e(self):
             None,
             os.path.join(PKG_DIR, "configs", "base.yml"),
             f"compiled_trainstep_file={compiled_trainstep_file}",
-            "compile_topology=v6e-256",
+            "compile_topology=v5p-256",
             "compile_topology_num_slices=1",
             "model_name=llama4-17b-16e",
             "per_device_batch_size=1",
             "max_target_length=1024",
             "dtype=bfloat16",
             "weight_dtype=bfloat16",
+            "scan_layers=False",
+            "ici_fsdp_parallelism=32",
+            "ici_tensor_parallelism=4",
         )
     )