@@ -734,7 +734,7 @@ def test_gpt3_6b(self):
734734 "" ,
735735 get_test_config_path (),
736736 f"compiled_trainstep_file={ compiled_trainstep_file } " ,
737- "compile_topology=v5p-256 " ,
737+ "compile_topology=v5p-8 " ,
738738 "compile_topology_num_slices=1" ,
739739 "model_name=gpt3-6b" ,
740740 "per_device_batch_size=1" ,
@@ -766,7 +766,7 @@ def test_qwen3_next(self):
766766 "" ,
767767 get_test_config_path (),
768768 f"compiled_trainstep_file={ compiled_trainstep_file } " ,
769- "compile_topology=v5p-256 " ,
769+ "compile_topology=v5p-64 " ,
770770 "compile_topology_num_slices=1" ,
771771 "model_name=qwen3-next-80b-a3b" ,
772772 "per_device_batch_size=1" ,
@@ -796,9 +796,6 @@ def test_deepseek32(self):
796796 "use_tokamax_splash=True" ,
797797 "dtype=bfloat16" ,
798798 "weight_dtype=bfloat16" ,
799- # without_device_limit
800- "n_routing_groups=-1" ,
801- "topk_routing_group=-1" ,
802799 )
803800 )
804801
@@ -948,9 +945,9 @@ def test_circular_pipeline_ag_per_repeat_ep_ds(self):
948945 )
949946
950947 @pytest .mark .cpu_only
951- def test_qk_clip (self ):
952- """AOT test for qk-clip with DeepSeek3 Tiny model"""
953- compiled_trainstep_file = "/tmp/test_qk_clip .pickle"
948+ def test_qk_clip_with_dot_product (self ):
949+ """AOT test for AdamW optimizer with QK clip on dot product attention for DeepSeek3 Tiny model"""
950+ compiled_trainstep_file = "/tmp/test_qk_clip_with_dot_product .pickle"
954951 train_compile_main (
955952 (
956953 "" ,
@@ -963,13 +960,47 @@ def test_qk_clip(self):
963960 "sparse_matmul=True" ,
964961 "megablox=True" ,
965962 "use_tokamax_gmm=False" ,
966- # TODO(agagik): update to flash after support
963+ "max_target_length=128" ,
964+ "per_device_batch_size=1" ,
965+ "dtype=bfloat16" ,
966+ "weight_dtype=float32" ,
967+ # dot product
967968 "attention=dot_product" ,
968969 "use_tokamax_splash=True" ,
970+ # qk
971+ "use_qk_clip=true" ,
972+ "qk_clip_threshold=100" ,
973+ )
974+ )
975+
976+ @pytest .mark .cpu_only
977+ def test_muon_clip_with_tokamax_splash (self ):
978+ """AOT test for Muon optimizer with QK clip on tokamax splash attention for DeepSeek3 Tiny model"""
979+ compiled_trainstep_file = "/tmp/test_muon_clip_with_tokamax_splash.pickle"
980+ train_compile_main (
981+ (
982+ "" ,
983+ get_test_config_path (),
984+ f"compiled_trainstep_file={ compiled_trainstep_file } " ,
985+ "compile_topology=v5p-8" ,
986+ "compile_topology_num_slices=1" ,
987+ "model_name=deepseek3-tiny" ,
988+ "scan_layers=True" ,
989+ "sparse_matmul=True" ,
990+ "megablox=True" ,
991+ "use_tokamax_gmm=False" ,
969992 "max_target_length=128" ,
970993 "per_device_batch_size=1" ,
971994 "dtype=bfloat16" ,
972995 "weight_dtype=float32" ,
996+ # tokamax splash
997+ "attention=flash" ,
998+ "use_tokamax_splash=True" ,
999+ # muon
1000+ "opt_type=muon" ,
1001+ "muon_consistent_rms=0.2" ,
1002+ "muon_weight_decay=0.1" ,
1003+ # qk
9731004 "use_qk_clip=true" ,
9741005 "qk_clip_threshold=100" ,
9751006 )