Skip to content

Commit e9fa350

Browse files
committed
.
1 parent f3099c3 commit e9fa350

3 files changed

Lines changed: 25 additions & 12 deletions

File tree

tests/end_to_end/tpu/deepseek/Run_DeepSeek.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,10 @@ python3 -m maxtext.trainers.post_train.sft.train_sft_deprecated src/maxtext/conf
170170
```
171171

172172
## Continued pre-training for V3.2 Sparse Attention
173-
**DeepSeek Sparse Attention (DSA)** enhances the Multi-Head Latent Attention (MLA) architecture by introducing a **Lightning Indexer**, which selects the top-k tokens for attention. DeepSeek-V3.2 is instantiated from DeepSeek-V3.1 and undergoes continued pre-training to adapt this indexer via a two-stage strategy: **Dense Warm-up** and **Sparse Training**.
173+
174+
**DeepSeek Sparse Attention (DSA)** enhances the Multi-Head Latent Attention (MLA) architecture by introducing a **Lightning Indexer**, which selects the top-k tokens for attention. Note that the Indexer is activated only if `max_target_length` > `indexer_topk` (2048).
175+
176+
DeepSeek-V3.2 is instantiated from DeepSeek-V3.1 and undergoes continued pre-training to adapt this indexer via a two-stage strategy: **Dense Warm-up** and **Sparse Training**.
174177

175178
1. **Dense Warm-up Stage**
176179
The indexer is trained exclusively using dense indexer loss while all other model parameters remain frozen.
@@ -186,6 +189,7 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
186189
async_checkpointing=false \
187190
ici_fsdp_parallelism=128 \
188191
steps=5 \
192+
# Indexer is activated only if max_target_length > indexer_topk (2048)
189193
max_target_length=4096 \
190194
attention=flash \
191195
dtype=bfloat16 \
@@ -212,6 +216,7 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
212216
async_checkpointing=false \
213217
ici_fsdp_parallelism=128 \
214218
steps=5 \
219+
# Indexer is activated only if max_target_length > indexer_topk (2048)
215220
max_target_length=4096 \
216221
attention=flash \
217222
dtype=bfloat16 \

tests/end_to_end/tpu/kimi/Run_Kimi.md

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,11 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
7272
dataset_type=synthetic \
7373
scan_layers=True \
7474
use_ring_of_experts=True \
75+
# muon optimizer
7576
opt_type=muon \
7677
muon_consistent_rms=0.2 \
7778
muon_weight_decay=0.1 \
79+
# qk clip
7880
use_qk_clip=True \
7981
qk_clip_threshold=100
8082
```
@@ -109,9 +111,11 @@ python3 -m maxtext.trainers.pre_train.train src/maxtext/configs/base.yml \
109111
scan_layers=True \
110112
load_parameters_path=${SCANNED_CHECKPOINT?} \
111113
use_ring_of_experts=True \
114+
# muon optimizer
112115
opt_type=muon \
113116
muon_consistent_rms=0.2 \
114117
muon_weight_decay=0.1 \
118+
# qk clip
115119
use_qk_clip=True \
116120
qk_clip_threshold=100
117121
```
@@ -122,18 +126,21 @@ Example command to run decoding with Kimi K2. Given its 1T size, high tensor par
122126
```sh
123127
python3 -m maxtext.inference.decode src/maxtext/configs/base.yml \
124128
base_output_directory=${BASE_OUTPUT_DIRECTORY?} \
125-
load_parameters_path=${CONVERTED_CHECKPOINT?} \
126129
run_name=kimi_decode \
127-
per_device_batch_size=1 \
128130
model_name=kimi-k2-1t \
129-
max_target_length=2048 \
130131
tokenizer_type=huggingface \
131132
tokenizer_path=moonshotai/Kimi-K2-Instruct \
133+
hf_access_token=${HF_TOKEN?} \
134+
load_parameters_path=${UNSCANNED_CKPT_PATH?} \
135+
scan_layers=False \
136+
enable_checkpointing=true \
137+
async_checkpointing=false \
138+
per_device_batch_size=1 \
139+
max_target_length=2048 \
132140
attention=dot_product \
133141
ici_tensor_parallelism=128 \
134142
ici_fsdp_parallelism=1 \
135-
prompt="The primary goal of agentic intelligence is to " \
136-
scan_layers=False
143+
prompt="The primary goal of agentic intelligence is to "
137144
```
138145

139146
## Correctness
@@ -158,6 +165,8 @@ python3 -m tests.assets.logits_generation.generate_hf_golden_logits \
158165
--trust-remote-code=True
159166
```
160167

168+
Run the command below to compare logits between HuggingFace and MaxText.
169+
161170
```sh
162171
JAX_PLATFORMS=cpu python3 -m tests.forward_pass_logit_checker \
163172
src/maxtext/configs/base.yml \

tests/unit/train_compile_test.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -569,7 +569,6 @@ def test_moe_deepseek_scanned_bf16(self):
569569
)
570570
)
571571

572-
@pytest.mark.skip(reason="Fix sharding issue of all layers of DeepSeek")
573572
@pytest.mark.cpu_only
574573
def test_moe_deepseek_unscanned_bf16(self):
575574
temp_dir = gettempdir()
@@ -964,10 +963,10 @@ def test_qk_clip_with_dot_product(self):
964963
"per_device_batch_size=1",
965964
"dtype=bfloat16",
966965
"weight_dtype=float32",
967-
# dot product
966+
# dot product attention
968967
"attention=dot_product",
969968
"use_tokamax_splash=True",
970-
# qk
969+
# qk clip
971970
"use_qk_clip=true",
972971
"qk_clip_threshold=100",
973972
)
@@ -993,14 +992,14 @@ def test_muon_clip_with_tokamax_splash(self):
993992
"per_device_batch_size=1",
994993
"dtype=bfloat16",
995994
"weight_dtype=float32",
996-
# tokamax splash
995+
# tokamax splash attention
997996
"attention=flash",
998997
"use_tokamax_splash=True",
999-
# muon
998+
# muon optimizer
1000999
"opt_type=muon",
10011000
"muon_consistent_rms=0.2",
10021001
"muon_weight_decay=0.1",
1003-
# qk
1002+
# qk clip
10041003
"use_qk_clip=true",
10051004
"qk_clip_threshold=100",
10061005
)

0 commit comments

Comments
 (0)