
Commit b305ca1

custom DeepSeek v3
PiperOrigin-RevId: 882309708
1 parent 093ab89 commit b305ca1

11 files changed: 1043 additions & 38 deletions


src/maxtext/common/common_types.py

Lines changed: 1 addition & 0 deletions
@@ -102,6 +102,7 @@ class DecoderBlockType(enum.Enum):
   SIMPLE_MLP = "simple_mlp"
   LLAMA4 = "llama4"
   OLMO3 = "olmo3"
+  DEEPSEEK_CUSTOM = "deepseek_custom"
 
 
 class AttentionType(enum.Enum):

src/maxtext/configs/base.yml

Lines changed: 9 additions & 0 deletions
@@ -155,6 +155,12 @@ base_num_kv_heads: 16
 base_mlp_dim: 7168
 base_num_decoder_layers: 16
 head_dim: 128
+attention_output_dim: -1
+local_num_query_heads: -1
+local_num_kv_heads: -1
+global_num_query_heads: -1
+global_num_kv_heads: -1
+attention_layer_hybrid_ratio: -1
 mlp_activations: ["silu", "linear"]
 mlp_activations_limit: -1.0
 dropout_rate: 0.0

@@ -240,6 +246,8 @@ use_2d_fsdp_sharding: False
 
 # deepseek moe
 base_moe_mlp_dim: 7168 # intermediate dimension at MoE layer. For a fully MoE model, base_mlp_dim must be equal to base_moe_mlp_dim.
+moe_model_dim: -1 # dimension of tokens entering the MoE layer.
+shared_expert_mlp_dim: -1 # intermediate dimension of the shared expert.
 first_num_dense_layers: 0 # number of initial dense layers in the model
 shared_experts: 1
 routed_scaling_factor: 1.0 # scaling factor for routing scores

@@ -484,6 +492,7 @@ logical_axis_rules: [
   ['embed_no_exp', ['fsdp', 'fsdp_transpose', 'sequence', 'context']],
   ['embed_no_exp', ['fsdp', 'sequence', 'context']],
   ['embed_tensor_transpose', ['tensor_transpose']],
+  ['attention_out_proj', ['fsdp', 'fsdp_transpose', 'sequence', 'tensor_transpose', 'context', 'expert']],
   ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'tensor_transpose', 'expert']],
   ['q_lora', ['fsdp', 'sequence', 'context', 'tensor_transpose', 'expert']],
   ['q_lora', ['fsdp', 'fsdp_transpose', 'sequence', 'context', 'expert']],
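The six new attention keys and the two new MoE keys all default to -1, the same "unset" sentinel used by mlp_activations_limit above, so existing model configs are unaffected; the DeepSeek Custom configs below override all of them. The new attention_out_proj rule presumably exists because attention_output_dim lets the attention output projection differ in width from the model dim, so that weight needs its own logical sharding axis.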
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for DeepSeek Custom
+
+
+base_emb_dim: 16384
+moe_model_dim: 8192
+base_moe_mlp_dim: 16384 # (2 * 8192)
+shared_expert_mlp_dim: 32768 # (4 * 8192)
+
+base_num_decoder_layers: 61
+first_num_dense_layers: 3
+mlp_activations: ["silu","linear"]
+vocab_size: 129280
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-6
+
+num_experts: 256
+num_experts_per_tok: 4 # (1 shared + 4 routed)
+shared_experts: 1
+routed_scaling_factor: 2.5
+routed_score_func: "sigmoid"
+routed_bias: True
+decoder_block: "deepseek_custom"
+
+# Hybrid GQA Attention
+attention_output_dim: 8192 # same as moe_model_dim
+attention_layer_hybrid_ratio: 2 # 1 Local : 1 Global ratio
+inhomogeneous_layer_cycle_interval: 2 # same as attention_layer_hybrid_ratio
+head_dim: 256
+
+local_num_query_heads: 64
+local_num_kv_heads: 8
+sliding_window_size: 1024
+
+global_num_query_heads: 64
+global_num_kv_heads: 4
+
+mscale: 1.0
+# RoPE
+rope_type: "yarn"
+rope_max_timescale: 10_000 # DeepSeek uses "rope_theta": 10000
+max_position_embeddings: 163840
+original_max_position_embeddings: 4096
+rope_factor: 40
+beta_fast: 32
+rope_interleave: True
+rope_truncate: True
+rope_attention_scaling: False
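The hybrid split shows up directly in the KV-cache budget: global layers keep only 4 KV heads but over the full 163,840-token context, while local layers keep 8 KV heads over just the 1,024-token sliding window. A back-of-envelope sketch, assuming standard GQA caching of one K and one V tensor per layer in bf16 (illustrative arithmetic, not a MaxText measurement):

head_dim = 256
bytes_per_value = 2  # bf16

def kv_bytes_per_token(num_kv_heads: int) -> int:
  # K and V each store num_kv_heads * head_dim values per cached token.
  return 2 * num_kv_heads * head_dim * bytes_per_value

print(kv_bytes_per_token(8))  # local layer:  8192 bytes/token, capped at 1024 cached tokens
print(kv_bytes_per_token(4))  # global layer: 4096 bytes/token over the full context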
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for DeepSeek Custom
+
+base_emb_dim: 1024
+moe_model_dim: 512
+base_moe_mlp_dim: 1024
+shared_expert_mlp_dim: 4096
+num_experts: 16
+num_experts_per_tok: 2
+shared_experts: 1
+base_num_decoder_layers: 4
+first_num_dense_layers: 1
+mlp_activations: ["silu", "linear"]
+vocab_size: 129280
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-6
+routed_scaling_factor: 2.5
+routed_score_func: "sigmoid"
+routed_bias: True
+decoder_block: "deepseek_custom"
+
+
+# Hybrid GQA Attention
+
+attention_output_dim: 512 # same as moe_model_dim
+attention_layer_hybrid_ratio: 2 # 1 Local : 1 Global ratio
+inhomogeneous_layer_cycle_interval: 2 # same as attention_layer_hybrid_ratio
+head_dim: 256
+
+local_num_query_heads: 4
+local_num_kv_heads: 2
+sliding_window_size: 128
+
+global_num_query_heads: 4
+global_num_kv_heads: 1
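For a sanity check on scale, this test config's MoE block can be sized by hand, assuming the usual gated-MLP expert layout implied by mlp_activations: ["silu", "linear"] (two input projections plus one output projection). Illustrative arithmetic, not a MaxText parameter count:

moe_model_dim, moe_mlp_dim, shared_mlp_dim = 512, 1024, 4096
num_experts, num_experts_per_tok = 16, 2

per_routed_expert = 3 * moe_model_dim * moe_mlp_dim   # wi_0 (silu), wi_1 (linear), wo
shared_expert = 3 * moe_model_dim * shared_mlp_dim
total = num_experts * per_routed_expert + shared_expert
active = num_experts_per_tok * per_routed_expert + shared_expert
print(f"{total:,} total MoE params, {active:,} active per token")
# 31,457,280 total MoE params, 9,437,184 active per token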
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for DeepSeek Custom
+
+
+base_emb_dim: 7168
+moe_model_dim: 3072
+base_moe_mlp_dim: 6144 # (2 * 3072)
+shared_expert_mlp_dim: 15360 # (5 * 3072)
+
+base_num_decoder_layers: 61
+first_num_dense_layers: 3
+mlp_activations: ["silu","linear"]
+vocab_size: 129280
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-6
+
+num_experts: 256
+num_experts_per_tok: 4 # (1 shared + 4 routed)
+shared_experts: 1
+routed_scaling_factor: 2.5
+routed_score_func: "sigmoid"
+routed_bias: True
+decoder_block: "deepseek_custom"
+
+# Hybrid GQA Attention
+attention_output_dim: 3072 # same as moe_model_dim
+attention_layer_hybrid_ratio: 2 # 1 Local : 1 Global ratio
+inhomogeneous_layer_cycle_interval: 2 # same as attention_layer_hybrid_ratio
+head_dim: 256
+
+local_num_query_heads: 64
+local_num_kv_heads: 8
+sliding_window_size: 1024
+
+global_num_query_heads: 64
+global_num_kv_heads: 4
+
+mscale: 1.0
+# RoPE
+rope_type: "yarn"
+rope_max_timescale: 10_000 # DeepSeek uses "rope_theta": 10000
+max_position_embeddings: 163840
+original_max_position_embeddings: 4096
+rope_factor: 40
+beta_fast: 32
+rope_interleave: True
+rope_truncate: True
+rope_attention_scaling: False
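These YaRN knobs follow the usual pattern: max_position_embeddings = rope_factor * original_max_position_embeddings (40 * 4096 = 163,840), and mscale: 1.0 with rope_attention_scaling: False appears to disable YaRN's optional attention-temperature adjustment. A minimal sketch of the standard YaRN frequency schedule these parameters feed, assuming the reference formulation with beta_slow taken as 1 (MaxText's rope_interleave/rope_truncate handling may differ in detail):

import math

def yarn_inv_freq(dim=256, base=10_000, factor=40, orig_len=4096,
                  beta_fast=32, beta_slow=1):
  def correction_dim(num_rotations):
    # Head dimension whose RoPE wavelength completes num_rotations turns at orig_len.
    return dim * math.log(orig_len / (num_rotations * 2 * math.pi)) / (2 * math.log(base))

  low = max(math.floor(correction_dim(beta_fast)), 0)
  high = min(math.ceil(correction_dim(beta_slow)), dim - 1)
  inv_freq = []
  for i in range(0, dim, 2):
    freq = base ** (-i / dim)
    ramp = min(max((i / 2 - low) / max(high - low, 1), 0.0), 1.0)
    # ramp ~ 0: high-frequency dims, extrapolated unchanged;
    # ramp ~ 1: low-frequency dims, interpolated by 1/factor.
    inv_freq.append(freq * ((1 - ramp) + ramp / factor))
  return inv_freq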

src/maxtext/configs/types.py

Lines changed: 13 additions & 0 deletions
@@ -223,6 +223,9 @@ class ProfilerType(str, Enum):
     "deepseek3-tiny",
     "deepseek3.2-671b",
     "deepseek-custom",
+    "deepseek3-custom-small",
+    "deepseek3-custom",
+    "deepseek3-custom-large",
     "kimi-k2-1t",
     "gemma-7b",
     "gemma-2b",

@@ -430,6 +433,14 @@ class ModelArchitecture(BaseModel):
   base_mlp_dim: int = Field(7168, description="Base dimension of the MLP layer.")
   base_num_decoder_layers: int = Field(16, description="Base number of decoder layers.")
   head_dim: int = Field(128, description="Dimension of each attention head.")
+  attention_output_dim: int = Field(-1, description="Override output dimension for the attention block.")
+  local_num_query_heads: int = Field(-1, description="Number of query heads in local context layers.")
+  local_num_kv_heads: int = Field(-1, description="Number of KV heads in local context layers.")
+  global_num_query_heads: int = Field(-1, description="Number of query heads in global context layers.")
+  global_num_kv_heads: int = Field(-1, description="Number of KV heads in global context layers.")
+  attention_layer_hybrid_ratio: int = Field(
+      -1, description="Ratio of layer context styles (e.g. 5 means 4 local layers followed by 1 global)."
+  )
   mlp_activations: list[str] = Field(["silu", "linear"], description="Activation functions in the MLP layer.")
   mlp_activations_limit: float = Field(
       -1.0,

@@ -707,6 +718,8 @@ class DeepSeekMoE(BaseModel):
   """Configuration specific to DeepSeek-style MoE layers."""
 
   base_moe_mlp_dim: int = Field(7168, description="Intermediate dimension at MoE layer (DeepSeek style).")
+  moe_model_dim: int = Field(-1, description="Dimension of tokens entering the MoE layer.")
+  shared_expert_mlp_dim: int = Field(-1, description="Intermediate dimension for the shared expert.")
   first_num_dense_layers: NonNegativeInt = Field(0, description="Number of initial dense layers in the model.")
   shared_experts: PositiveInt = Field(1, description="Number of shared experts.")
   routed_scaling_factor: float = Field(1.0, description="Scaling factor for routing scores.")
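The attention_layer_hybrid_ratio description fixes the schedule: a ratio of r means each repeating cycle runs r-1 local-attention layers followed by one global layer, and the DeepSeek Custom configs set r = 2, alternating local and global. A hypothetical helper (not MaxText code) that spells out the assignment:

def attention_kind(layer_idx: int, hybrid_ratio: int) -> str:
  """hybrid_ratio r => r-1 local layers, then 1 global layer, repeating."""
  if hybrid_ratio <= 1:
    return "global"  # assumption: no hybrid schedule when the ratio is unset
  return "global" if layer_idx % hybrid_ratio == hybrid_ratio - 1 else "local"

print([attention_kind(i, 5) for i in range(5)])  # 4 local, then 1 global
print([attention_kind(i, 2) for i in range(4)])  # the configs' "1 Local : 1 Global"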

src/maxtext/layers/decoders.py

Lines changed: 43 additions & 13 deletions
@@ -42,6 +42,7 @@
 from maxtext.models import (
     deepseek,
     deepseek_batchsplit,
+    deepseek_custom,
     gemma,
     gemma2,
     gemma3,

@@ -52,7 +53,6 @@
     mistral,
     mixtral,
     olmo3,
-    qwen2,
     qwen3,
     simple_layer,
 )

@@ -458,6 +458,14 @@ def get_decoder_layers(self):
             deepseek.DeepSeekDenseLayerToLinen,
             deepseek.DeepSeekMoELayerToLinen,
         ]
+      case DecoderBlockType.DEEPSEEK_CUSTOM:
+        deepseek_custom_moe_layer = deepseek_custom.DeepSeekMoELayerToLinen
+        if self.config.scan_layers and self.config.attention_layer_hybrid_ratio > 1:
+          deepseek_custom_moe_layer = deepseek_custom.DeepSeekMoEScannableBlockToLinen
+        return [
+            deepseek_custom.DeepSeekDenseLayerToLinen,
+            deepseek_custom_moe_layer,
+        ]
       case DecoderBlockType.GEMMA:
         return [gemma.GemmaDecoderLayerToLinen]
       case DecoderBlockType.GEMMA2:

@@ -468,8 +476,6 @@ def get_decoder_layers(self):
         return [gpt3.Gpt3DecoderLayerToLinen]
       case DecoderBlockType.GPT_OSS:
         return [gpt_oss.GptOssScannableBlockToLinen] if self.config.scan_layers else [gpt_oss.GptOssDecoderLayerToLinen]
-      case DecoderBlockType.QWEN2:
-        return [qwen2.Qwen2DecoderLayerToLinen]
       case DecoderBlockType.QWEN3:
         return [qwen3.Qwen3DecoderLayerToLinen]
       case DecoderBlockType.QWEN3_MOE:

@@ -525,10 +531,10 @@ def get_norm_layer(self, num_features: int):
         DecoderBlockType.MISTRAL,
         DecoderBlockType.MIXTRAL,
         DecoderBlockType.DEEPSEEK,
+        DecoderBlockType.DEEPSEEK_CUSTOM,
         DecoderBlockType.GEMMA,
         DecoderBlockType.GEMMA2,
         DecoderBlockType.GEMMA3,
-        DecoderBlockType.QWEN2,
         DecoderBlockType.QWEN3,
         DecoderBlockType.QWEN3_MOE,
         DecoderBlockType.GPT_OSS,

@@ -577,7 +583,7 @@ def get_pipeline_stage_module(self, decoder_blocks):
     """get pipeline stage module"""
 
     def get_layer_to_pipeline(blocks, cfg):
-      if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
+      if cfg.decoder_block in (DecoderBlockType.DEEPSEEK, DecoderBlockType.DEEPSEEK_CUSTOM):
        return blocks[1]  # return the sparse block
       else:
         return blocks[0]

@@ -803,7 +809,7 @@ def __call__(
         if cfg.pipeline_fsdp_ag_once or cfg.pipeline_fsdp_ag_per_repeat
         else None
     )
-    if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
+    if cfg.decoder_block in (DecoderBlockType.DEEPSEEK, DecoderBlockType.DEEPSEEK_CUSTOM):
       assert len(RemattedBlockLayers) == 2, "Scanned layers must have a length of 2 using deepseek."
       dense_layer = RemattedBlockLayers[0]
       moe_layer = RemattedBlockLayers[1]

@@ -849,7 +855,7 @@ def __call__(
           )(y, *broadcast_args)
     else:
       if cfg.scan_layers:
-        if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
+        if cfg.decoder_block in (DecoderBlockType.DEEPSEEK, DecoderBlockType.DEEPSEEK_CUSTOM):
          assert len(RemattedBlockLayers) == 2, "Scanned layers must have a length of 2 using deepseek."
          layer_call_kwargs = {
              "page_state": page_state,

@@ -927,10 +933,31 @@ def __call__(
                 policy=policy,
             )
           else:
+            scan_length = num_moe_layers
+            if cfg.decoder_block == DecoderBlockType.DEEPSEEK_CUSTOM and cfg.scan_layers:
+              if num_moe_layers % cfg.inhomogeneous_layer_cycle_interval != 0:
+                raise ValueError(
+                    f"num_moe_layers ({num_moe_layers}) must be divisible by "
+                    f"inhomogeneous_layer_cycle_interval ({cfg.inhomogeneous_layer_cycle_interval}) "
+                    "when using DeepSeek Custom and scan_layers is True."
+                )
+              if cfg.attention_layer_hybrid_ratio != cfg.inhomogeneous_layer_cycle_interval:
+                raise ValueError(
+                    f"attention_layer_hybrid_ratio ({cfg.attention_layer_hybrid_ratio}) and "
+                    f"inhomogeneous_layer_cycle_interval ({cfg.inhomogeneous_layer_cycle_interval}) "
+                    "must be the same."
+                )
+              scan_length = num_moe_layers // cfg.inhomogeneous_layer_cycle_interval
+              max_logging.log(
+                  f"scan_length: {scan_length}, "
+                  f"num_moe_layers // cfg.inhomogeneous_layer_cycle_interval: "
+                  f"{num_moe_layers // cfg.inhomogeneous_layer_cycle_interval}"
+              )
+
             y, _ = self.scan_decoder_layers(
                 cfg,
                 moe_layer,
-                num_moe_layers,
+                scan_length,
                 "moe_layers",
                 mesh,
                 in_axes_tuple=(nn.broadcast,) * len(broadcast_args),
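For the full-size config above (61 layers, 3 of them dense, cycle interval 2) this gives 58 MoE layers folded into 29 scanned local/global pairs. The same checks and arithmetic as a standalone sketch:

def moe_scan_length(num_moe_layers: int, hybrid_ratio: int, cycle_interval: int) -> int:
  """Mirrors the validation above: the scan iterates over whole attention cycles."""
  if num_moe_layers % cycle_interval != 0:
    raise ValueError("num_moe_layers must be divisible by inhomogeneous_layer_cycle_interval")
  if hybrid_ratio != cycle_interval:
    raise ValueError("attention_layer_hybrid_ratio must equal inhomogeneous_layer_cycle_interval")
  return num_moe_layers // cycle_interval

print(moe_scan_length(61 - 3, 2, 2))  # 29: each scan step runs one local/global pair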
@@ -968,7 +995,7 @@ def __call__(
               **layer_kwargs,
           )(y, *broadcast_args)
       else:
-        if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
+        if cfg.decoder_block in (DecoderBlockType.DEEPSEEK, DecoderBlockType.DEEPSEEK_CUSTOM):
          assert len(RemattedBlockLayers) == 2, "Unscanned layers must have a length of 2 using deepseek."
          dense_layer = RemattedBlockLayers[0]
          moe_layer = RemattedBlockLayers[1]

@@ -1058,11 +1085,14 @@ def __call__(
           kv_caches["key_cache"][lyr] = returned_cache[0]
           kv_caches["value_cache"][lyr] = returned_cache[1]
 
-        if deepstack_visual_embeds is not None and lyr < len(deepstack_visual_embeds):
-          visual_embeds = deepstack_visual_embeds[lyr]
+        if (
+            deepstack_visual_embeds is not None
+            and lyr < len(deepstack_visual_embeds)
+            and bidirectional_mask is not None
+            and deepstack_visual_embeds[lyr] is not None
+        ):
           # Use bidirectional_mask to identify visual token positions
-          if bidirectional_mask is not None and visual_embeds is not None:
-            y = deepstack_process(y, bidirectional_mask, visual_embeds)
+          y = deepstack_process(y, bidirectional_mask, deepstack_visual_embeds[lyr])
 
       assert isinstance(y, jax.Array)

src/maxtext/layers/linears.py

Lines changed: 1 addition & 0 deletions
@@ -474,6 +474,7 @@ def get_norm_layer(self, num_features: int):
         DecoderBlockType.GEMMA3,
         DecoderBlockType.QWEN3,
         DecoderBlockType.DEEPSEEK,
+        DecoderBlockType.DEEPSEEK_CUSTOM,
         DecoderBlockType.LLAMA4,
     ):
       return functools.partial(normalizations.RMSNorm, num_features=num_features)
