
Commit e6176a0

custom DeepSeek v3
PiperOrigin-RevId: 882309708
1 parent 093ab89 commit e6176a0

12 files changed

Lines changed: 1064 additions & 16 deletions


src/maxtext/common/common_types.py

Lines changed: 1 addition & 0 deletions
@@ -102,6 +102,7 @@ class DecoderBlockType(enum.Enum):
   SIMPLE_MLP = "simple_mlp"
   LLAMA4 = "llama4"
   OLMO3 = "olmo3"
+  DEEPSEEK_CUSTOM = "deepseek_custom"


 class AttentionType(enum.Enum):

src/maxtext/configs/base.yml

Lines changed: 5 additions & 0 deletions
@@ -184,6 +184,11 @@ num_experts_per_tok: 1
 megablox: true
 sparse_matmul: true
 capacity_factor: -1.0 # a factor to decide expert capacity for token dropping, and no dropping by default
+ragged_buffer_factor: -1.0 # a factor to determine the size of the ragged buffer for routed MoE activations.
+# By default (-1), this buffer is sized for the worst case to ensure no dropping.
+# When set to 1.0, this buffer is sized assuming perfectly balanced routing. If the routing dictates
+# a size larger than this, then tokens will be dropped.
+# In general, if ragged_buffer_factor > 0, the ragged buffer size is balanced_size * ragged_buffer_factor.
 load_balance_loss_weight: 0.0 # weight for the load balance loss
 use_random_routing: false # whether to use random routing for debug/test purpose
 use_custom_sort_vjp: true # whether to use a custom VJP sort for efficient backward pass processing in sparse matmul
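
As a rough illustration of the comment above, here is a minimal Python sketch of how a ragged buffer could be sized from this factor. The helper, its arguments, and the per-expert interpretation of balanced_size are assumptions for illustration only, not the MaxText implementation:

def ragged_buffer_size(num_tokens: int, num_experts_per_tok: int, num_experts: int,
                       ragged_buffer_factor: float) -> int:
  """Hypothetical sizing helper for the routed-MoE ragged activation buffer."""
  total_assignments = num_tokens * num_experts_per_tok
  # Per-expert share if routing were perfectly balanced (ceiling division).
  balanced_size = -(-total_assignments // num_experts)
  if ragged_buffer_factor < 0:
    # Default: worst case, every assignment could land on one expert, so nothing is dropped.
    return total_assignments
  # Otherwise scale the balanced size; routing overflow beyond this is dropped.
  return int(balanced_size * ragged_buffer_factor)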
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for DeepSeek Custom
+
+
+base_emb_dim: 16384
+moe_model_dim: 8192
+base_moe_mlp_dim: 16384 # (2 * 8192)
+shared_expert_mlp_dim: 32768 # (4 * 8192)
+
+base_num_decoder_layers: 61
+first_num_dense_layers: 3
+mlp_activations: ["silu","linear"]
+vocab_size: 129280
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-6
+
+num_experts: 256
+num_experts_per_tok: 4 # (1 shared + 4 routed)
+shared_experts: 1
+routed_scaling_factor: 2.5
+routed_score_func: "sigmoid"
+routed_bias: True
+decoder_block: "deepseek_custom"
+
+# Hybrid GQA Attention
+attention_output_dim: 8192 # same as moe_model_dim
+attention_layer_hybrid_ratio: 2 # 1 Local : 1 Global ratio
+inhomogeneous_layer_cycle_interval: 2 # same as attention_layer_hybrid_ratio
+head_dim: 256
+
+local_num_query_heads: 64
+local_num_kv_heads: 8
+sliding_window_size: 1024
+
+global_num_query_heads: 64
+global_num_kv_heads: 4
+
+mscale: 1.0
+# RoPE
+rope_type: "yarn"
+rope_max_timescale: 10_000 # DeepSeek uses "rope_theta": 10000
+max_position_embeddings: 163840
+original_max_position_embeddings: 4096
+rope_factor: 40
+beta_fast: 32
+rope_interleave: True
+rope_truncate: True
+rope_attention_scaling: False
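
For intuition on the hybrid attention settings above, a small sketch (not MaxText code) of how a 1:1 cycle could map layer indices to local (sliding-window) versus global attention. Which position in the cycle is local is an assumption here; the actual assignment lives in the deepseek_custom model code, which is not part of this diff:

def attention_kind(layer_idx: int, cycle_interval: int = 2) -> str:
  """Hypothetical helper: alternate local and global attention within each cycle."""
  # Assumes the first layer of every cycle uses local (sliding-window) attention.
  return "local" if layer_idx % cycle_interval == 0 else "global"

# With attention_layer_hybrid_ratio = 2, four consecutive layers alternate:
# ['local', 'global', 'local', 'global']
print([attention_kind(i, 2) for i in range(4)])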
Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for DeepSeek Custom
+
+base_emb_dim: 1024
+moe_model_dim: 512
+base_moe_mlp_dim: 1024
+shared_expert_mlp_dim: 4096
+num_experts: 16
+num_experts_per_tok: 2
+shared_experts: 1
+base_num_decoder_layers: 4
+first_num_dense_layers: 1
+mlp_activations: ["silu", "linear"]
+vocab_size: 129280
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-6
+routed_scaling_factor: 2.5
+routed_score_func: "sigmoid"
+routed_bias: True
+decoder_block: "deepseek_custom"
+
+
+# Hybrid GQA Attention
+
+attention_output_dim: 512 # same as moe_model_dim
+attention_layer_hybrid_ratio: 2 # 1 Local : 1 Global ratio
+inhomogeneous_layer_cycle_interval: 2 # same as attention_layer_hybrid_ratio
+head_dim: 256
+
+local_num_query_heads: 4
+local_num_kv_heads: 2
+sliding_window_size: 128
+
+global_num_query_heads: 4
+global_num_kv_heads: 1
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for DeepSeek Custom
+
+
+base_emb_dim: 7168
+moe_model_dim: 3072
+base_moe_mlp_dim: 6144 # (2 * 3072)
+shared_expert_mlp_dim: 15360 # (5 * 3072)
+
+base_num_decoder_layers: 61
+first_num_dense_layers: 3
+mlp_activations: ["silu","linear"]
+vocab_size: 129280
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-6
+
+num_experts: 256
+num_experts_per_tok: 8 # (1 shared + 8 routed)
+shared_experts: 1
+routed_scaling_factor: 2.5
+routed_score_func: "sigmoid"
+routed_bias: True
+decoder_block: "deepseek_custom"
+
+# Hybrid GQA Attention
+attention_output_dim: 3072 # same as moe_model_dim
+attention_layer_hybrid_ratio: 2 # 1 Local : 1 Global ratio
+inhomogeneous_layer_cycle_interval: 2 # same as attention_layer_hybrid_ratio
+head_dim: 256
+
+local_num_query_heads: 64
+local_num_kv_heads: 8
+sliding_window_size: 1024
+
+global_num_query_heads: 64
+global_num_kv_heads: 4
+
+mscale: 1.0
+# RoPE
+rope_type: "yarn"
+rope_max_timescale: 10_000 # DeepSeek uses "rope_theta": 10000
+max_position_embeddings: 163840
+original_max_position_embeddings: 4096
+rope_factor: 40
+beta_fast: 32
+rope_interleave: True
+rope_truncate: True
+rope_attention_scaling: False
Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for DeepSeek Custom
+
+
+base_emb_dim: 7168
+moe_model_dim: 3072
+base_moe_mlp_dim: 6144 # (2 * 3072)
+shared_expert_mlp_dim: 15360 # (5 * 3072)
+
+base_num_decoder_layers: 61
+first_num_dense_layers: 3
+mlp_activations: ["silu","linear"]
+vocab_size: 129280
+enable_dropout: False
+logits_via_embedding: False
+normalization_layer_epsilon: 1.0e-6
+
+num_experts: 256
+num_experts_per_tok: 4 # (1 shared + 4 routed)
+shared_experts: 1
+routed_scaling_factor: 2.5
+routed_score_func: "sigmoid"
+routed_bias: True
+decoder_block: "deepseek_custom"
+
+# Hybrid GQA Attention
+attention_output_dim: 3072 # same as moe_model_dim
+attention_layer_hybrid_ratio: 2 # 1 Local : 1 Global ratio
+inhomogeneous_layer_cycle_interval: 2 # same as attention_layer_hybrid_ratio
+head_dim: 256
+
+local_num_query_heads: 64
+local_num_kv_heads: 8
+sliding_window_size: 1024
+
+global_num_query_heads: 64
+global_num_kv_heads: 4
+
+mscale: 1.0
+# RoPE
+rope_type: "yarn"
+rope_max_timescale: 10_000 # DeepSeek uses "rope_theta": 10000
+max_position_embeddings: 163840
+original_max_position_embeddings: 4096
+rope_factor: 40
+beta_fast: 32
+rope_interleave: True
+rope_truncate: True
+rope_attention_scaling: False

src/maxtext/configs/types.py

Lines changed: 1 addition & 0 deletions
@@ -610,6 +610,7 @@ class MoEGeneral(BaseModel):
   num_experts: PositiveInt = Field(1, description="The total number of experts in each MoE layer.")
   num_experts_per_tok: PositiveInt = Field(1, description="The number of experts to route each token to.")
   capacity_factor: float = Field(-1.0, description="Expert capacity factor. If < 0, no token dropping.")
+  ragged_buffer_factor: float = Field(-1.0, description="Ragged buffer factor. If < 0, ragged buffer is worst case size.")
   load_balance_loss_weight: NonNegativeFloat = Field(0.0, description="Weight for the load balancing auxiliary loss.")
   use_custom_sort_vjp: bool = Field(
       True,

src/maxtext/layers/decoders.py

Lines changed: 43 additions & 9 deletions
@@ -42,6 +42,7 @@
 from maxtext.models import (
     deepseek,
     deepseek_batchsplit,
+    deepseek_custom,
     gemma,
     gemma2,
     gemma3,
@@ -458,6 +459,14 @@ def get_decoder_layers(self):
             deepseek.DeepSeekDenseLayerToLinen,
             deepseek.DeepSeekMoELayerToLinen,
         ]
+      case DecoderBlockType.DEEPSEEK_CUSTOM:
+        deepseek_custom_moe_layer = deepseek_custom.DeepSeekMoELayerToLinen
+        if self.config.scan_layers and self.config.attention_layer_hybrid_ratio > 1:
+          deepseek_custom_moe_layer = deepseek_custom.DeepSeekMoEScannableBlockToLinen
+        return [
+            deepseek_custom.DeepSeekDenseLayerToLinen,
+            deepseek_custom_moe_layer,
+        ]
       case DecoderBlockType.GEMMA:
         return [gemma.GemmaDecoderLayerToLinen]
       case DecoderBlockType.GEMMA2:
@@ -525,6 +534,7 @@ def get_norm_layer(self, num_features: int):
         DecoderBlockType.MISTRAL,
         DecoderBlockType.MIXTRAL,
         DecoderBlockType.DEEPSEEK,
+        DecoderBlockType.DEEPSEEK_CUSTOM,
         DecoderBlockType.GEMMA,
         DecoderBlockType.GEMMA2,
         DecoderBlockType.GEMMA3,
@@ -577,7 +587,7 @@ def get_pipeline_stage_module(self, decoder_blocks):
     """get pipeline stage module"""

     def get_layer_to_pipeline(blocks, cfg):
-      if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
+      if cfg.decoder_block in (DecoderBlockType.DEEPSEEK, DecoderBlockType.DEEPSEEK_CUSTOM):
         return blocks[1]  # return the sparse block
       else:
         return blocks[0]
@@ -803,7 +813,7 @@ def __call__(
           if cfg.pipeline_fsdp_ag_once or cfg.pipeline_fsdp_ag_per_repeat
           else None
       )
-      if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
+      if cfg.decoder_block in (DecoderBlockType.DEEPSEEK, DecoderBlockType.DEEPSEEK_CUSTOM):
         assert len(RemattedBlockLayers) == 2, "Scanned layers must have a length of 2 using deepseek."
         dense_layer = RemattedBlockLayers[0]
         moe_layer = RemattedBlockLayers[1]
@@ -849,7 +859,7 @@ def __call__(
         )(y, *broadcast_args)
       else:
         if cfg.scan_layers:
-          if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
+          if cfg.decoder_block in (DecoderBlockType.DEEPSEEK, DecoderBlockType.DEEPSEEK_CUSTOM):
             assert len(RemattedBlockLayers) == 2, "Scanned layers must have a length of 2 using deepseek."
             layer_call_kwargs = {
                 "page_state": page_state,
@@ -927,10 +937,31 @@ def __call__(
                 policy=policy,
             )
           else:
+            scan_length = num_moe_layers
+            if cfg.decoder_block == DecoderBlockType.DEEPSEEK_CUSTOM and cfg.scan_layers:
+              if num_moe_layers % cfg.inhomogeneous_layer_cycle_interval != 0:
+                raise ValueError(
+                    f"num_moe_layers ({num_moe_layers}) must be divisible by "
+                    f"inhomogeneous_layer_cycle_interval ({cfg.inhomogeneous_layer_cycle_interval}) "
+                    "when using DeepSeek Custom and scan_layers is True."
+                )
+              if cfg.attention_layer_hybrid_ratio != cfg.inhomogeneous_layer_cycle_interval:
+                raise ValueError(
+                    f"attention_layer_hybrid_ratio ({cfg.attention_layer_hybrid_ratio}) and "
+                    f"inhomogeneous_layer_cycle_interval ({cfg.inhomogeneous_layer_cycle_interval}) "
+                    "must be the same."
+                )
+              scan_length = num_moe_layers // cfg.inhomogeneous_layer_cycle_interval
+              max_logging.log(
+                  f"scan_length: {scan_length}, "
+                  f"num_moe_layers // cfg.inhomogeneous_layer_cycle_interval: "
+                  f"{num_moe_layers // cfg.inhomogeneous_layer_cycle_interval}"
+              )
+
             y, _ = self.scan_decoder_layers(
                 cfg,
                 moe_layer,
-                num_moe_layers,
+                scan_length,
                 "moe_layers",
                 mesh,
                 in_axes_tuple=(nn.broadcast,) * len(broadcast_args),
@@ -968,7 +999,7 @@ def __call__(
             **layer_kwargs,
         )(y, *broadcast_args)
     else:
-      if cfg.decoder_block == DecoderBlockType.DEEPSEEK:
+      if cfg.decoder_block in (DecoderBlockType.DEEPSEEK, DecoderBlockType.DEEPSEEK_CUSTOM):
         assert len(RemattedBlockLayers) == 2, "Unscanned layers must have a length of 2 using deepseek."
         dense_layer = RemattedBlockLayers[0]
         moe_layer = RemattedBlockLayers[1]
@@ -1058,11 +1089,14 @@ def __call__(
         kv_caches["key_cache"][lyr] = returned_cache[0]
         kv_caches["value_cache"][lyr] = returned_cache[1]

-        if deepstack_visual_embeds is not None and lyr < len(deepstack_visual_embeds):
-          visual_embeds = deepstack_visual_embeds[lyr]
+        if (
+            deepstack_visual_embeds is not None
+            and lyr < len(deepstack_visual_embeds)
+            and bidirectional_mask is not None
+            and deepstack_visual_embeds[lyr] is not None
+        ):
           # Use bidirectional_mask to identify visual token positions
-          if bidirectional_mask is not None and visual_embeds is not None:
-            y = deepstack_process(y, bidirectional_mask, visual_embeds)
+          y = deepstack_process(y, bidirectional_mask, deepstack_visual_embeds[lyr])

     assert isinstance(y, jax.Array)

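To make the scan-length check above concrete, here is a standalone sketch using values from the 61-layer config in this commit (3 dense layers leave 58 MoE layers); the numbers are illustrative only and the checks simply restate the new validation in decoders.py:

num_moe_layers = 58                      # 61 decoder layers - 3 dense layers
inhomogeneous_layer_cycle_interval = 2   # one local + one global layer per cycle
attention_layer_hybrid_ratio = 2

# Same validation the new decoders.py branch performs before scanning.
if num_moe_layers % inhomogeneous_layer_cycle_interval != 0:
  raise ValueError("num_moe_layers must be divisible by inhomogeneous_layer_cycle_interval")
if attention_layer_hybrid_ratio != inhomogeneous_layer_cycle_interval:
  raise ValueError("attention_layer_hybrid_ratio must equal inhomogeneous_layer_cycle_interval")

# Each scan step now covers one full local/global cycle.
scan_length = num_moe_layers // inhomogeneous_layer_cycle_interval  # 29
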
src/maxtext/layers/linears.py

Lines changed: 1 addition & 0 deletions
@@ -474,6 +474,7 @@ def get_norm_layer(self, num_features: int):
         DecoderBlockType.GEMMA3,
         DecoderBlockType.QWEN3,
         DecoderBlockType.DEEPSEEK,
+        DecoderBlockType.DEEPSEEK_CUSTOM,
         DecoderBlockType.LLAMA4,
     ):
       return functools.partial(normalizations.RMSNorm, num_features=num_features)
