
Commit dcb9e2b

Merge pull request #3416 from AI-Hypercomputer:atwigg/add_qwen3_base
PiperOrigin-RevId: 886494773
2 parents 4a49563 + 61da8da commit dcb9e2b

9 files changed

Lines changed: 291 additions & 65 deletions


src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 22 additions & 0 deletions
@@ -261,6 +261,22 @@
     torch_dtype="bfloat16",
 )
 
+qwen3_1_7b_config = transformers.Qwen3Config(
+    vocab_size=151936,
+    hidden_size=2048,
+    intermediate_size=6144,
+    num_hidden_layers=28,
+    num_attention_heads=16,
+    num_key_value_heads=8,
+    head_dim=128,
+    hidden_act="silu",
+    max_position_embeddings=40960,
+    rms_norm_eps=1.0e-6,
+    rope_theta=1000000.0,
+    tie_word_embeddings=True,
+    torch_dtype="bfloat16",
+)
+
 qwen3_4b_config = transformers.Qwen3Config(
     vocab_size=151936,
     hidden_size=2560,
@@ -853,16 +869,22 @@
     "qwen2.5-7b": qwen25_7b_config,
     "qwen2.5-14b": qwen25_14b_config,
     "qwen3-0.6b": qwen3_0_6b_config,
+    "qwen3-1.7b": qwen3_1_7b_config,
+    "qwen3-1.7b-base": qwen3_1_7b_config,
     "qwen3-4b": qwen3_4b_config,
+    "qwen3-4b-base": qwen3_4b_config,
     "qwen3-4b-thinking-2507": qwen3_4b_config,
     "qwen3-8b": qwen3_8b_config,
+    "qwen3-8b-base": qwen3_8b_config,
     "qwen3-14b": qwen3_14b_config,
+    "qwen3-14b-base": qwen3_14b_config,
     "qwen3-32b": qwen3_32b_config,
     "llama3.1-8b": llama31_8b_config,
     "llama3.1-8b-Instruct": llama31_8b_config,
     "llama3.1-70b": llama31_70b_config,
     "llama3.1-405b": llama31_405b_config,
     "qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config,
+    "qwen3-30b-a3b-base": qwen3_30b_a3b_thinking_2507_config,
     "qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config,
     "qwen3-480b-a35b": qwen3_coder_480b_a35b_config,
     "deepseek3-671b": deepseek3_671b_config,

src/maxtext/checkpoint_conversion/utils/param_mapping.py

Lines changed: 37 additions & 65 deletions
@@ -587,11 +587,11 @@ def scale_query_layer(input_tensor, target_shape):
   return mapping
 
 
-def QWEN_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False):
-  """Returns mapping from MaxText to HuggingFace Qwen weight paths.
+def QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False):
+  """Returns mapping from MaxText to HuggingFace Qwen3 weight paths.
 
   This function generates a dictionary that maps parameter names from a MaxText
-  Qwen checkpoint to their corresponding names in the Hugging Face format.
+  Qwen3 checkpoint to their corresponding names in the Hugging Face format.
   It handles both dense and Mixture-of-Experts (MoE) model variants.
 
   Args:
@@ -631,15 +631,6 @@ def QWEN_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False):
       "params-decoder-layers-self_attention-value-kernel": [
           f"model.layers.{i}.self_attn.v_proj.weight" for i in range(n_layers)
       ],
-      "params-decoder-layers-self_attention-query-bias": [
-          f"model.layers.{i}.self_attn.q_proj.bias" for i in range(n_layers)
-      ],
-      "params-decoder-layers-self_attention-key-bias": [
-          f"model.layers.{i}.self_attn.k_proj.bias" for i in range(n_layers)
-      ],
-      "params-decoder-layers-self_attention-value-bias": [
-          f"model.layers.{i}.self_attn.v_proj.bias" for i in range(n_layers)
-      ],
       "params-decoder-layers-self_attention-out-kernel": [
           f"model.layers.{i}.self_attn.o_proj.weight" for i in range(n_layers)
       ],
@@ -697,9 +688,6 @@ def QWEN_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False):
         f"params-decoder-layers_{i}-self_attention-key-kernel": f"model.layers.{i}.self_attn.k_proj.weight",
         f"params-decoder-layers_{i}-self_attention-value-kernel": f"model.layers.{i}.self_attn.v_proj.weight",
         f"params-decoder-layers_{i}-self_attention-out-kernel": f"model.layers.{i}.self_attn.o_proj.weight",
-        f"params-decoder-layers_{i}-self_attention-query-bias": f"model.layers.{i}.self_attn.q_proj.bias",
-        f"params-decoder-layers_{i}-self_attention-key-bias": f"model.layers.{i}.self_attn.k_proj.bias",
-        f"params-decoder-layers_{i}-self_attention-value-bias": f"model.layers.{i}.self_attn.v_proj.bias",
         f"params-decoder-layers_{i}-self_attention-query_norm-scale": f"model.layers.{i}.self_attn.q_norm.weight",
         f"params-decoder-layers_{i}-self_attention-key_norm-scale": f"model.layers.{i}.self_attn.k_norm.weight",
         f"params-decoder-layers_{i}-post_self_attention_layer_norm-scale": f"model.layers.{i}.post_attention_layernorm.weight",
@@ -733,8 +721,8 @@ def QWEN_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_layers=False):
   return mapping
 
 
-def QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN(config, maxtext_config, scan_layers=False, saving_to_hf=False):
-  """Creates parameter transformation functions for Qwen.
+def QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN(config, maxtext_config, scan_layers=False, saving_to_hf=False):
+  """Creates parameter transformation functions for Qwen3.
 
   This function provides a dictionary of transformation functions (hooks) for
   converting Qwen3 model parameters between MaxText and Hugging Face formats.
@@ -778,15 +766,6 @@ def reshape_kernel(input_tensor, target_shape):
     else:
       return input_tensor.T.reshape(target_shape)
 
-  def reshape_bias(input_tensor, target_shape=None):
-    """Reshapes biases between MaxText 2D (heads, dim) and HF 1D (hidden)."""
-    if saving_to_hf:
-      # MaxText [heads, head_dim] -> HF [hidden_dim] (flatten)
-      return input_tensor.reshape(target_shape)
-    else:
-      # HF [hidden_dim] -> MaxText [heads, head_dim]
-      return input_tensor.reshape(target_shape)
-
   mapping = {
       "params-token_embedder-embedding": pad_embedding_layer,
       "params-decoder-logits_dense-kernel": reshape_kernel,
@@ -801,11 +780,6 @@ def reshape_bias(input_tensor, target_shape=None):
       "mlp-wi_1-kernel",
       "mlp-wo-kernel",
   ]
-  bias_hooks = [
-      "self_attention-query-bias",
-      "self_attention-key-bias",
-      "self_attention-value-bias",
-  ]
   moe_kernel_hooks = [
       "moe_block-gate-kernel",
       "moe_block-wi_0-kernel",
@@ -819,17 +793,13 @@
   if scan_layers:
     for key in kernel_hooks:
       mapping[f"params-decoder-layers-{key}"] = reshape_kernel
-    for key in bias_hooks:
-      mapping[f"params-decoder-layers-{key}"] = reshape_bias
     if num_experts > 1:
       for key in moe_kernel_hooks:
         mapping[f"params-decoder-layers-{key}"] = reshape_kernel
   else:
     for i in range(n_layers):
       for key in kernel_hooks:
         mapping[f"params-decoder-layers_{i}-{key}"] = reshape_kernel
-      for key in bias_hooks:
-        mapping[f"params-decoder-layers_{i}-{key}"] = reshape_bias
       if num_experts > 1:
         for key in moe_kernel_hooks:
           mapping[f"params-decoder-layers_{i}-{key}"] = reshape_kernel
@@ -1406,7 +1376,7 @@ def QWEN3_OMNI_MOE_MAXTEXT_TO_HF_PARAM_MAPPING(config, maxtext_config, scan_laye
   # Text mapping with "thinker." prefix, reusing QWEN3-MOE mapping function
   num_experts_text = config["thinker_config"]["text_config"].get("num_experts", 0)
   n_layers_text = config["thinker_config"]["text_config"]["num_hidden_layers"]
-  text_mapping = QWEN_MAXTEXT_TO_HF_PARAM_MAPPING(
+  text_mapping = QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING(
       config={"num_hidden_layers": n_layers_text, "num_experts": num_experts_text},
       maxtext_config=maxtext_config,
       scan_layers=scan_layers,
@@ -1574,7 +1544,7 @@ def QWEN3_OMNI_MOE_MAXTEXT_TO_HF_PARAM_HOOK_FN(config, maxtext_config, scan_laye
   # Text hooks, reusing QWEN3-MOE hook function
   num_experts_text = config["thinker_config"]["text_config"].get("num_experts", 0)
   n_layers_text = config["thinker_config"]["text_config"]["num_hidden_layers"]
-  text_hooks = QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN(
+  text_hooks = QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN(
       config={"num_hidden_layers": n_layers_text, "num_experts": num_experts_text},
       maxtext_config=maxtext_config,
       scan_layers=scan_layers,
@@ -2362,23 +2332,24 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
     "gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
     "gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
     "gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen2.5-0.5b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen2.5-3b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen3-4b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen3-4b-thinking-2507": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen3-8b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen3-14b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen3-32b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-0.6b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-1.7b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-1.7b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-4b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-4b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-4b-thinking-2507": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-8b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-8b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-14b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-14b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-32b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
     "llama3.1-8b": LLAMA31_MAXTEXT_TO_HF_PARAM_MAPPING,
     "llama3.1-70b": LLAMA31_MAXTEXT_TO_HF_PARAM_MAPPING,
     "llama3.1-405b": LLAMA31_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen3-30b-a3b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen3-235b-a22b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
-    "qwen3-coder-480b-a35b": QWEN_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-30b-a3b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-30b-a3b-base": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-235b-a22b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
+    "qwen3-coder-480b-a35b": QWEN3_MAXTEXT_TO_HF_PARAM_MAPPING,
     "deepseek3-671b": DEEPSEEK_MAXTEXT_TO_HF_PARAM_MAPPING,
     "gpt-oss-20b": GPT_OSS_MAXTEXT_TO_HF_PARAM_MAPPING,
     "gpt-oss-120b": GPT_OSS_MAXTEXT_TO_HF_PARAM_MAPPING,
@@ -2399,23 +2370,24 @@ def pad_hf_embedding_layer(input_tensor, target_shape):
     "gemma3-4b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "gemma3-12b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "gemma3-27b": GEMMA3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen2.5-0.5b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen2.5-1.5b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen2.5-3b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen2.5-7b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen2.5-14b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen3-0.6b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen3-4b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen3-4b-thinking-2507": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen3-8b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen3-14b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen3-32b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-0.6b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-1.7b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-1.7b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-4b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-4b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-4b-thinking-2507": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-8b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-8b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-14b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-14b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-32b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "llama3.1-8b": LLAMA31_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "llama3.1-70b": LLAMA31_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "llama3.1-405b": LLAMA31_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen3-30b-a3b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen3-235b-a22b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
-    "qwen3-coder-480b-a35b": QWEN_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-30b-a3b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-30b-a3b-base": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-235b-a22b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
+    "qwen3-coder-480b-a35b": QWEN3_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "deepseek3-671b": DEEPSEEK_MAXTEXT_TO_HF_PARAM_HOOK_FN,
     "gpt-oss-20b": GPT_OSS_TO_HF_PARAM_HOOK_FN,
     "gpt-oss-120b": GPT_OSS_TO_HF_PARAM_HOOK_FN,
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for qwen3-1.7b-base
+
+base_emb_dim: 2048
+base_num_query_heads: 16
+base_num_kv_heads: 8
+base_mlp_dim: 6144
+base_num_decoder_layers: 28
+head_dim: 128
+mlp_activations: ["silu", "linear"] # "hidden_act": "silu" implies SwiGLU
+vocab_size: 151936
+
+decoder_block: "qwen3"
+
+normalization_layer_epsilon: 1.0e-6
+rope_max_timescale: 1000000
+
+use_qk_norm: True
+
+logits_via_embedding: True # from "tie_word_embeddings": true
+normalize_embedding_logits: False
+enable_dropout: False # deterministic for testing
+
+tokenizer_type: "huggingface"
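
This YAML mirrors the qwen3_1_7b_config added to hf_model_configs.py in this commit: base_emb_dim corresponds to hidden_size, base_mlp_dim to intermediate_size, base_num_decoder_layers to num_hidden_layers, normalization_layer_epsilon to rms_norm_eps, rope_max_timescale to rope_theta, and logits_via_embedding to tie_word_embeddings. A hypothetical consistency check (not in the commit; the local file name is an assumption) could look like this:

    import yaml
    import transformers

    hf = transformers.Qwen3Config(
        hidden_size=2048, intermediate_size=6144, num_hidden_layers=28,
        num_attention_heads=16, num_key_value_heads=8, head_dim=128,
        rms_norm_eps=1.0e-6, rope_theta=1000000.0, vocab_size=151936,
        tie_word_embeddings=True,
    )
    with open("qwen3-1.7b-base.yml") as f:  # hypothetical local copy of the YAML above
      mt = yaml.safe_load(f)

    assert mt["base_emb_dim"] == hf.hidden_size
    assert mt["base_mlp_dim"] == hf.intermediate_size
    assert mt["base_num_decoder_layers"] == hf.num_hidden_layers
    assert mt["base_num_query_heads"] == hf.num_attention_heads
    assert mt["base_num_kv_heads"] == hf.num_key_value_heads
    assert mt["head_dim"] == hf.head_dim
    assert mt["vocab_size"] == hf.vocab_size
    assert mt["normalization_layer_epsilon"] == hf.rms_norm_eps
    assert mt["rope_max_timescale"] == int(hf.rope_theta)
    assert mt["logits_via_embedding"] == hf.tie_word_embeddings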
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for qwen3-1.7b
+
+base_emb_dim: 2048
+base_num_query_heads: 16
+base_num_kv_heads: 8
+base_mlp_dim: 6144
+base_num_decoder_layers: 28
+head_dim: 128
+mlp_activations: ["silu", "linear"] # "hidden_act": "silu" implies SwiGLU
+vocab_size: 151936
+
+decoder_block: "qwen3"
+
+normalization_layer_epsilon: 1.0e-6
+rope_max_timescale: 1000000
+
+use_qk_norm: True
+
+logits_via_embedding: True # from "tie_word_embeddings": true
+normalize_embedding_logits: False
+enable_dropout: False # deterministic for testing
+
+tokenizer_type: "huggingface"
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+# Copyright 2023–2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# model config for qwen3-14b-base
+
+base_emb_dim: 5120
+base_num_query_heads: 40
+base_num_kv_heads: 8
+base_mlp_dim: 17408
+base_num_decoder_layers: 40
+head_dim: 128
+mlp_activations: ["silu", "linear"] # "hidden_act": "silu" implies SwiGLU
+vocab_size: 151936
+
+decoder_block: "qwen3"
+
+normalization_layer_epsilon: 1.0e-6
+rope_max_timescale: 1000000
+
+use_qk_norm: True
+
+logits_via_embedding: False # different from 0.6 and 4B variants, "tie_word_embeddings": false
+normalize_embedding_logits: False
+
+tokenizer_type: "huggingface"
+
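
Unlike the 0.6B, 1.7B, and 4B variants, this geometry unties the output head from the input embedding (logits_via_embedding: False), so both matrices count toward the total. A back-of-the-envelope count from the numbers above (illustrative arithmetic only, ignoring RMSNorm scales; not repository code):

    # Rough parameter count for the qwen3-14b-base geometry listed above.
    emb, mlp, layers, vocab = 5120, 17408, 40, 151936
    q_heads, kv_heads, head_dim = 40, 8, 128

    attn = emb * q_heads * head_dim         # q_proj
    attn += 2 * emb * kv_heads * head_dim   # k_proj + v_proj
    attn += q_heads * head_dim * emb        # o_proj
    ffn = 3 * emb * mlp                     # gate, up, down projections (SwiGLU)
    embeddings = 2 * vocab * emb            # untied: input embedding + output head

    total = layers * (attn + ffn) + embeddings
    print(f"{total / 1e9:.1f}B")            # ~14.8B, matching the Qwen3-14B size class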
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Model config for Qwen3-30B-A3B-base
+
+# Core Architectural Parameters
+decoder_block: "qwen3_moe"
+base_emb_dim: 2048
+base_mlp_dim: 768
+base_num_query_heads: 32
+base_num_kv_heads: 4
+base_num_decoder_layers: 48
+head_dim: 128
+mlp_activations: ["silu", "linear"]
+vocab_size: 151936
+normalization_layer_epsilon: 1.0e-6
+use_qk_norm: True
+
+# MoE Specific Parameters
+num_experts: 128
+num_experts_per_tok: 8
+base_moe_mlp_dim: 768
+norm_topk_prob: true
+
+# RoPE Settings
+rope_max_timescale: 10_000_000
+
+# General Model Settings
+enable_dropout: False
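
This MoE geometry is what makes the model "30B total, roughly 3B active": 48 layers, each with 128 small SwiGLU experts of width 768, of which 8 are routed per token. The arithmetic below is an illustrative sketch, not repository code; attention and embedding parameters, which every token uses, add roughly another 1.5B on top of the expert totals shown, which is what brings the active count to about 3B.

    # Rough total-vs-active expert parameter count for the geometry above.
    emb, moe_mlp, experts, topk, layers = 2048, 768, 128, 8, 48

    per_expert = 3 * emb * moe_mlp           # gate/up/down projections of one expert
    per_layer_total = experts * per_expert   # all experts stored in the checkpoint
    per_layer_active = topk * per_expert     # experts actually used for a given token

    print(f"per layer: {per_layer_total / 1e6:.0f}M total, {per_layer_active / 1e6:.1f}M active")
    print(f"experts across {layers} layers: {layers * per_layer_total / 1e9:.1f}B total, "
          f"{layers * per_layer_active / 1e9:.2f}B active")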
