Skip to content

Commit 831ade9

Browse files
committed
Fix HF model configuration mapping for qwen3-30b-a3b-base
1 parent a1c0f8b commit 831ade9

1 file changed

Lines changed: 35 additions & 1 deletion

File tree

src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,40 @@
718718
vocab_size=151936,
719719
)
720720

721+
# Dedicated Hugging Face config for the "qwen3-30b-a3b-base" entry of the
# model-name mapping, so the base model does not share the thinking-2507
# variant's config object.
# NOTE(review): confirm max_position_embeddings and eos_token_id against the
# base checkpoint's published config.json -- TODO verify.
qwen3_30b_a3b_base_config = transformers.Qwen3MoeConfig(
    architectures=["Qwen3MoeForCausalLM"],
    attention_bias=False,
    attention_dropout=0.0,
    bos_token_id=151643,
    decoder_sparse_step=1,
    eos_token_id=151645,
    head_dim=128,
    hidden_act="silu",
    hidden_size=2048,
    initializer_range=0.02,
    intermediate_size=6144,
    max_position_embeddings=262144,
    max_window_layers=48,
    model_type="qwen3_moe",
    # Per-expert feed-forward width (distinct from intermediate_size above).
    moe_intermediate_size=768,
    norm_topk_prob=True,
    # 32 query heads with 4 key/value heads.
    num_attention_heads=32,
    # 128 experts in total, 8 selected per token.
    num_experts=128,
    num_experts_per_tok=8,
    num_hidden_layers=48,
    num_key_value_heads=4,
    output_router_logits=False,
    rms_norm_eps=1e-06,
    # No RoPE scaling is configured for this entry.
    rope_scaling=None,
    rope_theta=1000000,
    router_aux_loss_coef=0.001,
    sliding_window=None,
    tie_word_embeddings=False,
    torch_dtype="bfloat16",
    use_cache=True,
    vocab_size=151936,
)
754+
721755
qwen3_235b_a22b_thinking_2507_config = transformers.Qwen3MoeConfig(
722756
architectures=["Qwen3MoeForCausalLM"],
723757
attention_bias=False,
@@ -1579,7 +1613,7 @@ def __init__(self, **kwargs):
15791613
"llama3.1-70b": llama31_70b_config,
15801614
"llama3.1-405b": llama31_405b_config,
15811615
"qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config,
1582-
"qwen3-30b-a3b-base": qwen3_30b_a3b_thinking_2507_config,
1616+
"qwen3-30b-a3b-base": qwen3_30b_a3b_base_config,
15831617
"qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config,
15841618
"qwen3-480b-a35b": qwen3_coder_480b_a35b_config,
15851619
"deepseek2-16b": deepseek2_16b_config,

0 commit comments

Comments
 (0)