Skip to content

Commit 659d5b1

Browse files
committed
fix: set rope_max_timescale to 1M for qwen3-30b-a3b-base and update HF configuration/shape mappings
1 parent 57a6b30 commit 659d5b1

3 files changed

Lines changed: 37 additions & 2 deletions

File tree

src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 35 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -718,6 +718,40 @@
718718
vocab_size=151936,
719719
)
720720

721+
qwen3_30b_a3b_base_config = transformers.Qwen3MoeConfig(
722+
architectures=["Qwen3MoeForCausalLM"],
723+
attention_bias=False,
724+
attention_dropout=0.0,
725+
bos_token_id=151643,
726+
decoder_sparse_step=1,
727+
eos_token_id=151645,
728+
head_dim=128,
729+
hidden_act="silu",
730+
hidden_size=2048,
731+
initializer_range=0.02,
732+
intermediate_size=6144,
733+
max_position_embeddings=262144,
734+
max_window_layers=48,
735+
model_type="qwen3_moe",
736+
moe_intermediate_size=768,
737+
norm_topk_prob=True,
738+
num_attention_heads=32,
739+
num_experts=128,
740+
num_experts_per_tok=8,
741+
num_hidden_layers=48,
742+
num_key_value_heads=4,
743+
output_router_logits=False,
744+
rms_norm_eps=1e-06,
745+
rope_scaling=None,
746+
rope_theta=1000000,
747+
router_aux_loss_coef=0.001,
748+
sliding_window=None,
749+
tie_word_embeddings=False,
750+
torch_dtype="bfloat16",
751+
use_cache=True,
752+
vocab_size=151936,
753+
)
754+
721755
qwen3_235b_a22b_thinking_2507_config = transformers.Qwen3MoeConfig(
722756
architectures=["Qwen3MoeForCausalLM"],
723757
attention_bias=False,
@@ -1579,7 +1613,7 @@ def __init__(self, **kwargs):
15791613
"llama3.1-70b": llama31_70b_config,
15801614
"llama3.1-405b": llama31_405b_config,
15811615
"qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config,
1582-
"qwen3-30b-a3b-base": qwen3_30b_a3b_thinking_2507_config,
1616+
"qwen3-30b-a3b-base": qwen3_30b_a3b_base_config,
15831617
"qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config,
15841618
"qwen3-480b-a35b": qwen3_coder_480b_a35b_config,
15851619
"deepseek2-16b": deepseek2_16b_config,

src/maxtext/checkpoint_conversion/utils/hf_shape.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1131,6 +1131,7 @@ def MIXTRAL_HF_WEIGHTS_TO_SHAPE(config):
11311131
"llama3.1-70b": LLAMA31_HF_WEIGHTS_TO_SHAPE,
11321132
"llama3.1-405b": LLAMA31_HF_WEIGHTS_TO_SHAPE,
11331133
"qwen3-30b-a3b": QWEN_HF_WEIGHTS_TO_SHAPE,
1134+
"qwen3-30b-a3b-base": QWEN_HF_WEIGHTS_TO_SHAPE,
11341135
"qwen3-235b-a22b": QWEN_HF_WEIGHTS_TO_SHAPE,
11351136
"qwen3-480b-a35b": QWEN_HF_WEIGHTS_TO_SHAPE,
11361137
"deepseek2-16b": DEEPSEEK_HF_WEIGHTS_TO_SHAPE,

src/maxtext/configs/models/qwen3-30b-a3b-base.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ base_moe_mlp_dim: 768
3434
norm_topk_prob: true
3535

3636
# RoPE Settings
37-
rope_max_timescale: 10_000_000
37+
rope_max_timescale: 1_000_000
3838

3939
# General Model Settings
4040
enable_dropout: false

0 commit comments

Comments
 (0)