Skip to content

Commit a264ca0

Browse files
Merge pull request #4039 from AI-Hypercomputer:yujiedeng/repo-fix
PiperOrigin-RevId: 933555873
2 parents f0a04cd + b617934 commit a264ca0

2 files changed

Lines changed: 40 additions & 2 deletions

File tree

src/maxtext/checkpoint_conversion/utils/hf_model_configs.py

Lines changed: 39 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -718,6 +718,44 @@
718718
vocab_size=151936,
719719
)
720720

721+
# Matches Qwen/Qwen3-30B-A3B-Base
722+
qwen3_30b_a3b_base_config = transformers.Qwen3MoeConfig(
723+
architectures=["Qwen3MoeForCausalLM"],
724+
attention_bias=False,
725+
attention_dropout=0.0,
726+
bos_token_id=151643,
727+
decoder_sparse_step=1,
728+
eos_token_id=151643,
729+
head_dim=128,
730+
hidden_act="silu",
731+
hidden_size=2048,
732+
initializer_range=0.02,
733+
intermediate_size=6144,
734+
max_position_embeddings=32768,
735+
max_window_layers=48,
736+
mlp_only_layers=[],
737+
model_type="qwen3_moe",
738+
moe_intermediate_size=768,
739+
norm_topk_prob=True,
740+
num_attention_heads=32,
741+
num_experts=128,
742+
num_experts_per_tok=8,
743+
num_hidden_layers=48,
744+
num_key_value_heads=4,
745+
output_router_logits=False,
746+
rms_norm_eps=1e-06,
747+
rope_scaling=None,
748+
rope_theta=1000000,
749+
router_aux_loss_coef=0.001,
750+
sliding_window=None,
751+
tie_word_embeddings=False,
752+
torch_dtype="bfloat16",
753+
transformers_version="4.51.0",
754+
use_cache=True,
755+
use_sliding_window=False,
756+
vocab_size=151936,
757+
)
758+
721759
qwen3_235b_a22b_thinking_2507_config = transformers.Qwen3MoeConfig(
722760
architectures=["Qwen3MoeForCausalLM"],
723761
attention_bias=False,
@@ -1579,7 +1617,7 @@ def __init__(self, **kwargs):
15791617
"llama3.1-70b": llama31_70b_config,
15801618
"llama3.1-405b": llama31_405b_config,
15811619
"qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config,
1582-
"qwen3-30b-a3b-base": qwen3_30b_a3b_thinking_2507_config,
1620+
"qwen3-30b-a3b-base": qwen3_30b_a3b_base_config,
15831621
"qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config,
15841622
"qwen3-480b-a35b": qwen3_coder_480b_a35b_config,
15851623
"deepseek2-16b": deepseek2_16b_config,

src/maxtext/configs/models/qwen3-30b-a3b-base.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ base_moe_mlp_dim: 768
3434
norm_topk_prob: true
3535

3636
# RoPE Settings
37-
rope_max_timescale: 10_000_000
37+
rope_max_timescale: 1_000_000
3838

3939
# General Model Settings
4040
enable_dropout: false

0 commit comments

Comments
 (0)