|
718 | 718 | vocab_size=151936, |
719 | 719 | ) |
720 | 720 |
|
# Qwen3 MoE configuration used for the "qwen3-30b-a3b-base" entry.
# Keyword arguments are grouped by concern; every value is a plain literal.
# NOTE(review): confirm max_position_embeddings and eos_token_id against the
# published base checkpoint's config.json -- not verifiable from this file.
qwen3_30b_a3b_base_config = transformers.Qwen3MoeConfig(
    # --- identity / serialization ---
    architectures=["Qwen3MoeForCausalLM"],
    model_type="qwen3_moe",
    torch_dtype="bfloat16",
    # --- vocabulary and special tokens ---
    vocab_size=151_936,
    bos_token_id=151_643,
    eos_token_id=151_645,
    tie_word_embeddings=False,
    # --- decoder stack ---
    num_hidden_layers=48,
    hidden_size=2048,
    intermediate_size=6144,
    hidden_act="silu",
    rms_norm_eps=1e-6,
    initializer_range=0.02,
    # --- attention ---
    num_attention_heads=32,
    num_key_value_heads=4,
    head_dim=128,
    attention_bias=False,
    attention_dropout=0.0,
    sliding_window=None,
    max_window_layers=48,
    # --- positions / rotary embedding ---
    max_position_embeddings=262_144,
    rope_theta=1_000_000,
    rope_scaling=None,
    # --- mixture-of-experts ---
    num_experts=128,
    num_experts_per_tok=8,
    moe_intermediate_size=768,
    decoder_sparse_step=1,
    norm_topk_prob=True,
    router_aux_loss_coef=0.001,
    output_router_logits=False,
    # --- runtime ---
    use_cache=True,
)
| 754 | + |
721 | 755 | qwen3_235b_a22b_thinking_2507_config = transformers.Qwen3MoeConfig( |
722 | 756 | architectures=["Qwen3MoeForCausalLM"], |
723 | 757 | attention_bias=False, |
@@ -1579,7 +1613,7 @@ def __init__(self, **kwargs): |
1579 | 1613 | "llama3.1-70b": llama31_70b_config, |
1580 | 1614 | "llama3.1-405b": llama31_405b_config, |
1581 | 1615 | "qwen3-30b-a3b": qwen3_30b_a3b_thinking_2507_config, |
1582 | | - "qwen3-30b-a3b-base": qwen3_30b_a3b_thinking_2507_config, |
| 1616 | + "qwen3-30b-a3b-base": qwen3_30b_a3b_base_config, |
1583 | 1617 | "qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config, |
1584 | 1618 | "qwen3-480b-a35b": qwen3_coder_480b_a35b_config, |
1585 | 1619 | "deepseek2-16b": deepseek2_16b_config, |
|
0 commit comments