|
26 | 26 | AutoModelForQuestionAnswering, |
27 | 27 | AutoTokenizer, |
28 | 28 | BertConfig, |
| 29 | + DeepseekV3Config, |
29 | 30 | GptOssConfig, |
30 | 31 | LlamaConfig, |
31 | 32 | PreTrainedModel, |
@@ -120,6 +121,44 @@ def create_tiny_qwen3_moe_dir( |
120 | 121 | return qwen3_moe_dir |
121 | 122 |
|
122 | 123 |
|
| 124 | +##### DeepSeek V3 ##### |
def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel:
    """Build a tiny DeepSeek V3 causal-LM from a hand-written config.

    The global seed is set to ``SEED`` before anything else so that the
    model is constructed under a fixed seed.

    Args:
        **config_kwargs: Overrides merged on top of the tiny defaults below
            and forwarded to ``DeepseekV3Config``.

    Returns:
        The model produced by ``AutoModelForCausalLM.from_config``.
    """
    set_seed(SEED)

    defaults = {
        "dtype": torch.bfloat16,
        "vocab_size": 128,
        "hidden_size": 128,
        "intermediate_size": 256,
        "moe_intermediate_size": 64,
        "num_hidden_layers": 2,
        "num_attention_heads": 2,
        "num_key_value_heads": 2,
        "n_routed_experts": 4,
        "num_experts_per_tok": 2,
        "n_shared_experts": 1,
        "first_k_dense_replace": 0,
        "kv_lora_rank": 16,
        "q_lora_rank": 32,
        "qk_rope_head_dim": 16,
        "qk_nope_head_dim": 16,
        "v_head_dim": 16,
        "max_position_embeddings": 128,
        # vLLM only allocates ``gate.e_score_correction_bias`` when this is
        # set, while HF saves that tensor unconditionally.
        "topk_method": "noaux_tc",
    }
    # Caller-supplied values win over the defaults; new keys are appended.
    settings = {**defaults, **config_kwargs}

    config = DeepseekV3Config(**settings)
    # Re-apply explicitly: some transformers versions drop unknown kwargs
    # from the config dataclass, which would lose ``topk_method``.
    config.topk_method = settings["topk_method"]

    model = AutoModelForCausalLM.from_config(config)
    return model
| 154 | + |
| 155 | + |
def create_tiny_deepseek_v3_dir(tmp_path: Path | str, **config_kwargs) -> Path:
    """Save a tiny DeepSeek V3 model under ``tmp_path`` and return its directory.

    Args:
        tmp_path: Parent directory; the model is written to the
            ``tiny_deepseek_v3`` sub-directory inside it.
        **config_kwargs: Forwarded unchanged to ``get_tiny_deepseek_v3``.

    Returns:
        Path of the directory passed to ``save_pretrained``.
    """
    output_dir = Path(tmp_path) / "tiny_deepseek_v3"
    tiny_model = get_tiny_deepseek_v3(**config_kwargs)
    tiny_model.save_pretrained(output_dir)
    return output_dir
| 160 | + |
| 161 | + |
123 | 162 | ##### GPT-OSS ##### |
124 | 163 | def get_tiny_gpt_oss(**config_kwargs) -> PreTrainedModel: |
125 | 164 | set_seed(SEED) |
|
0 commit comments