88#include < string>
99
1010static std::string dsv4_kv (const char * suffix) {
11- return std::string (" deepseek4" ) + suffix;
11+ return std::string (" deepseek4. " ) + suffix;
1212}
1313
1414static float dsv4_rope_attn_factor (float freq_scale, float ext_factor) {
@@ -19,7 +19,7 @@ static float dsv4_rope_attn_factor(float freq_scale, float ext_factor) {
1919 return 1 .0f / (1 .0f + 0 .1f *logf (1 .0f /freq_scale));
2020}
2121
22- void llama_model_deepseek_v4_flash ::load_arch_hparams (llama_model_loader & ml) {
22+ void llama_model_deepseek4 ::load_arch_hparams (llama_model_loader & ml) {
2323 ml.get_key (LLM_KV_ATTENTION_LAYERNORM_RMS_EPS , hparams.f_norm_rms_eps );
2424 ml.get_key (LLM_KV_ATTENTION_Q_LORA_RANK , hparams.n_lora_q );
2525 ml.get_key (LLM_KV_ATTENTION_SLIDING_WINDOW , hparams.n_swa );
@@ -46,7 +46,7 @@ void llama_model_deepseek_v4_flash::load_arch_hparams(llama_model_loader & ml) {
4646 uint32_t n_compress_ratios = 0 ;
4747 ml.get_arr_n (dsv4_kv (" attention.compress_ratios" ), n_compress_ratios);
4848 if (n_compress_ratios < hparams.n_layer ) {
49- throw std::runtime_error (" DeepSeek-V4 Flash compress_ratios is shorter than block_count" );
49+ throw std::runtime_error (" DeepSeek-V4 compress_ratios is shorter than block_count" );
5050 }
5151 ml.get_arr (dsv4_kv (" attention.compress_ratios" ), hparams.dsv4_compress_ratios );
5252
@@ -55,10 +55,10 @@ void llama_model_deepseek_v4_flash::load_arch_hparams(llama_model_loader & ml) {
5555 ml.get_key (dsv4_kv (" moe.score_func" ), score_func);
5656 ml.get_key (dsv4_kv (" moe.topk_method" ), topk_method);
5757 if (score_func != " sqrtsoftplus" ) {
58- throw std::runtime_error (" DeepSeek-V4 Flash loader currently expects sqrtsoftplus MoE scoring" );
58+ throw std::runtime_error (" DeepSeek-V4 loader currently expects sqrtsoftplus MoE scoring" );
5959 }
6060 if (topk_method != " noaux_tc" ) {
61- throw std::runtime_error (" DeepSeek-V4 Flash loader currently expects noaux_tc MoE top-k" );
61+ throw std::runtime_error (" DeepSeek-V4 loader currently expects noaux_tc MoE top-k" );
6262 }
6363
6464 hparams.swa_type = LLAMA_SWA_TYPE_STANDARD ;
@@ -70,7 +70,7 @@ void llama_model_deepseek_v4_flash::load_arch_hparams(llama_model_loader & ml) {
7070 }
7171}
7272
73- void llama_model_deepseek_v4_flash ::load_arch_tensors (llama_model_loader &) {
73+ void llama_model_deepseek4 ::load_arch_tensors (llama_model_loader &) {
7474 LLAMA_LOAD_LOCALS ;
7575
7676 const int64_t q_lora_rank = hparams.n_lora_q ;
@@ -133,7 +133,7 @@ void llama_model_deepseek_v4_flash::load_arch_tensors(llama_model_loader &) {
133133 layer.indexer_comp_ape = create_tensor (tn (LLM_TENSOR_INDEXER_COMPRESSOR_APE , nullptr , i), {2 * n_embd_indexer, ratio}, 0 );
134134 layer.indexer_comp_norm = create_tensor (tn (LLM_TENSOR_INDEXER_COMPRESSOR_NORM , " weight" , i), {n_embd_indexer}, 0 );
135135 } else if (ratio != 128 ) {
136- throw std::runtime_error (" DeepSeek-V4 Flash loader only supports compression ratios 0, 4, and 128" );
136+ throw std::runtime_error (" DeepSeek-V4 loader only supports compression ratios 0, 4, and 128" );
137137 }
138138 }
139139
@@ -155,7 +155,7 @@ void llama_model_deepseek_v4_flash::load_arch_tensors(llama_model_loader &) {
155155 }
156156}
157157
158- std::unique_ptr<llm_graph_context> llama_model_deepseek_v4_flash ::build_arch_graph (const llm_graph_params & params) const {
158+ std::unique_ptr<llm_graph_context> llama_model_deepseek4 ::build_arch_graph (const llm_graph_params & params) const {
159159 return std::make_unique<graph>(*this , params);
160160}
161161
@@ -207,7 +207,7 @@ static ggml_tensor * dsv4_hc_affine(
207207 return x;
208208}
209209
210- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_hc_weighted_sum (
210+ ggml_tensor * llama_model_deepseek4 ::graph::build_hc_weighted_sum (
211211 ggml_tensor * x,
212212 ggml_tensor * weights) const {
213213 const int64_t hc = hparams.dsv4_hc_mult ;
@@ -225,7 +225,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_hc_weighted_sum(
225225 return acc;
226226}
227227
228- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_hc_sinkhorn (
228+ ggml_tensor * llama_model_deepseek4 ::graph::build_hc_sinkhorn (
229229 ggml_tensor * comb,
230230 int il) const {
231231 GGML_UNUSED (il);
@@ -262,7 +262,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_hc_sinkhorn(
262262 return comb;
263263}
264264
265- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_hc_pre (
265+ ggml_tensor * llama_model_deepseek4 ::graph::build_hc_pre (
266266 ggml_tensor * x,
267267 ggml_tensor * hc_fn,
268268 ggml_tensor * hc_scale,
@@ -314,7 +314,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_hc_pre(
314314 return build_hc_weighted_sum (x, pre );
315315}
316316
317- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_hc_post (
317+ ggml_tensor * llama_model_deepseek4 ::graph::build_hc_post (
318318 ggml_tensor * x,
319319 ggml_tensor * residual,
320320 ggml_tensor * post ,
@@ -343,7 +343,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_hc_post(
343343 return out;
344344}
345345
346- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_hc_head (
346+ ggml_tensor * llama_model_deepseek4 ::graph::build_hc_head (
347347 ggml_tensor * x,
348348 ggml_tensor * hc_fn,
349349 ggml_tensor * hc_scale,
@@ -367,7 +367,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_hc_head(
367367 return build_hc_weighted_sum (x, pre );
368368}
369369
370- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_hca_compressed_kv_from_state (
370+ ggml_tensor * llama_model_deepseek4 ::graph::build_hca_compressed_kv_from_state (
371371 ggml_tensor * kv_state,
372372 ggml_tensor * score_state,
373373 ggml_tensor * state_read_idxs,
@@ -425,7 +425,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_hca_compressed_kv_from
425425 return comp;
426426}
427427
428- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_overlap_compressed_kv_from_state (
428+ ggml_tensor * llama_model_deepseek4 ::graph::build_overlap_compressed_kv_from_state (
429429 ggml_tensor * kv_state,
430430 ggml_tensor * score_state,
431431 ggml_tensor * state_read_idxs,
@@ -507,7 +507,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_overlap_compressed_kv_
507507 return comp;
508508}
509509
510- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_lid_top_k (
510+ ggml_tensor * llama_model_deepseek4 ::graph::build_lid_top_k (
511511 const llama_model & model,
512512 llm_graph_input_dsv4 * inp_dsv4,
513513 ggml_tensor * qr,
@@ -597,7 +597,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_lid_top_k(
597597 return top_k;
598598}
599599
600- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_top_k_mask (
600+ ggml_tensor * llama_model_deepseek4 ::graph::build_top_k_mask (
601601 ggml_tensor * kq_mask,
602602 ggml_tensor * top_k,
603603 const char * name,
@@ -626,7 +626,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_top_k_mask(
626626 return kq_mask_top_k;
627627}
628628
629- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_csa_lid_attention (
629+ ggml_tensor * llama_model_deepseek4 ::graph::build_csa_lid_attention (
630630 const llama_model & model,
631631 llm_graph_input_dsv4 * inp_dsv4,
632632 llm_graph_input_attn_kv_iswa * inp_attn,
@@ -689,7 +689,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_csa_lid_attention(
689689 return out;
690690}
691691
692- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_hca_attention (
692+ ggml_tensor * llama_model_deepseek4 ::graph::build_hca_attention (
693693 llm_graph_input_dsv4 * inp_dsv4,
694694 llm_graph_input_attn_kv_iswa * inp_attn,
695695 ggml_tensor * q,
@@ -746,7 +746,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_hca_attention(
746746 return out;
747747}
748748
749- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_raw_attention (
749+ ggml_tensor * llama_model_deepseek4 ::graph::build_raw_attention (
750750 llm_graph_input_attn_kv_iswa * inp_attn,
751751 ggml_tensor * q,
752752 ggml_tensor * kv,
@@ -782,7 +782,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_raw_attention(
782782 return out;
783783}
784784
785- ggml_tensor * llama_model_deepseek_v4_flash ::graph::build_attention (
785+ ggml_tensor * llama_model_deepseek4 ::graph::build_attention (
786786 const llama_model & model,
787787 llm_graph_input_dsv4 * inp_dsv4,
788788 ggml_tensor * cur,
@@ -1080,7 +1080,7 @@ ggml_tensor * llama_model_deepseek_v4_flash::graph::build_attention(
10801080 return out;
10811081}
10821082
1083- llama_model_deepseek_v4_flash ::graph::graph (const llama_model & model, const llm_graph_params & params) :
1083+ llama_model_deepseek4 ::graph::graph (const llama_model & model, const llm_graph_params & params) :
10841084 llm_graph_context(params) {
10851085 ggml_tensor * cur;
10861086
0 commit comments