Skip to content

Commit 07c3713

Browse files
committed
add basic setup
1 parent 65861e7 commit 07c3713

11 files changed

Lines changed: 760 additions & 24 deletions

src/llama-arch.cpp

Lines changed: 46 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
7676
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
7777
{ LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" },
7878
{ LLM_ARCH_DEEPSEEK32, "deepseek32" },
79+
{ LLM_ARCH_DEEPSEEK_V4_FLASH, "deepseek-v4-flash" },
7980
{ LLM_ARCH_CHATGLM, "chatglm" },
8081
{ LLM_ARCH_GLM4, "glm4" },
8182
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
@@ -427,6 +428,23 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
427428
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
428429
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
429430
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
431+
{ LLM_TENSOR_ATTN_KV, "blk.%d.attn_kv" },
432+
{ LLM_TENSOR_ATTN_KV_NORM, "blk.%d.attn_kv_norm" },
433+
{ LLM_TENSOR_ATTN_OUT_A, "blk.%d.attn_wo_a" },
434+
{ LLM_TENSOR_ATTN_OUT_B, "blk.%d.attn_wo_b" },
435+
{ LLM_TENSOR_HC_HEAD_FN, "output_hc.fn" },
436+
{ LLM_TENSOR_HC_HEAD_BASE, "output_hc.base" },
437+
{ LLM_TENSOR_HC_HEAD_SCALE, "output_hc.scale" },
438+
{ LLM_TENSOR_HC_ATTN_FN, "blk.%d.hc_attn.fn" },
439+
{ LLM_TENSOR_HC_ATTN_BASE, "blk.%d.hc_attn.base" },
440+
{ LLM_TENSOR_HC_ATTN_SCALE, "blk.%d.hc_attn.scale" },
441+
{ LLM_TENSOR_HC_FFN_FN, "blk.%d.hc_ffn.fn" },
442+
{ LLM_TENSOR_HC_FFN_BASE, "blk.%d.hc_ffn.base" },
443+
{ LLM_TENSOR_HC_FFN_SCALE, "blk.%d.hc_ffn.scale" },
444+
{ LLM_TENSOR_ATTN_COMPRESSOR_WKV, "blk.%d.attn_comp.wkv" },
445+
{ LLM_TENSOR_ATTN_COMPRESSOR_WGATE, "blk.%d.attn_comp.wgate" },
446+
{ LLM_TENSOR_ATTN_COMPRESSOR_APE, "blk.%d.attn_comp.ape" },
447+
{ LLM_TENSOR_ATTN_COMPRESSOR_NORM, "blk.%d.attn_comp.norm" },
430448
{ LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" },
431449
{ LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" },
432450
{ LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" },
@@ -551,6 +569,11 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
551569
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
552570
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
553571
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
572+
{ LLM_TENSOR_INDEXER_COMPRESSOR_WKV, "blk.%d.indexer_comp.wkv" },
573+
{ LLM_TENSOR_INDEXER_COMPRESSOR_WGATE, "blk.%d.indexer_comp.wgate" },
574+
{ LLM_TENSOR_INDEXER_COMPRESSOR_APE, "blk.%d.indexer_comp.ape" },
575+
{ LLM_TENSOR_INDEXER_COMPRESSOR_NORM, "blk.%d.indexer_comp.norm" },
576+
{ LLM_TENSOR_FFN_GATE_TID2EID, "blk.%d.ffn_gate_tid2eid" },
554577
};
555578

556579
// declare information about the model weight tensors:
@@ -597,6 +620,23 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
597620
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
598621
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
599622
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
623+
{LLM_TENSOR_ATTN_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
624+
{LLM_TENSOR_ATTN_KV_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
625+
{LLM_TENSOR_ATTN_OUT_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
626+
{LLM_TENSOR_ATTN_OUT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
627+
{LLM_TENSOR_HC_HEAD_FN, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
628+
{LLM_TENSOR_HC_HEAD_BASE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}},
629+
{LLM_TENSOR_HC_HEAD_SCALE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
630+
{LLM_TENSOR_HC_ATTN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
631+
{LLM_TENSOR_HC_ATTN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
632+
{LLM_TENSOR_HC_ATTN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
633+
{LLM_TENSOR_HC_FFN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
634+
{LLM_TENSOR_HC_FFN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
635+
{LLM_TENSOR_HC_FFN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
636+
{LLM_TENSOR_ATTN_COMPRESSOR_WKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
637+
{LLM_TENSOR_ATTN_COMPRESSOR_WGATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
638+
{LLM_TENSOR_ATTN_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
639+
{LLM_TENSOR_ATTN_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
600640
{LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
601641
{LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
602642
{LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
@@ -760,6 +800,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
760800
{LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
761801
{LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
762802
{LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
803+
{LLM_TENSOR_INDEXER_COMPRESSOR_WKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
804+
{LLM_TENSOR_INDEXER_COMPRESSOR_WGATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
805+
{LLM_TENSOR_INDEXER_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
806+
{LLM_TENSOR_INDEXER_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
807+
{LLM_TENSOR_FFN_GATE_TID2EID, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
763808
// NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
764809
// last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
765810
// the model loader doesn't fault on the block index.
@@ -907,6 +952,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
907952
case LLM_ARCH_OLMOE:
908953
case LLM_ARCH_DEEPSEEK2:
909954
case LLM_ARCH_DEEPSEEK32:
955+
case LLM_ARCH_DEEPSEEK_V4_FLASH:
910956
case LLM_ARCH_GLM_DSA:
911957
case LLM_ARCH_BITNET:
912958
case LLM_ARCH_T5:

src/llama-arch.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ enum llm_arch {
8080
LLM_ARCH_DEEPSEEK2,
8181
LLM_ARCH_DEEPSEEK2OCR,
8282
LLM_ARCH_DEEPSEEK32,
83+
LLM_ARCH_DEEPSEEK_V4_FLASH,
8384
LLM_ARCH_CHATGLM,
8485
LLM_ARCH_GLM4,
8586
LLM_ARCH_GLM4_MOE,
@@ -487,10 +488,27 @@ enum llm_tensor {
487488
LLM_TENSOR_ATTN_Q_B,
488489
LLM_TENSOR_ATTN_KV_A_MQA,
489490
LLM_TENSOR_ATTN_KV_B,
491+
LLM_TENSOR_ATTN_KV,
492+
LLM_TENSOR_ATTN_KV_NORM,
493+
LLM_TENSOR_ATTN_OUT_A,
494+
LLM_TENSOR_ATTN_OUT_B,
490495
LLM_TENSOR_ATTN_K_B,
491496
LLM_TENSOR_ATTN_V_B,
492497
LLM_TENSOR_ATTN_Q_A_NORM,
493498
LLM_TENSOR_ATTN_KV_A_NORM,
499+
LLM_TENSOR_HC_HEAD_FN,
500+
LLM_TENSOR_HC_HEAD_BASE,
501+
LLM_TENSOR_HC_HEAD_SCALE,
502+
LLM_TENSOR_HC_ATTN_FN,
503+
LLM_TENSOR_HC_ATTN_BASE,
504+
LLM_TENSOR_HC_ATTN_SCALE,
505+
LLM_TENSOR_HC_FFN_FN,
506+
LLM_TENSOR_HC_FFN_BASE,
507+
LLM_TENSOR_HC_FFN_SCALE,
508+
LLM_TENSOR_ATTN_COMPRESSOR_WKV,
509+
LLM_TENSOR_ATTN_COMPRESSOR_WGATE,
510+
LLM_TENSOR_ATTN_COMPRESSOR_APE,
511+
LLM_TENSOR_ATTN_COMPRESSOR_NORM,
494512
LLM_TENSOR_ATTN_SUB_NORM,
495513
LLM_TENSOR_FFN_SUB_NORM,
496514
LLM_TENSOR_DEC_ATTN_NORM,
@@ -552,6 +570,11 @@ enum llm_tensor {
552570
LLM_TENSOR_INDEXER_PROJ,
553571
LLM_TENSOR_INDEXER_ATTN_K,
554572
LLM_TENSOR_INDEXER_ATTN_Q_B,
573+
LLM_TENSOR_INDEXER_COMPRESSOR_WKV,
574+
LLM_TENSOR_INDEXER_COMPRESSOR_WGATE,
575+
LLM_TENSOR_INDEXER_COMPRESSOR_APE,
576+
LLM_TENSOR_INDEXER_COMPRESSOR_NORM,
577+
LLM_TENSOR_FFN_GATE_TID2EID,
555578
LLM_TENSOR_NEXTN_EH_PROJ,
556579
LLM_TENSOR_NEXTN_EMBED_TOKENS,
557580
LLM_TENSOR_NEXTN_ENORM,

src/llama-context.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2205,7 +2205,11 @@ void llama_context::output_reorder() {
22052205
//
22062206

22072207
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
2208-
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
2208+
if (model.arch == LLM_ARCH_QWEN3NEXT ||
2209+
model.arch == LLM_ARCH_KIMI_LINEAR ||
2210+
model.arch == LLM_ARCH_QWEN35 ||
2211+
model.arch == LLM_ARCH_QWEN35MOE ||
2212+
model.arch == LLM_ARCH_DEEPSEEK_V4_FLASH) {
22092213
return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
22102214
}
22112215
uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());

src/llama-graph.cpp

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,20 +1286,24 @@ ggml_tensor * llm_graph_context::build_ffn(
12861286
switch (type_op) {
12871287
case LLM_FFN_SILU:
12881288
if (gate && type_gate == LLM_FFN_PAR) {
1289-
// Step35: HF clamps gate (after SiLU) and up before multiplication
1290-
if (arch == LLM_ARCH_STEP35 && il >= 0) {
1289+
if (il >= 0) {
12911290
const float limit = hparams.swiglu_clamp_shexp[il];
12921291
constexpr float eps = 1e-6f;
12931292
if (limit > eps) {
1294-
ggml_tensor * gate_act = ggml_silu(ctx0, cur);
1295-
cb(gate_act, "ffn_silu", il);
1296-
gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
1297-
cb(gate_act, "ffn_silu_clamped", il);
1298-
12991293
tmp = ggml_clamp(ctx0, tmp, -limit, limit);
13001294
cb(tmp, "ffn_up_clamped", il);
13011295

1302-
cur = ggml_mul(ctx0, gate_act, tmp);
1296+
if (arch == LLM_ARCH_DEEPSEEK_V4_FLASH) {
1297+
cur = ggml_clamp(ctx0, cur, -INFINITY, limit);
1298+
cb(cur, "ffn_gate_clamped", il);
1299+
cur = ggml_swiglu_split(ctx0, cur, tmp);
1300+
} else {
1301+
ggml_tensor * gate_act = ggml_silu(ctx0, cur);
1302+
cb(gate_act, "ffn_silu", il);
1303+
gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
1304+
cb(gate_act, "ffn_silu_clamped", il);
1305+
cur = ggml_mul(ctx0, gate_act, tmp);
1306+
}
13031307
cb(cur, "ffn_swiglu_limited", il);
13041308
type_gate = LLM_FFN_SEQ;
13051309
break;
@@ -1409,7 +1413,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
14091413
ggml_tensor * gate_up_exps,
14101414
ggml_tensor * up_exps_s,
14111415
ggml_tensor * gate_exps_s,
1412-
ggml_tensor * down_exps_s) const {
1416+
ggml_tensor * down_exps_s,
1417+
ggml_tensor * selected_experts_in) const {
14131418
return build_moe_ffn(
14141419
cur,
14151420
gate_inp, /* gate_inp_b */ nullptr,
@@ -1429,7 +1434,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
14291434
/* gate_up_exps_b */ nullptr,
14301435
up_exps_s,
14311436
gate_exps_s,
1432-
down_exps_s
1437+
down_exps_s,
1438+
selected_experts_in
14331439
);
14341440
}
14351441

@@ -1456,7 +1462,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
14561462
ggml_tensor * gate_up_exps_b,
14571463
ggml_tensor * up_exps_s,
14581464
ggml_tensor * gate_exps_s,
1459-
ggml_tensor * down_exps_s) const {
1465+
ggml_tensor * down_exps_s,
1466+
ggml_tensor * selected_experts_in) const {
14601467
const int64_t n_embd = cur->ne[0];
14611468
const int64_t n_tokens = cur->ne[1];
14621469
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
@@ -1465,6 +1472,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
14651472

14661473
if (probs_in == nullptr) {
14671474
logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
1475+
if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SQRT_SOFTPLUS) {
1476+
ggml_mul_mat_set_prec(logits, GGML_PREC_F32);
1477+
}
14681478
cb(logits, "ffn_moe_logits", il);
14691479
} else {
14701480
logits = probs_in;
@@ -1489,6 +1499,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
14891499
{
14901500
probs = logits; // [n_expert, n_tokens]
14911501
} break;
1502+
case LLAMA_EXPERT_GATING_FUNC_TYPE_SQRT_SOFTPLUS:
1503+
{
1504+
probs = ggml_sqrt(ctx0, ggml_softplus(ctx0, logits)); // [n_expert, n_tokens]
1505+
} break;
14921506
default:
14931507
GGML_ABORT("fatal error");
14941508
}
@@ -1539,8 +1553,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
15391553
}
15401554

15411555
// select experts
1542-
ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
1543-
cb(selected_experts->src[0], "ffn_moe_argsort", il);
1556+
ggml_tensor * selected_experts = selected_experts_in;
1557+
if (selected_experts == nullptr) {
1558+
selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
1559+
cb(selected_experts->src[0], "ffn_moe_argsort", il);
1560+
}
15441561
cb(selected_experts, "ffn_moe_topk", il);
15451562

15461563
if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
@@ -1668,20 +1685,24 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
16681685
switch (type_op) {
16691686
case LLM_FFN_SILU:
16701687
if (gate_exps) {
1671-
// Step35: per-layer clamp for routed experts
1672-
if (arch == LLM_ARCH_STEP35 && il >= 0) {
1688+
if (il >= 0) {
16731689
const float limit = hparams.swiglu_clamp_exp[il];
16741690
constexpr float eps = 1e-6f;
16751691
if (limit > eps) {
1676-
ggml_tensor * gate_act = ggml_silu(ctx0, cur);
1677-
cb(gate_act, "ffn_moe_silu", il);
1678-
gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
1679-
cb(gate_act, "ffn_moe_silu_clamped", il);
1680-
16811692
up = ggml_clamp(ctx0, up, -limit, limit);
16821693
cb(up, "ffn_moe_up_clamped", il);
16831694

1684-
cur = ggml_mul(ctx0, gate_act, up);
1695+
if (arch == LLM_ARCH_DEEPSEEK_V4_FLASH) {
1696+
cur = ggml_clamp(ctx0, cur, -INFINITY, limit);
1697+
cb(cur, "ffn_moe_gate_clamped", il);
1698+
cur = ggml_swiglu_split(ctx0, cur, up);
1699+
} else {
1700+
ggml_tensor * gate_act = ggml_silu(ctx0, cur);
1701+
cb(gate_act, "ffn_moe_silu", il);
1702+
gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
1703+
cb(gate_act, "ffn_moe_silu_clamped", il);
1704+
cur = ggml_mul(ctx0, gate_act, up);
1705+
}
16851706
cb(cur, "ffn_moe_swiglu_limited", il);
16861707
break;
16871708
}

src/llama-graph.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -906,7 +906,8 @@ struct llm_graph_context {
906906
ggml_tensor * gate_up_exps = nullptr,
907907
ggml_tensor * up_exps_s = nullptr,
908908
ggml_tensor * gate_exps_s = nullptr,
909-
ggml_tensor * down_exps_s = nullptr) const;
909+
ggml_tensor * down_exps_s = nullptr,
910+
ggml_tensor * selected_experts_in = nullptr) const;
910911

911912
ggml_tensor * build_moe_ffn(
912913
ggml_tensor * cur,
@@ -931,7 +932,8 @@ struct llm_graph_context {
931932
ggml_tensor * gate_up_exps_b = nullptr,
932933
ggml_tensor * up_exps_s = nullptr,
933934
ggml_tensor * gate_exps_s = nullptr,
934-
ggml_tensor * down_exps_s = nullptr) const;
935+
ggml_tensor * down_exps_s = nullptr,
936+
ggml_tensor * selected_experts_in = nullptr) const;
935937

936938
//
937939
// inputs

src/llama-hparams.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ enum llama_expert_gating_func_type {
1414
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
1515
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
1616
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
17+
LLAMA_EXPERT_GATING_FUNC_TYPE_SQRT_SOFTPLUS = 4,
1718
};
1819

1920
enum llama_swa_type {
@@ -211,6 +212,16 @@ struct llama_hparams {
211212
uint32_t indexer_head_size = 0;
212213
uint32_t indexer_top_k = 0;
213214

215+
// DeepSeek-V4 Flash
216+
uint32_t dsv4_o_group_count = 0;
217+
uint32_t dsv4_o_lora_rank = 0;
218+
uint32_t dsv4_hc_mult = 0;
219+
uint32_t dsv4_hc_sinkhorn_iters = 0;
220+
uint32_t dsv4_hash_layer_count = 0;
221+
float dsv4_compress_rope_base = 0.0f;
222+
float dsv4_hc_eps = 0.0f;
223+
std::array<uint32_t, LLAMA_MAX_LAYERS> dsv4_compress_ratios;
224+
214225
// qwen3vl deepstack
215226
uint32_t n_deepstack_layers = 0;
216227

src/llama-model-loader.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,8 @@ namespace GGUFMeta {
294294
}
295295

296296
template bool llama_model_loader::get_arr_n(enum llm_kv kid, uint32_t & result, bool required);
297+
template std::enable_if<std::is_integral<uint32_t>::value, bool>::type
298+
llama_model_loader::get_arr_n<uint32_t>(const std::string & key, uint32_t & result, bool required);
297299

298300
template<typename T>
299301
bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
@@ -393,6 +395,10 @@ namespace GGUFMeta {
393395
}
394396

395397
template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
398+
template bool llama_model_loader::get_arr<uint32_t, LLAMA_MAX_LAYERS>(
399+
const std::string & key,
400+
std::array<uint32_t, LLAMA_MAX_LAYERS> & result,
401+
bool required);
396402

397403
template<typename T>
398404
bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {

src/llama-model.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
175175
return new llama_model_deepseek2ocr(params);
176176
case LLM_ARCH_DEEPSEEK32:
177177
return new llama_model_deepseek32(params);
178+
case LLM_ARCH_DEEPSEEK_V4_FLASH:
179+
return new llama_model_deepseek_v4_flash(params);
178180
case LLM_ARCH_GLM_DSA:
179181
return new llama_model_glm_dsa(params);
180182
case LLM_ARCH_MISTRAL4:
@@ -794,6 +796,7 @@ static const char * llama_expert_gating_func_name(llama_expert_gating_func_type
794796
switch (type) {
795797
case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return "softmax";
796798
case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return "sigmoid";
799+
case LLAMA_EXPERT_GATING_FUNC_TYPE_SQRT_SOFTPLUS: return "sqrtsoftplus";
797800
default: return "unknown";
798801
}
799802
}
@@ -2295,6 +2298,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
22952298
case LLM_ARCH_DEEPSEEK2:
22962299
case LLM_ARCH_DEEPSEEK2OCR:
22972300
case LLM_ARCH_DEEPSEEK32:
2301+
case LLM_ARCH_DEEPSEEK_V4_FLASH:
22982302
case LLM_ARCH_PLM:
22992303
case LLM_ARCH_CHATGLM:
23002304
case LLM_ARCH_GRANITE:

0 commit comments

Comments
 (0)