@@ -76,6 +76,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
7676 { LLM_ARCH_DEEPSEEK2, " deepseek2" },
7777 { LLM_ARCH_DEEPSEEK2OCR, " deepseek2-ocr" },
7878 { LLM_ARCH_DEEPSEEK32, " deepseek32" },
79+ { LLM_ARCH_DEEPSEEK_V4_FLASH, " deepseek-v4-flash" },
7980 { LLM_ARCH_CHATGLM, " chatglm" },
8081 { LLM_ARCH_GLM4, " glm4" },
8182 { LLM_ARCH_GLM4_MOE, " glm4moe" },
@@ -427,6 +428,23 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
427428 { LLM_TENSOR_ATTN_Q_B, " blk.%d.attn_q_b" },
428429 { LLM_TENSOR_ATTN_KV_A_MQA, " blk.%d.attn_kv_a_mqa" },
429430 { LLM_TENSOR_ATTN_KV_B, " blk.%d.attn_kv_b" },
431+ { LLM_TENSOR_ATTN_KV, " blk.%d.attn_kv" },
432+ { LLM_TENSOR_ATTN_KV_NORM, " blk.%d.attn_kv_norm" },
433+ { LLM_TENSOR_ATTN_OUT_A, " blk.%d.attn_wo_a" },
434+ { LLM_TENSOR_ATTN_OUT_B, " blk.%d.attn_wo_b" },
435+ { LLM_TENSOR_HC_HEAD_FN, " output_hc.fn" },
436+ { LLM_TENSOR_HC_HEAD_BASE, " output_hc.base" },
437+ { LLM_TENSOR_HC_HEAD_SCALE, " output_hc.scale" },
438+ { LLM_TENSOR_HC_ATTN_FN, " blk.%d.hc_attn.fn" },
439+ { LLM_TENSOR_HC_ATTN_BASE, " blk.%d.hc_attn.base" },
440+ { LLM_TENSOR_HC_ATTN_SCALE, " blk.%d.hc_attn.scale" },
441+ { LLM_TENSOR_HC_FFN_FN, " blk.%d.hc_ffn.fn" },
442+ { LLM_TENSOR_HC_FFN_BASE, " blk.%d.hc_ffn.base" },
443+ { LLM_TENSOR_HC_FFN_SCALE, " blk.%d.hc_ffn.scale" },
444+ { LLM_TENSOR_ATTN_COMPRESSOR_WKV, " blk.%d.attn_comp.wkv" },
445+ { LLM_TENSOR_ATTN_COMPRESSOR_WGATE, " blk.%d.attn_comp.wgate" },
446+ { LLM_TENSOR_ATTN_COMPRESSOR_APE, " blk.%d.attn_comp.ape" },
447+ { LLM_TENSOR_ATTN_COMPRESSOR_NORM, " blk.%d.attn_comp.norm" },
430448 { LLM_TENSOR_PER_LAYER_TOKEN_EMBD, " per_layer_token_embd" },
431449 { LLM_TENSOR_PER_LAYER_MODEL_PROJ, " per_layer_model_proj" },
432450 { LLM_TENSOR_PER_LAYER_PROJ_NORM, " per_layer_proj_norm" },
@@ -551,6 +569,11 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
551569 { LLM_TENSOR_INDEXER_PROJ, " blk.%d.indexer.proj" },
552570 { LLM_TENSOR_INDEXER_ATTN_K, " blk.%d.indexer.attn_k" },
553571 { LLM_TENSOR_INDEXER_ATTN_Q_B, " blk.%d.indexer.attn_q_b" },
572+ { LLM_TENSOR_INDEXER_COMPRESSOR_WKV, " blk.%d.indexer_comp.wkv" },
573+ { LLM_TENSOR_INDEXER_COMPRESSOR_WGATE, " blk.%d.indexer_comp.wgate" },
574+ { LLM_TENSOR_INDEXER_COMPRESSOR_APE, " blk.%d.indexer_comp.ape" },
575+ { LLM_TENSOR_INDEXER_COMPRESSOR_NORM, " blk.%d.indexer_comp.norm" },
576+ { LLM_TENSOR_FFN_GATE_TID2EID, " blk.%d.ffn_gate_tid2eid" },
554577};
555578
556579// declare information about the model weight tensors:
@@ -597,6 +620,23 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
597620 {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
598621 {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
599622 {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
623+ {LLM_TENSOR_ATTN_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
624+ {LLM_TENSOR_ATTN_KV_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
625+ {LLM_TENSOR_ATTN_OUT_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
626+ {LLM_TENSOR_ATTN_OUT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
627+ {LLM_TENSOR_HC_HEAD_FN, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
628+ {LLM_TENSOR_HC_HEAD_BASE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}},
629+ {LLM_TENSOR_HC_HEAD_SCALE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
630+ {LLM_TENSOR_HC_ATTN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
631+ {LLM_TENSOR_HC_ATTN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
632+ {LLM_TENSOR_HC_ATTN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
633+ {LLM_TENSOR_HC_FFN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
634+ {LLM_TENSOR_HC_FFN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
635+ {LLM_TENSOR_HC_FFN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
636+ {LLM_TENSOR_ATTN_COMPRESSOR_WKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
637+ {LLM_TENSOR_ATTN_COMPRESSOR_WGATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
638+ {LLM_TENSOR_ATTN_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
639+ {LLM_TENSOR_ATTN_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
600640 {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
601641 {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
602642 {LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
@@ -760,6 +800,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
760800 {LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
761801 {LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
762802 {LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
803+ {LLM_TENSOR_INDEXER_COMPRESSOR_WKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
804+ {LLM_TENSOR_INDEXER_COMPRESSOR_WGATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
805+ {LLM_TENSOR_INDEXER_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
806+ {LLM_TENSOR_INDEXER_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
807+ {LLM_TENSOR_FFN_GATE_TID2EID, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
763808 // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
764809 // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
765810 // the model loader doesn't fault on the block index.
@@ -907,6 +952,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
907952 case LLM_ARCH_OLMOE:
908953 case LLM_ARCH_DEEPSEEK2:
909954 case LLM_ARCH_DEEPSEEK32:
955+ case LLM_ARCH_DEEPSEEK_V4_FLASH:
910956 case LLM_ARCH_GLM_DSA:
911957 case LLM_ARCH_BITNET:
912958 case LLM_ARCH_T5:
0 commit comments