diff --git a/conversion/__init__.py b/conversion/__init__.py
index 4a1fd5bb70f0..02ea6385208a 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -51,6 +51,7 @@
"DeepseekV3ForCausalLM": "deepseek",
"DeepseekV32ForCausalLM": "deepseek",
"DFlashDraftModel": "qwen",
+ "DeepseekV4ForCausalLM": "deepseek",
"DistilBertForMaskedLM": "bert",
"DistilBertForSequenceClassification": "bert",
"DistilBertModel": "bert",
diff --git a/conversion/base.py b/conversion/base.py
index 08fd3747c408..0421aa4bc4d3 100644
--- a/conversion/base.py
+++ b/conversion/base.py
@@ -1273,7 +1273,7 @@ def set_gguf_parameters(self):
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_eps(f_norm_eps)
logger.info(f"gguf: layer norm epsilon = {f_norm_eps}")
- if (n_experts := self.find_hparam(["num_local_experts", "num_experts"], optional=True)) is not None:
+ if (n_experts := self.find_hparam(["num_local_experts", "num_experts", "n_routed_experts"], optional=True)) is not None:
self.gguf_writer.add_expert_count(n_experts)
logger.info(f"gguf: expert count = {n_experts}")
if (n_experts_used := self.find_hparam(["num_experts_per_tok", "num_experts_per_token", "top_k_experts"], optional=True)) is not None:
@@ -1291,6 +1291,8 @@ def set_gguf_parameters(self):
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID)
elif score_func == "softmax":
self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX)
+ elif score_func == "sqrtsoftplus":
+ self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SQRTSOFTPLUS)
else:
raise ValueError(f"Unsupported expert score gating function value: {score_func}")
logger.info(f"gguf: expert score gating function = {score_func}")
@@ -2600,6 +2602,17 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
return cls._wrap_fn(func)(*args, **kwargs)
+# Register the "F8_E8M0" dtype string with LazyTorchTensor's lookup tables.
+# NOTE(review): presumably "F8_E8M0" is the dtype string found in safetensors headers — confirm.
+if hasattr(torch, "float8_e8m0fnu"):
+ _torch_float8_e8m0 = torch.float8_e8m0fnu
+ # Both numpy-side tables map this torch dtype to np.uint8 (one byte per element).
+ LazyTorchTensor._dtype_map[_torch_float8_e8m0] = np.uint8
+ LazyTorchTensor._dtype_byteswap_map[_torch_float8_e8m0] = np.uint8
+ LazyTorchTensor._dtype_str_map["F8_E8M0"] = _torch_float8_e8m0
+else:
+ # Older torch builds do not expose F8_E8M0. Keep the raw bytes so callers
+ # that know the format can decode them explicitly.
+ LazyTorchTensor._dtype_str_map["F8_E8M0"] = torch.uint8
+
+
def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> str:
# TODO @ngxson : this won't work correctly if the model has both audio & vision encoders
# maybe we should fallback to text model's arch in that case, since not many models have both
diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index 4c93fb66df64..cfac5201ec46 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -1,15 +1,18 @@
from __future__ import annotations
+import json
import re
+from pathlib import Path
from typing import Any, Callable, Iterable, TYPE_CHECKING
+import numpy as np
import torch
if TYPE_CHECKING:
from torch import Tensor
-from .base import MmprojModel, ModelBase, TextModel, gguf, logger
+from .base import LazyTorchTensor, MmprojModel, ModelBase, TextModel, gguf, logger
from .qwen import QwenModel
@@ -467,3 +470,310 @@ def set_gguf_parameters(self):
self.gguf_writer.add_indexer_head_count(self.hparams["index_n_heads"])
self.gguf_writer.add_indexer_key_length(self.hparams["index_head_dim"])
self.gguf_writer.add_indexer_top_k(self.hparams["index_topk"])
+
+
+@ModelBase.register("DeepseekV4ForCausalLM")
+class DeepseekV4Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK4
+ _skipped_mtp_tensors = 0
+
+ def __init__(self, *args, **kwargs):
+ """Initialise the converter, merge raw config.json keys and set up DeepSeek-V4 bookkeeping."""
+ # Reset the class-level counter that filter_tensors() bumps for every "mtp." tensor.
+ # NOTE(review): assumes the base constructor is what drives filter_tensors() — confirm.
+ type(self)._skipped_mtp_tensors = 0
+ super().__init__(*args, **kwargs)
+
+ # Re-read config.json and copy in only the top-level keys self.hparams does not
+ # already have (setdefault never overwrites an existing entry).
+ with open(self.dir_model / "config.json", "r", encoding="utf-8") as f:
+ raw_hparams = json.load(f)
+ for key, value in raw_hparams.items():
+ self.hparams.setdefault(key, value)
+
+ # Rebuild the tensor-name map from the (possibly just merged) layer count.
+ self.block_count = self.hparams["num_hidden_layers"]
+ self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+
+ # Per-tensor bookkeeping consulted later by tensor_force_quant() and
+ # generate_extra_tensors(); the sets hold source tensor names.
+ self._dsv4_fp8_dequantized: set[str] = set()
+ self._dsv4_bf16_tensors: set[str] = set()
+ self._dsv4_f32_tensors: set[str] = set()
+ self._dsv4_mxfp4_generated = False
+ self._collect_source_dtypes()
+
+ if type(self)._skipped_mtp_tensors:
+ logger.info("Skipping %d DeepSeek-V4 MTP tensor(s) for conversion v0", type(self)._skipped_mtp_tensors)
+
+ # add a default chat template; if the model has a built-in template, it will be overridden later
+ template_path = Path(__file__).parent.parent / "models" / "templates" / "deepseek-ai-DeepSeek-V4.jinja"
+ # A missing template file is tolerated: no chat template is added in that case.
+ if template_path.is_file():
+ with open(template_path, "r", encoding="utf-8") as f:
+ self.gguf_writer.add_chat_template(f.read())
+
+ @classmethod
+ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None:
+ name, _ = item
+ if name.startswith("mtp."):
+ cls._skipped_mtp_tensors += 1
+ return None
+ return super().filter_tensors(item)
+
+ def set_vocab(self):
+ """Write the vocabulary by delegating to the inherited ``_set_vocab_gpt2`` helper."""
+ self._set_vocab_gpt2()
+
+ @staticmethod
+ def _float8_dtypes() -> tuple[torch.dtype, ...]:
+ return tuple(
+ dtype for dtype in (
+ getattr(torch, "float8_e4m3fn", None),
+ getattr(torch, "float8_e5m2", None),
+ ) if dtype is not None
+ )
+
+ @staticmethod
+ def _e8m0_to_float(scale: Tensor) -> Tensor:
+ torch_float8_e8m0 = getattr(torch, "float8_e8m0fnu", None)
+ if torch_float8_e8m0 is not None and scale.dtype == torch_float8_e8m0:
+ return scale.float()
+
+ bits = scale.view(torch.uint8).float()
+ return torch.exp2(bits - 127.0)
+
+ def _collect_source_dtypes(self) -> None:
+ for name, gen in self.model_tensors.items():
+ dtype = gen().dtype
+ if dtype == torch.bfloat16:
+ self._dsv4_bf16_tensors.add(name)
+ elif dtype == torch.float32:
+ self._dsv4_f32_tensors.add(name)
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ hparams = self.hparams
+
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+ self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+ self.gguf_writer.add_sliding_window(hparams["sliding_window"])
+
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+ self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+ self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+ self.gguf_writer.add_swiglu_clamp_exp([hparams["swiglu_limit"]] * self.block_count)
+ self.gguf_writer.add_swiglu_clamp_shexp([hparams["swiglu_limit"]] * self.block_count)
+
+ self.gguf_writer.add_indexer_head_count(hparams["index_n_heads"])
+ self.gguf_writer.add_indexer_key_length(hparams["index_head_dim"])
+ self.gguf_writer.add_indexer_top_k(hparams["index_topk"])
+
+ self.gguf_writer.add_attention_output_group_count(hparams["o_groups"])
+ self.gguf_writer.add_attention_output_lora_rank(hparams["o_lora_rank"])
+ self.gguf_writer.add_attention_compress_ratios(hparams["compress_ratios"])
+ self.gguf_writer.add_attention_compress_rope_freq_base(hparams["compress_rope_theta"])
+ self.gguf_writer.add_hyper_connection_count(hparams["hc_mult"])
+ self.gguf_writer.add_hyper_connection_sinkhorn_iterations(hparams["hc_sinkhorn_iters"])
+ self.gguf_writer.add_hyper_connection_epsilon(hparams["hc_eps"])
+ self.gguf_writer.add_hash_layer_count(hparams["num_hash_layers"])
+
+ def dequant_model(self):
+ """Replace FP8 ``*.weight`` loaders with loaders that apply the matching ``*.scale``.
+
+ A ``X.scale`` / ``X.weight`` pair is rewritten only when the weight dtype is one of
+ ``_float8_dtypes()``. Rewritten weights are recorded in ``_dsv4_fp8_dequantized``
+ and their scale entries are removed from ``self.model_tensors``.
+ """
+ fp8_dtypes = self._float8_dtypes()
+ tensors_to_remove: list[str] = []
+
+ def dequant_fp8_weight(weight: Tensor, scale: Tensor) -> Tensor:
+ # Unpacking requires a 2-D weight.
+ out_features, in_features = weight.shape
+ scale_f = self._e8m0_to_float(scale)
+ # Repeat each scale entry 128 times along both axes, then crop to the weight shape.
+ scale_f = scale_f.repeat_interleave(128, 0)[:out_features]
+ scale_f = scale_f.repeat_interleave(128, 1)[:, :in_features]
+ return weight.float() * scale_f
+
+ # Iterate over a snapshot of the keys because entries are reassigned inside the loop.
+ for name in list(self.model_tensors.keys()):
+ if not name.endswith(".scale"):
+ continue
+ weight_name = name.removesuffix(".scale") + ".weight"
+ if weight_name not in self.model_tensors:
+ continue
+
+ weight = self.model_tensors[weight_name]
+ scale = self.model_tensors[name]
+ # Scales paired with a non-FP8 weight are left in place (other code consumes them).
+ if weight().dtype not in fp8_dtypes:
+ continue
+
+ # Default arguments bind this iteration's loaders, avoiding the late-binding closure pitfall.
+ self.model_tensors[weight_name] = lambda w=weight, s=scale: dequant_fp8_weight(w(), s())
+ self._dsv4_fp8_dequantized.add(weight_name)
+ tensors_to_remove.append(name)
+
+ # Deletion is deferred until the scan above is finished.
+ for name in tensors_to_remove:
+ del self.model_tensors[name]
+
+ @staticmethod
+ def _pack_mxfp4_blocks(weight: Tensor, scale: Tensor) -> np.ndarray:
+ packed = weight.contiguous().view(torch.uint8)
+ scale_u8 = scale.contiguous().view(torch.uint8)
+
+ out_features, packed_cols = packed.shape
+ logical_cols = packed_cols * 2
+ if logical_cols % 32 != 0:
+ raise ValueError(f"MXFP4 source row has {logical_cols} values, expected a multiple of 32")
+
+ n_blocks = logical_cols // 32
+ if tuple(scale_u8.shape) != (out_features, n_blocks):
+ raise ValueError(f"MXFP4 scale shape {tuple(scale_u8.shape)} does not match {(out_features, n_blocks)}")
+
+ src = packed.reshape(out_features, n_blocks, 16)
+ low = src & 0x0F
+ high = (src >> 4) & 0x0F
+
+ # The safetensors bytes store adjacent values as low/high nibbles.
+ # ggml MXFP4 blocks store values 0..15 in low nibbles and 16..31 in high nibbles.
+ vals = torch.stack((low, high), dim=-1).reshape(out_features, n_blocks, 32)
+ qs = vals[:, :, :16] | (vals[:, :, 16:] << 4)
+ raw = torch.cat((scale_u8.unsqueeze(-1), qs.to(torch.uint8)), dim=-1)
+ return raw.reshape(out_features, n_blocks * 17).cpu().numpy()
+
+ def _write_mxfp4_expert_tensor(self, bid: int, proj: str, tensor_key: gguf.MODEL_TENSOR) -> list[str]:
+ n_experts = self.hparams["n_routed_experts"]
+ data: np.ndarray | None = None
+ consumed: list[str] = []
+
+ for eid in range(n_experts):
+ weight_name = f"layers.{bid}.ffn.experts.{eid}.{proj}.weight"
+ scale_name = f"layers.{bid}.ffn.experts.{eid}.{proj}.scale"
+ if weight_name not in self.model_tensors or scale_name not in self.model_tensors:
+ raise KeyError(f"Missing routed expert tensors for {weight_name}")
+
+ weight = LazyTorchTensor.to_eager(self.model_tensors[weight_name]())
+ scale = LazyTorchTensor.to_eager(self.model_tensors[scale_name]())
+ packed = self._pack_mxfp4_blocks(weight, scale)
+ if data is None:
+ data = np.empty((n_experts, *packed.shape), dtype=packed.dtype)
+ data[eid] = packed
+ consumed.extend((weight_name, scale_name))
+
+ assert data is not None
+ new_name = self.format_tensor_name(tensor_key, bid)
+ shape = gguf.quant_shape_from_byte_shape(data.shape, gguf.GGMLQuantizationType.MXFP4)
+ logger.info(f"{new_name}: repacked routed experts to MXFP4, shape = {{{', '.join(str(n) for n in reversed(shape))}}}")
+ self.gguf_writer.add_tensor(new_name, data, raw_dtype=gguf.GGMLQuantizationType.MXFP4)
+
+ return consumed
+
+ def _write_hash_routing_tensors(self) -> list[str]:
+ consumed: list[str] = []
+
+ for bid in range(self.hparams["num_hash_layers"]):
+ name = f"layers.{bid}.ffn.gate.tid2eid"
+ if name not in self.model_tensors:
+ raise KeyError(f"Missing hash routing tensor {name}")
+
+ data_torch = LazyTorchTensor.to_eager(self.model_tensors[name]())
+ data = data_torch.to(torch.int32).cpu().numpy()
+ new_name = self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_TID2EID, bid, ".weight")
+ logger.info(f"{new_name}: converted hash routing table to I32, shape = {{{', '.join(str(n) for n in reversed(data.shape))}}}")
+ self.gguf_writer.add_tensor(new_name, data)
+ consumed.append(name)
+
+ return consumed
+
+ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]:
+ if self._dsv4_mxfp4_generated:
+ return ()
+
+ consumed: list[str] = self._write_hash_routing_tensors()
+ for bid in range(self.block_count):
+ consumed.extend(self._write_mxfp4_expert_tensor(bid, "w1", gguf.MODEL_TENSOR.FFN_GATE_EXP))
+ consumed.extend(self._write_mxfp4_expert_tensor(bid, "w2", gguf.MODEL_TENSOR.FFN_DOWN_EXP))
+ consumed.extend(self._write_mxfp4_expert_tensor(bid, "w3", gguf.MODEL_TENSOR.FFN_UP_EXP))
+
+ for name in consumed:
+ del self.model_tensors[name]
+
+ self._dsv4_mxfp4_generated = True
+ return ()
+
+ def _format_dsv4_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> str:
+ """Return the output tensor name for ``key``/``bid``/``suffix`` via ``format_tensor_name``."""
+ return self.format_tensor_name(key, bid, suffix)
+
+ def _map_dsv4_tensor_name(self, name: str, bid: int | None) -> tuple[gguf.MODEL_TENSOR, str]:
+ """Translate a checkpoint tensor name into a ``(MODEL_TENSOR, suffix)`` pair.
+
+ Root-level names are looked up first. Anything else must match
+ ``layers.<n>.<rest>`` with ``<n>`` equal to ``bid``, and ``<rest>`` is then
+ looked up in the per-layer table.
+
+ Raises:
+ ValueError: if the name is not recognised, or the layer index in the
+ name disagrees with ``bid``.
+ """
+ # Tensors that live outside any layer.
+ root_map: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+ "embed.weight": (gguf.MODEL_TENSOR.TOKEN_EMBD, ".weight"),
+ "norm.weight": (gguf.MODEL_TENSOR.OUTPUT_NORM, ".weight"),
+ "head.weight": (gguf.MODEL_TENSOR.OUTPUT, ".weight"),
+ "hc_head_fn": (gguf.MODEL_TENSOR.HC_HEAD_FN, ".weight"),
+ "hc_head_base": (gguf.MODEL_TENSOR.HC_HEAD_BASE, ".weight"),
+ "hc_head_scale": (gguf.MODEL_TENSOR.HC_HEAD_SCALE, ".weight"),
+ }
+ if name in root_map:
+ return root_map[name]
+
+ match = re.match(r"layers\.(\d+)\.(.+)$", name)
+ if match is None:
+ raise ValueError(f"Unsupported DeepSeek-V4 tensor {name!r}")
+
+ # Cross-check the caller-supplied block id against the index embedded in the name.
+ layer = int(match.group(1))
+ if bid != layer:
+ raise ValueError(f"Tensor {name!r} parsed bid {bid} but layer name has {layer}")
+
+ # Per-layer tensors, keyed by the part of the name after "layers.<n>.".
+ layer_map: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+ "hc_attn_fn": (gguf.MODEL_TENSOR.HC_ATTN_FN, ".weight"),
+ "hc_attn_base": (gguf.MODEL_TENSOR.HC_ATTN_BASE, ".weight"),
+ "hc_attn_scale": (gguf.MODEL_TENSOR.HC_ATTN_SCALE, ".weight"),
+ "hc_ffn_fn": (gguf.MODEL_TENSOR.HC_FFN_FN, ".weight"),
+ "hc_ffn_base": (gguf.MODEL_TENSOR.HC_FFN_BASE, ".weight"),
+ "hc_ffn_scale": (gguf.MODEL_TENSOR.HC_FFN_SCALE, ".weight"),
+ "attn.attn_sink": (gguf.MODEL_TENSOR.ATTN_SINKS, ".weight"),
+ "attn.wq_a.weight": (gguf.MODEL_TENSOR.ATTN_Q_A, ".weight"),
+ "attn.wq_b.weight": (gguf.MODEL_TENSOR.ATTN_Q_B, ".weight"),
+ "attn.q_norm.weight": (gguf.MODEL_TENSOR.ATTN_Q_A_NORM, ".weight"),
+ "attn.wkv.weight": (gguf.MODEL_TENSOR.ATTN_KV, ".weight"),
+ "attn.kv_norm.weight": (gguf.MODEL_TENSOR.ATTN_KV_NORM, ".weight"),
+ "attn.wo_a.weight": (gguf.MODEL_TENSOR.ATTN_OUT_A, ".weight"),
+ "attn.wo_b.weight": (gguf.MODEL_TENSOR.ATTN_OUT_B, ".weight"),
+ "attn.compressor.ape": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_APE, ".weight"),
+ "attn.compressor.wkv.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_WKV, ".weight"),
+ "attn.compressor.wgate.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_WGATE, ".weight"),
+ "attn.compressor.norm.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_NORM, ".weight"),
+ "attn.indexer.wq_b.weight": (gguf.MODEL_TENSOR.INDEXER_ATTN_Q_B, ".weight"),
+ "attn.indexer.weights_proj.weight": (gguf.MODEL_TENSOR.INDEXER_PROJ, ".weight"),
+ "attn.indexer.compressor.ape": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_APE, ".weight"),
+ "attn.indexer.compressor.wkv.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_WKV, ".weight"),
+ "attn.indexer.compressor.wgate.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_WGATE, ".weight"),
+ "attn.indexer.compressor.norm.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_NORM, ".weight"),
+ "attn_norm.weight": (gguf.MODEL_TENSOR.ATTN_NORM, ".weight"),
+ "ffn_norm.weight": (gguf.MODEL_TENSOR.FFN_NORM, ".weight"),
+ "ffn.gate.weight": (gguf.MODEL_TENSOR.FFN_GATE_INP, ".weight"),
+ "ffn.gate.bias": (gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, ".bias"),
+ "ffn.gate.tid2eid": (gguf.MODEL_TENSOR.FFN_GATE_TID2EID, ".weight"),
+ "ffn.shared_experts.w1.weight": (gguf.MODEL_TENSOR.FFN_GATE_SHEXP, ".weight"),
+ "ffn.shared_experts.w2.weight": (gguf.MODEL_TENSOR.FFN_DOWN_SHEXP, ".weight"),
+ "ffn.shared_experts.w3.weight": (gguf.MODEL_TENSOR.FFN_UP_SHEXP, ".weight"),
+ }
+
+ tensor_name = match.group(2)
+ if tensor_name in layer_map:
+ return layer_map[tensor_name]
+
+ # NOTE(review): every routed-expert name (w1/w2/w3, weight or scale) maps to
+ # FFN_GATE_EXP here; modify_tensors() in this class returns early for these
+ # names before calling this method, so this looks like a placeholder — confirm.
+ if re.match(r"ffn\.experts\.\d+\.w[123]\.(weight|scale)$", tensor_name):
+ return gguf.MODEL_TENSOR.FFN_GATE_EXP, ".weight"
+
+ raise ValueError(f"Unsupported DeepSeek-V4 tensor {name!r}")
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ if re.match(r"layers\.\d+\.ffn\.experts\.\d+\.w[123]\.(weight|scale)$", name):
+ return []
+
+ tensor_key, suffix = self._map_dsv4_tensor_name(name, bid)
+ if tensor_key == gguf.MODEL_TENSOR.FFN_GATE_TID2EID:
+ return []
+
+ return [(self._format_dsv4_tensor_name(tensor_key, bid, suffix), data_torch)]
+
+ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+ del new_name, bid # unused
+
+ if name in self._dsv4_fp8_dequantized and n_dims >= 2:
+ return gguf.GGMLQuantizationType.Q8_0
+ if name in self._dsv4_f32_tensors:
+ return gguf.GGMLQuantizationType.F32
+ if name in self._dsv4_bf16_tensors and n_dims >= 2:
+ return gguf.GGMLQuantizationType.BF16
+
+ return False
+
+ def prepare_tensors(self):
+ """Run the base tensor preparation, then mark the output as an MXFP4 MoE file."""
+ super().prepare_tensors()
+ # These are set only after the base pass has run.
+ # NOTE(review): presumably so the base pass is not influenced by them — confirm.
+ self._is_mxfp4 = True
+ self.ftype = gguf.LlamaFileType.MOSTLY_MXFP4_MOE
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index bcd10beb0418..b26fab727dd3 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -145,6 +145,7 @@ class LLM:
TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
FULL_ATTENTION_INTERVAL = "{arch}.full_attention_interval"
+ HASH_LAYER_COUNT = "{arch}.hash_layer_count"
ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
@@ -179,8 +180,12 @@ class Attention:
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
SLIDING_WINDOW = "{arch}.attention.sliding_window"
SCALE = "{arch}.attention.scale"
+ OUTPUT_GROUP_COUNT = "{arch}.attention.output_group_count"
+ OUTPUT_LORA_RANK = "{arch}.attention.output_lora_rank"
OUTPUT_SCALE = "{arch}.attention.output_scale"
VALUE_SCALE = "{arch}.attention.value_scale"
+ COMPRESS_RATIOS = "{arch}.attention.compress_ratios"
+ COMPRESS_ROPE_FREQ_BASE = "{arch}.attention.compress_rope_freq_base"
TEMPERATURE_LENGTH = "{arch}.attention.temperature_length"
KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
@@ -195,6 +200,11 @@ class Indexer:
KEY_LENGTH = "{arch}.attention.indexer.key_length"
TOP_K = "{arch}.attention.indexer.top_k"
+ # Metadata key templates under "{arch}.hyper_connection.*"; "{arch}" is filled in with str.format.
+ class HyperConnection:
+ COUNT = "{arch}.hyper_connection.count"
+ SINKHORN_ITERATIONS = "{arch}.hyper_connection.sinkhorn_iterations"
+ EPSILON = "{arch}.hyper_connection.epsilon"
+
class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
DIMENSION_COUNT_SWA = "{arch}.rope.dimension_count_swa"
@@ -469,6 +479,7 @@ class MODEL_ARCH(IntEnum):
DEEPSEEK2 = auto()
DEEPSEEK2OCR = auto()
DEEPSEEK32 = auto()
+ DEEPSEEK4 = auto()
CHATGLM = auto()
GLM4 = auto()
GLM4_MOE = auto()
@@ -554,6 +565,9 @@ class MODEL_TENSOR(IntEnum):
DENSE_2_OUT = auto() # embeddinggemma 2_Dense
DENSE_3_OUT = auto() # embeddinggemma 3_Dense
OUTPUT_NORM = auto()
+ HC_HEAD_FN = auto()
+ HC_HEAD_BASE = auto()
+ HC_HEAD_SCALE = auto()
ROPE_FREQS = auto()
ROPE_FACTORS_LONG = auto()
ROPE_FACTORS_SHORT = auto()
@@ -593,6 +607,7 @@ class MODEL_TENSOR(IntEnum):
FFN_DOWN_CHEXP = auto()
FFN_UP_CHEXP = auto()
FFN_EXP_PROBS_B = auto()
+ FFN_GATE_TID2EID = auto()
MOE_LATENT_DOWN = auto() # nemotron 3 super
MOE_LATENT_UP = auto() # nemotron 3 super
ATTN_Q_NORM = auto()
@@ -680,6 +695,20 @@ class MODEL_TENSOR(IntEnum):
ATTN_V_B = auto()
ATTN_Q_A_NORM = auto()
ATTN_KV_A_NORM = auto()
+ ATTN_KV = auto()
+ ATTN_KV_NORM = auto()
+ ATTN_OUT_A = auto()
+ ATTN_OUT_B = auto()
+ HC_ATTN_FN = auto()
+ HC_ATTN_BASE = auto()
+ HC_ATTN_SCALE = auto()
+ HC_FFN_FN = auto()
+ HC_FFN_BASE = auto()
+ HC_FFN_SCALE = auto()
+ ATTN_COMPRESSOR_WKV = auto()
+ ATTN_COMPRESSOR_WGATE = auto()
+ ATTN_COMPRESSOR_APE = auto()
+ ATTN_COMPRESSOR_NORM = auto()
FFN_SUB_NORM = auto()
ATTN_SUB_NORM = auto()
DEC_ATTN_NORM = auto()
@@ -741,6 +770,10 @@ class MODEL_TENSOR(IntEnum):
INDEXER_PROJ = auto()
INDEXER_ATTN_K = auto()
INDEXER_ATTN_Q_B = auto()
+ INDEXER_COMPRESSOR_WKV = auto()
+ INDEXER_COMPRESSOR_WGATE = auto()
+ INDEXER_COMPRESSOR_APE = auto()
+ INDEXER_COMPRESSOR_NORM = auto()
# vision
V_MMPROJ = auto()
V_MMPROJ_FC = auto()
@@ -1026,6 +1059,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr",
MODEL_ARCH.DEEPSEEK32: "deepseek32",
+ MODEL_ARCH.DEEPSEEK4: "deepseek4",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.GLM4: "glm4",
MODEL_ARCH.GLM4_MOE: "glm4moe",
@@ -1110,6 +1144,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
+ MODEL_TENSOR.HC_HEAD_FN: "output_hc_fn",
+ MODEL_TENSOR.HC_HEAD_BASE: "output_hc_base",
+ MODEL_TENSOR.HC_HEAD_SCALE: "output_hc_scale",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
@@ -1151,6 +1188,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
MODEL_TENSOR.FFN_GATE_UP_EXP: "blk.{bid}.ffn_gate_up_exps",
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
+ MODEL_TENSOR.FFN_GATE_TID2EID: "blk.{bid}.ffn_gate_tid2eid",
MODEL_TENSOR.MOE_LATENT_DOWN: "blk.{bid}.ffn_latent_down", # nemotron 3 super
MODEL_TENSOR.MOE_LATENT_UP: "blk.{bid}.ffn_latent_up", # nemotron 3 super
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
@@ -1236,6 +1274,20 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b",
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
+ MODEL_TENSOR.ATTN_KV: "blk.{bid}.attn_kv",
+ MODEL_TENSOR.ATTN_KV_NORM: "blk.{bid}.attn_kv_a_norm",
+ MODEL_TENSOR.ATTN_OUT_A: "blk.{bid}.attn_output_a",
+ MODEL_TENSOR.ATTN_OUT_B: "blk.{bid}.attn_output_b",
+ MODEL_TENSOR.HC_ATTN_FN: "blk.{bid}.hc_attn_fn",
+ MODEL_TENSOR.HC_ATTN_BASE: "blk.{bid}.hc_attn_base",
+ MODEL_TENSOR.HC_ATTN_SCALE: "blk.{bid}.hc_attn_scale",
+ MODEL_TENSOR.HC_FFN_FN: "blk.{bid}.hc_ffn_fn",
+ MODEL_TENSOR.HC_FFN_BASE: "blk.{bid}.hc_ffn_base",
+ MODEL_TENSOR.HC_FFN_SCALE: "blk.{bid}.hc_ffn_scale",
+ MODEL_TENSOR.ATTN_COMPRESSOR_WKV: "blk.{bid}.attn_compressor_kv",
+ MODEL_TENSOR.ATTN_COMPRESSOR_WGATE: "blk.{bid}.attn_compressor_gate",
+ MODEL_TENSOR.ATTN_COMPRESSOR_APE: "blk.{bid}.attn_compressor_ape",
+ MODEL_TENSOR.ATTN_COMPRESSOR_NORM: "blk.{bid}.attn_compressor_norm",
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
@@ -1297,6 +1349,10 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.INDEXER_PROJ: "blk.{bid}.indexer.proj",
MODEL_TENSOR.INDEXER_ATTN_K: "blk.{bid}.indexer.attn_k",
MODEL_TENSOR.INDEXER_ATTN_Q_B: "blk.{bid}.indexer.attn_q_b",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_WKV: "blk.{bid}.indexer_compressor_kv",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_WGATE: "blk.{bid}.indexer_compressor_gate",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_APE: "blk.{bid}.indexer_compressor_ape",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_NORM: "blk.{bid}.indexer_compressor_norm",
# vision
MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
@@ -3137,6 +3193,49 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD,
MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM,
],
+ MODEL_ARCH.DEEPSEEK4: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.HC_HEAD_FN,
+ MODEL_TENSOR.HC_HEAD_BASE,
+ MODEL_TENSOR.HC_HEAD_SCALE,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_SINKS,
+ MODEL_TENSOR.ATTN_Q_A,
+ MODEL_TENSOR.ATTN_Q_B,
+ MODEL_TENSOR.ATTN_Q_A_NORM,
+ MODEL_TENSOR.ATTN_KV,
+ MODEL_TENSOR.ATTN_KV_NORM,
+ MODEL_TENSOR.ATTN_OUT_A,
+ MODEL_TENSOR.ATTN_OUT_B,
+ MODEL_TENSOR.HC_ATTN_FN,
+ MODEL_TENSOR.HC_ATTN_BASE,
+ MODEL_TENSOR.HC_ATTN_SCALE,
+ MODEL_TENSOR.HC_FFN_FN,
+ MODEL_TENSOR.HC_FFN_BASE,
+ MODEL_TENSOR.HC_FFN_SCALE,
+ MODEL_TENSOR.ATTN_COMPRESSOR_WKV,
+ MODEL_TENSOR.ATTN_COMPRESSOR_WGATE,
+ MODEL_TENSOR.ATTN_COMPRESSOR_APE,
+ MODEL_TENSOR.ATTN_COMPRESSOR_NORM,
+ MODEL_TENSOR.INDEXER_PROJ,
+ MODEL_TENSOR.INDEXER_ATTN_Q_B,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_WKV,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_WGATE,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_APE,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_NORM,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_GATE_TID2EID,
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ ],
MODEL_ARCH.ERNIE4_5_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -4436,8 +4535,9 @@ class GGMLQuantizationType(IntEnum):
class ExpertGatingFuncType(IntEnum):
- SOFTMAX = 1
- SIGMOID = 2
+ SOFTMAX = 1
+ SIGMOID = 2
+ SQRTSOFTPLUS = 4
# TODO: add GGMLFileType from ggml_ftype in ggml.h
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index a06ec88b32ca..a95b4c117a56 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -715,6 +715,9 @@ def add_leading_dense_block_count(self, length: int) -> None:
def add_full_attention_interval(self, interval: int) -> None:
self.add_uint32(Keys.LLM.FULL_ATTENTION_INTERVAL.format(arch=self.arch), interval)
+ def add_hash_layer_count(self, count: int) -> None:
+ self.add_uint32(Keys.LLM.HASH_LAYER_COUNT.format(arch=self.arch), count)
+
def add_feed_forward_length(self, length: int | Sequence[int]) -> None:
if isinstance(length, int):
self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)
@@ -940,6 +943,27 @@ def add_relative_attn_buckets_count(self, value: int) -> None:
def add_sliding_window(self, value: int) -> None:
self.add_uint32(Keys.Attention.SLIDING_WINDOW.format(arch=self.arch), value)
+ def add_attention_output_group_count(self, count: int) -> None:
+ self.add_uint32(Keys.Attention.OUTPUT_GROUP_COUNT.format(arch=self.arch), count)
+
+ def add_attention_output_lora_rank(self, length: int) -> None:
+ self.add_uint32(Keys.Attention.OUTPUT_LORA_RANK.format(arch=self.arch), length)
+
+ def add_attention_compress_ratios(self, values: Sequence[int]) -> None:
+ self.add_array(Keys.Attention.COMPRESS_RATIOS.format(arch=self.arch), values)
+
+ def add_attention_compress_rope_freq_base(self, value: float) -> None:
+ self.add_float32(Keys.Attention.COMPRESS_ROPE_FREQ_BASE.format(arch=self.arch), value)
+
+ def add_hyper_connection_count(self, count: int) -> None:
+ self.add_uint32(Keys.HyperConnection.COUNT.format(arch=self.arch), count)
+
+ def add_hyper_connection_sinkhorn_iterations(self, count: int) -> None:
+ self.add_uint32(Keys.HyperConnection.SINKHORN_ITERATIONS.format(arch=self.arch), count)
+
+ def add_hyper_connection_epsilon(self, value: float) -> None:
+ self.add_float32(Keys.HyperConnection.EPSILON.format(arch=self.arch), value)
+
def add_attention_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
diff --git a/models/templates/deepseek-ai-DeepSeek-V4.jinja b/models/templates/deepseek-ai-DeepSeek-V4.jinja
new file mode 100644
index 000000000000..f19f787b1b7e
--- /dev/null
+++ b/models/templates/deepseek-ai-DeepSeek-V4.jinja
@@ -0,0 +1,112 @@
+{%- if not add_generation_prompt is defined -%}
+ {%- set add_generation_prompt = false -%}
+{%- endif -%}
+{%- if not thinking is defined -%}
+ {%- if enable_thinking is defined -%}
+ {%- set thinking = enable_thinking -%}
+ {%- else -%}
+ {%- set thinking = false -%}
+ {%- endif -%}
+{%- endif -%}
+{#- NOTE(review): both thinking tokens below are empty strings, so every place this template emits them renders nothing; presumably the literal marker text was lost — confirm against the upstream template. -#}
+{%- set dsml_token = '|DSML|' -%}
+{%- set thinking_start_token = '' -%}
+{%- set thinking_end_token = '' -%}
+{%- set tools_header = '## Tools\n\nYou have access to a set of tools to help answer the user\'s question. You can invoke tools by writing a "<' + dsml_token + 'tool_calls>" block like the following:\n\n<' + dsml_token + 'tool_calls>\n<' + dsml_token + 'invoke name="$TOOL_NAME">\n<' + dsml_token + 'parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE' + dsml_token + 'parameter>\n...\n' + dsml_token + 'invoke>\n<' + dsml_token + 'invoke name="$TOOL_NAME2">\n...\n' + dsml_token + 'invoke>\n' + dsml_token + 'tool_calls>\n\nString parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string="false"`.\n\nIf thinking_mode is enabled (triggered by ' + thinking_start_token + '), you MUST output your complete reasoning inside ' + thinking_start_token + '...' + thinking_end_token + ' BEFORE any tool calls or final response.\n\nOtherwise, output directly after ' + thinking_end_token + ' with tool calls or final response.\n\n### Available Tool Schemas\n\n' -%}
+{%- set tools_footer = '\nYou MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.\n' -%}
+{%- set ns = namespace(system_prompt='', is_first_sp=true) -%}
+{%- for message in messages -%}
+ {%- if message['role'] == 'system' -%}
+ {%- if ns.is_first_sp -%}
+ {%- set ns.system_prompt = ns.system_prompt + (message['content'] or '') -%}
+ {%- set ns.is_first_sp = false -%}
+ {%- else -%}
+ {%- set ns.system_prompt = ns.system_prompt + '\n\n' + (message['content'] or '') -%}
+ {%- endif -%}
+ {%- endif -%}
+{%- endfor -%}
+{%- if tools is defined and tools -%}
+ {%- set ts = namespace(schemas='') -%}
+ {%- for tool in tools -%}
+ {%- if tool['type'] == 'function' -%}
+ {%- set ts.schemas = ts.schemas + (tool['function'] | tojson) + '\n' -%}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- if ns.system_prompt -%}
+ {%- set ns.system_prompt = ns.system_prompt + '\n\n' + tools_header + ts.schemas + tools_footer -%}
+ {%- else -%}
+ {%- set ns.system_prompt = tools_header + ts.schemas + tools_footer -%}
+ {%- endif -%}
+{%- endif -%}
+{{- bos_token -}}
+{{- ns.system_prompt -}}
+{%- set last_user_idx = namespace(value=-1) -%}
+{%- for message in messages -%}
+ {%- if message['role'] == 'user' or message['role'] == 'developer' or message['role'] == 'tool' -%}
+ {%- set last_user_idx.value = loop.index0 -%}
+ {%- endif -%}
+{%- endfor -%}
+{%- set state = namespace(in_user=false) -%}
+{%- for message in messages -%}
+ {%- if message['role'] == 'user' or message['role'] == 'developer' -%}
+ {%- if state.in_user -%}
+ {{- '\n\n' -}}
+ {%- else -%}
+ {{- '<|User|>' -}}
+ {%- set state.in_user = true -%}
+ {%- endif -%}
+ {{- message['content'] or '' -}}
+ {%- elif message['role'] == 'tool' -%}
+ {%- if state.in_user -%}
+ {{- '\n\n' -}}
+ {%- else -%}
+ {{- '<|User|>' -}}
+ {%- set state.in_user = true -%}
+ {%- endif -%}
+ {{- '' + (message['content'] or '') + '' -}}
+ {%- elif message['role'] == 'assistant' -%}
+ {%- set state.in_user = false -%}
+ {{- '<|Assistant|>' -}}
+ {%- set is_after_last_user = loop.index0 > last_user_idx.value -%}
+ {%- if is_after_last_user and thinking -%}
+ {{- thinking_start_token -}}
+ {%- if message['reasoning_content'] is defined and message['reasoning_content'] -%}
+ {{- message['reasoning_content'] -}}
+ {%- endif -%}
+ {{- thinking_end_token -}}
+ {%- else -%}
+ {{- thinking_end_token -}}
+ {%- endif -%}
+ {%- if message['content'] is defined and message['content'] -%}
+ {{- message['content'] -}}
+ {%- endif -%}
+ {%- if message['tool_calls'] -%}
+ {{- '\n\n<' + dsml_token + 'tool_calls>\n' -}}
+ {%- for tool in message['tool_calls'] -%}
+ {%- set func = tool['function'] -%}
+ {{- '<' + dsml_token + 'invoke name="' + func['name'] + '">\n' -}}
+ {%- set args = func['arguments'] -%}
+ {%- if args is string -%}
+ {%- set args = args | from_json -%}
+ {%- endif -%}
+ {%- for key, val in args.items() -%}
+ {%- if val is string -%}
+ {{- '<' + dsml_token + 'parameter name="' + key + '" string="true">' + val + '' + dsml_token + 'parameter>\n' -}}
+ {%- else -%}
+ {{- '<' + dsml_token + 'parameter name="' + key + '" string="false">' + (val | tojson) + '' + dsml_token + 'parameter>\n' -}}
+ {%- endif -%}
+ {%- endfor -%}
+ {{- '' + dsml_token + 'invoke>\n' -}}
+ {%- endfor -%}
+ {{- '' + dsml_token + 'tool_calls>' -}}
+ {%- endif -%}
+ {{- '<|end▁of▁sentence|>' -}}
+ {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+ {{- '<|Assistant|>' -}}
+ {%- if thinking -%}
+ {{- thinking_start_token -}}
+ {%- else -%}
+ {{- thinking_end_token -}}
+ {%- endif -%}
+{%- endif -%}
\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index d15ccfd99f14..320784c3a8cc 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -25,6 +25,7 @@ add_library(llama
llama-kv-cache.cpp
llama-kv-cache-iswa.cpp
llama-kv-cache-dsa.cpp
+ llama-kv-cache-dsv4.cpp
llama-memory.cpp
llama-memory-hybrid.cpp
llama-memory-hybrid-iswa.cpp
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index d80915ffdba5..98f391a9115f 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -77,6 +77,7 @@ static const std::map LLM_ARCH_NAMES = {
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" },
{ LLM_ARCH_DEEPSEEK32, "deepseek32" },
+ { LLM_ARCH_DEEPSEEK4, "deepseek4" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_GLM4, "glm4" },
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
@@ -440,6 +441,23 @@ static const std::map LLM_TENSOR_NAMES = {
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_ATTN_KV, "blk.%d.attn_kv" },
+ { LLM_TENSOR_ATTN_KV_NORM, "blk.%d.attn_kv_a_norm" },
+ { LLM_TENSOR_ATTN_OUT_A, "blk.%d.attn_output_a" },
+ { LLM_TENSOR_ATTN_OUT_B, "blk.%d.attn_output_b" },
+ { LLM_TENSOR_HC_HEAD_FN, "output_hc_fn" },
+ { LLM_TENSOR_HC_HEAD_BASE, "output_hc_base" },
+ { LLM_TENSOR_HC_HEAD_SCALE, "output_hc_scale" },
+ { LLM_TENSOR_HC_ATTN_FN, "blk.%d.hc_attn_fn" },
+ { LLM_TENSOR_HC_ATTN_BASE, "blk.%d.hc_attn_base" },
+ { LLM_TENSOR_HC_ATTN_SCALE, "blk.%d.hc_attn_scale" },
+ { LLM_TENSOR_HC_FFN_FN, "blk.%d.hc_ffn_fn" },
+ { LLM_TENSOR_HC_FFN_BASE, "blk.%d.hc_ffn_base" },
+ { LLM_TENSOR_HC_FFN_SCALE, "blk.%d.hc_ffn_scale" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_WKV, "blk.%d.attn_compressor_kv" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_WGATE, "blk.%d.attn_compressor_gate" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_APE, "blk.%d.attn_compressor_ape" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_NORM, "blk.%d.attn_compressor_norm" },
{ LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" },
{ LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" },
{ LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" },
@@ -566,6 +584,11 @@ static const std::map LLM_TENSOR_NAMES = {
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_WKV, "blk.%d.indexer_compressor_kv" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_WGATE, "blk.%d.indexer_compressor_gate" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_APE, "blk.%d.indexer_compressor_ape" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_NORM, "blk.%d.indexer_compressor_norm" },
+ { LLM_TENSOR_FFN_GATE_TID2EID, "blk.%d.ffn_gate_tid2eid" },
{ LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" },
{ LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" },
{ LLM_TENSOR_FC, "fc" },
@@ -616,6 +639,23 @@ static const std::map LLM_TENSOR_INFOS = {
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_KV_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_OUT_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_OUT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_HC_HEAD_FN, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_HC_HEAD_BASE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}},
+ {LLM_TENSOR_HC_HEAD_SCALE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
+ {LLM_TENSOR_HC_ATTN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_HC_ATTN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_HC_ATTN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_HC_FFN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_HC_FFN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_HC_FFN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_WKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_WGATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
{LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
@@ -779,6 +819,11 @@ static const std::map LLM_TENSOR_INFOS = {
{LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_WKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_WGATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_FFN_GATE_TID2EID, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
{LLM_TENSOR_NEXTN_PROJ_PRE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_NEXTN_PROJ_POST, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
// NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
@@ -933,6 +978,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
case LLM_ARCH_OLMOE:
case LLM_ARCH_DEEPSEEK2:
case LLM_ARCH_DEEPSEEK32:
+ case LLM_ARCH_DEEPSEEK4:
case LLM_ARCH_GLM_DSA:
case LLM_ARCH_BITNET:
case LLM_ARCH_T5:
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 946518d5f224..7087785d522d 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -82,6 +82,7 @@ enum llm_arch {
LLM_ARCH_DEEPSEEK2,
LLM_ARCH_DEEPSEEK2OCR,
LLM_ARCH_DEEPSEEK32,
+ LLM_ARCH_DEEPSEEK4,
LLM_ARCH_CHATGLM,
LLM_ARCH_GLM4,
LLM_ARCH_GLM4_MOE,
@@ -501,10 +502,27 @@ enum llm_tensor {
LLM_TENSOR_ATTN_Q_B,
LLM_TENSOR_ATTN_KV_A_MQA,
LLM_TENSOR_ATTN_KV_B,
+ LLM_TENSOR_ATTN_KV,
+ LLM_TENSOR_ATTN_KV_NORM,
+ LLM_TENSOR_ATTN_OUT_A,
+ LLM_TENSOR_ATTN_OUT_B,
LLM_TENSOR_ATTN_K_B,
LLM_TENSOR_ATTN_V_B,
LLM_TENSOR_ATTN_Q_A_NORM,
LLM_TENSOR_ATTN_KV_A_NORM,
+ LLM_TENSOR_HC_HEAD_FN,
+ LLM_TENSOR_HC_HEAD_BASE,
+ LLM_TENSOR_HC_HEAD_SCALE,
+ LLM_TENSOR_HC_ATTN_FN,
+ LLM_TENSOR_HC_ATTN_BASE,
+ LLM_TENSOR_HC_ATTN_SCALE,
+ LLM_TENSOR_HC_FFN_FN,
+ LLM_TENSOR_HC_FFN_BASE,
+ LLM_TENSOR_HC_FFN_SCALE,
+ LLM_TENSOR_ATTN_COMPRESSOR_WKV,
+ LLM_TENSOR_ATTN_COMPRESSOR_WGATE,
+ LLM_TENSOR_ATTN_COMPRESSOR_APE,
+ LLM_TENSOR_ATTN_COMPRESSOR_NORM,
LLM_TENSOR_ATTN_SUB_NORM,
LLM_TENSOR_FFN_SUB_NORM,
LLM_TENSOR_DEC_ATTN_NORM,
@@ -566,6 +584,11 @@ enum llm_tensor {
LLM_TENSOR_INDEXER_PROJ,
LLM_TENSOR_INDEXER_ATTN_K,
LLM_TENSOR_INDEXER_ATTN_Q_B,
+ LLM_TENSOR_INDEXER_COMPRESSOR_WKV,
+ LLM_TENSOR_INDEXER_COMPRESSOR_WGATE,
+ LLM_TENSOR_INDEXER_COMPRESSOR_APE,
+ LLM_TENSOR_INDEXER_COMPRESSOR_NORM,
+ LLM_TENSOR_FFN_GATE_TID2EID,
LLM_TENSOR_NEXTN_PROJ_PRE,
LLM_TENSOR_NEXTN_PROJ_POST,
LLM_TENSOR_NEXTN_EH_PROJ,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 029141e2aaf2..0465430df43a 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2321,7 +2321,11 @@ void llama_context::output_reorder() {
//
uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
- if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
+ if (model.arch == LLM_ARCH_QWEN3NEXT ||
+ model.arch == LLM_ARCH_KIMI_LINEAR ||
+ model.arch == LLM_ARCH_QWEN35 ||
+ model.arch == LLM_ARCH_QWEN35MOE ||
+ model.arch == LLM_ARCH_DEEPSEEK4) {
return std::max(n_tokens * 40, 32u * model.n_tensors());
}
uint32_t res = std::max(1024u, 8u*model.n_tensors());
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 3ded70bc0f71..4c86e43c1f74 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -8,6 +8,7 @@
#include "llama-kv-cache.h"
#include "llama-kv-cache-iswa.h"
#include "llama-kv-cache-dsa.h"
+#include "llama-kv-cache-dsv4.h"
#include "llama-memory-hybrid.h"
#include "llama-memory-hybrid-iswa.h"
#include "llama-memory-recurrent.h"
@@ -17,6 +18,7 @@
#include
#include
#include
+#include
#include
// dedup helpers
@@ -568,7 +570,9 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
// base tensors may not be allocated if there are no non-SWA attention layers
if (self_k_idxs && self_k_idxs->buffer) {
mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
- mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+ if (self_v_idxs) {
+ mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+ }
}
// the kq mask guards on its own buffer: shared cells leave idxs unbacked while the mask stays live
@@ -579,7 +583,9 @@ void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
// swa tensors may not be allocated if there are no SWA attention layers
if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
- mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+ if (self_v_idxs_swa) {
+ mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+ }
}
if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
@@ -633,6 +639,283 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
return res;
}
+// Upload a host-side vector of 64-bit values into a 1D graph input tensor.
+// Does nothing when the tensor was not created or has no backing buffer.
+static void dsv4_set_i64(ggml_tensor * dst, const std::vector<int64_t> & src) {
+    if (!dst || !dst->buffer) {
+        return;
+    }
+
+    // the copied byte count is derived from the tensor element size, so the
+    // tensor type has to match the element type of the source vector
+    GGML_ASSERT(dst->type == GGML_TYPE_I64);
+    GGML_ASSERT(dst->ne[0] == (int64_t) src.size());
+    ggml_backend_tensor_set(dst, src.data(), 0, src.size()*ggml_element_size(dst));
+}
+
+// Upload a host-side vector of 32-bit values into a 1D graph input tensor.
+// Does nothing when the tensor was not created or has no backing buffer.
+static void dsv4_set_i32(ggml_tensor * dst, const std::vector<int32_t> & src) {
+    if (!dst || !dst->buffer) {
+        return;
+    }
+
+    // the copied byte count is derived from the tensor element size, so the
+    // tensor type has to match the element type of the source vector
+    GGML_ASSERT(dst->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->ne[0] == (int64_t) src.size());
+    ggml_backend_tensor_set(dst, src.data(), 0, src.size()*ggml_element_size(dst));
+}
+
+// Fill the attention mask of one compressed-cache plan: in the row of token i
+// the first plan.n_visible[i] entries become 0 and all remaining entries
+// become -INFINITY. Does nothing when the tensor is absent or unallocated.
+static void dsv4_set_kq_mask(
+        ggml_tensor * dst,
+        const llama_kv_cache_dsv4_context::comp_plan & plan,
+        uint32_t n_tokens,
+        int64_t n_stream) {
+    const bool has_storage = dst != nullptr && dst->buffer != nullptr;
+    if (!has_storage) {
+        return;
+    }
+
+    // the mask must be an F32 [n_kv, n_tokens/n_stream, 1, n_stream] tensor in a host buffer
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(n_stream > 0);
+    GGML_ASSERT(n_tokens%n_stream == 0);
+    GGML_ASSERT(dst->ne[0] == plan.n_kv);
+    GGML_ASSERT(dst->ne[1] == (int64_t) n_tokens/n_stream);
+    GGML_ASSERT(dst->ne[2] == 1);
+    GGML_ASSERT(dst->ne[3] == n_stream);
+    GGML_ASSERT((int64_t) plan.n_visible.size() == (int64_t) n_tokens);
+    GGML_ASSERT(ggml_backend_buffer_is_host(dst->buffer));
+
+    const int64_t row_len = dst->ne[0];
+    const int64_t n_rows  = (int64_t) n_tokens;
+
+    float * out = (float *) dst->data;
+
+    for (int64_t row = 0; row < n_rows; ++row) {
+        const int32_t visible = plan.n_visible[row];
+
+        // rows are written back to back, one per token
+        float * cells = out + row*row_len;
+
+        for (int64_t col = 0; col < row_len; ++col) {
+            if (col < visible) {
+                cells[col] = 0.0f;
+            } else {
+                cells[col] = -INFINITY;
+            }
+        }
+    }
+}
+
+// Create the graph input tensor for the raw-cache attention mask with shape
+// [n_kv, n_tokens/n_stream, 1, n_stream]. The tensor is F16 when
+// cparams.flash_attn is set and either the KV cache is not unified or there is
+// a single stream; otherwise it is F32.
+static ggml_tensor * dsv4_build_raw_kq_mask(
+        ggml_context * ctx,
+        const llama_kv_cache_dsv4_raw_context * mctx,
+        const llama_ubatch & ubatch,
+        const llama_cparams & cparams,
+        int64_t n_stream) {
+    const auto n_kv     = mctx->get_n_kv();
+    const auto n_tokens = ubatch.n_tokens;
+
+    GGML_ASSERT(n_stream > 0);
+    GGML_ASSERT(n_tokens%n_stream == 0);
+
+    ggml_type mask_type = GGML_TYPE_F32;
+    if (cparams.flash_attn && (!cparams.kv_unified || n_stream == 1)) {
+        mask_type = GGML_TYPE_F16;
+    }
+
+    ggml_tensor * mask = ggml_new_tensor_4d(ctx, mask_type, n_kv, n_tokens/n_stream, 1, n_stream);
+    ggml_set_input(mask);
+    ggml_set_name(mask, "attn_inp_kq_mask");
+
+    return mask;
+}
+
+// Check whether an existing raw-cache mask tensor still has the shape
+// [n_kv, n_tokens/n_stream, 1, n_stream] required by the current ubatch.
+static bool dsv4_can_reuse_raw_kq_mask(
+        ggml_tensor * kq_mask,
+        const llama_kv_cache_dsv4_raw_context * mctx,
+        const llama_ubatch & ubatch,
+        int64_t n_stream) {
+    const auto n_kv     = mctx->get_n_kv();
+    const auto n_tokens = ubatch.n_tokens;
+
+    GGML_ASSERT(n_stream > 0);
+
+    const bool same_n_kv     = kq_mask->ne[0] == n_kv;
+    const bool same_n_rows   = kq_mask->ne[1] == n_tokens/n_stream;
+    const bool same_ne2      = kq_mask->ne[2] == 1;
+    const bool same_n_stream = kq_mask->ne[3] == n_stream;
+
+    return same_n_kv && same_n_rows && same_ne2 && same_n_stream;
+}
+
+// Format a list of plan indices/positions as "[a, b, c]" for debug logging.
+// An empty list is rendered as "[]".
+static std::string dsv4_plan_positions(const std::vector<int32_t> & values) {
+    std::ostringstream ss;
+    ss << "[";
+    for (size_t i = 0; i < values.size(); ++i) {
+        // separator goes before every element except the first
+        if (i > 0) {
+            ss << ", ";
+        }
+        ss << values[i];
+    }
+    ss << "]";
+    return ss.str();
+}
+
+// Report whether debug logging was requested through the environment variable
+// LLAMA_DSV4_COMPRESS_DEBUG (any value that atoi() parses as > 0). The variable
+// is read once, on the first call, and the result is cached.
+static bool dsv4_compress_debug() {
+    static const bool enabled = []() -> bool {
+        const char * value = getenv("LLAMA_DSV4_COMPRESS_DEBUG");
+        if (!value) {
+            return false;
+        }
+        return atoi(value) > 0;
+    }();
+
+    return enabled;
+}
+
+// Upload all inputs of one compressed-cache plan (state positions, the
+// persist/read/write index lists and the attention mask) into the matching
+// graph tensors. When `debug` is set or LLAMA_DSV4_COMPRESS_DEBUG is enabled,
+// the persist destinations and write positions are logged under `name`.
+static void dsv4_set_comp_inputs(
+        const llm_graph_input_dsv4::comp_input & inp,
+        const llama_kv_cache_dsv4_context::comp_plan & plan,
+        const char * name,
+        bool debug,
+        uint32_t n_tokens,
+        int64_t n_stream) {
+    dsv4_set_i32(inp.state_pos,              plan.state_pos);
+    dsv4_set_i32(inp.state_persist_src_idxs, plan.state_persist_src_idxs);
+    dsv4_set_i32(inp.state_persist_dst_idxs, plan.state_persist_dst_idxs);
+    dsv4_set_i32(inp.state_read_idxs,        plan.state_read_idxs);
+    dsv4_set_i64(inp.state_write_idxs,       plan.state_write_idxs);
+    dsv4_set_i32(inp.state_write_pos,        plan.state_write_pos);
+
+    dsv4_set_kq_mask(inp.kq_mask, plan, n_tokens, n_stream);
+
+    // the environment switch is only consulted when the caller did not ask for logging
+    const bool verbose = debug || dsv4_compress_debug();
+    if (!verbose) {
+        return;
+    }
+
+    LLAMA_LOG_INFO("%s: %s n_tokens=%u, n_stream=%d, state_persist_dst=%s, state_write_pos=%s\n",
+            __func__, name, n_tokens, (int) n_stream,
+            dsv4_plan_positions(plan.state_persist_dst_idxs).c_str(),
+            dsv4_plan_positions(plan.state_write_pos).c_str());
+}
+
+// A 1D input can be reused when it is absent and the plan needs zero elements,
+// or when it exists and its first dimension equals the required length.
+static bool dsv4_can_reuse_tensor_1d(ggml_tensor * t, int64_t ne0) {
+    if (t == nullptr) {
+        return ne0 == 0;
+    }
+
+    return t->ne[0] == ne0;
+}
+
+// Check whether a compressed-cache mask tensor matches the current plan:
+// no tensor at all when plan.n_kv is 0, otherwise a tensor shaped
+// [plan.n_kv, n_tokens/n_stream, 1, n_stream].
+static bool dsv4_can_reuse_kq_mask(
+        ggml_tensor * t,
+        const llama_kv_cache_dsv4_context::comp_plan & plan,
+        uint32_t n_tokens,
+        int64_t n_stream) {
+    if (plan.n_kv == 0) {
+        return t == nullptr;
+    }
+
+    GGML_ASSERT(n_stream > 0);
+
+    if (t == nullptr) {
+        return false;
+    }
+
+    const bool same_shape =
+        t->ne[0] == plan.n_kv &&
+        t->ne[1] == (int64_t) n_tokens/n_stream &&
+        t->ne[2] == 1 &&
+        t->ne[3] == n_stream;
+
+    return same_shape;
+}
+
+// Check whether every tensor of a comp_input still matches the sizes required
+// by the given plan. All checks are evaluated, none is short-circuited.
+static bool dsv4_can_reuse_comp_input(
+        const llm_graph_input_dsv4::comp_input & inp,
+        const llama_kv_cache_dsv4_context::comp_plan & plan,
+        uint32_t n_tokens,
+        int64_t n_stream) {
+    // braced initializers are evaluated left to right, same order as the tensors are built
+    const bool checks[] = {
+        dsv4_can_reuse_tensor_1d(inp.state_pos,              plan.state_pos.size()),
+        dsv4_can_reuse_tensor_1d(inp.state_persist_src_idxs, plan.state_persist_src_idxs.size()),
+        dsv4_can_reuse_tensor_1d(inp.state_persist_dst_idxs, plan.state_persist_dst_idxs.size()),
+        dsv4_can_reuse_tensor_1d(inp.state_read_idxs,        plan.state_read_idxs.size()),
+        dsv4_can_reuse_tensor_1d(inp.state_write_idxs,       plan.state_write_idxs.size()),
+        dsv4_can_reuse_tensor_1d(inp.state_write_pos,        plan.state_write_pos.size()),
+        dsv4_can_reuse_kq_mask  (inp.kq_mask, plan, n_tokens, n_stream),
+    };
+
+    bool all_match = true;
+    for (const bool ok : checks) {
+        all_match = all_match && ok;
+    }
+
+    return all_match;
+}
+
+// Create a named 1D graph input tensor of the given type and length.
+// Returns nullptr (and creates nothing) when the requested length is 0.
+static ggml_tensor * dsv4_build_input_1d(
+        ggml_context * ctx,
+        ggml_type type,
+        int64_t ne0,
+        const std::string & name) {
+    ggml_tensor * t = nullptr;
+
+    if (ne0 != 0) {
+        t = ggml_new_tensor_1d(ctx, type, ne0);
+        ggml_set_input(t);
+        ggml_set_name(t, name.c_str());
+    }
+
+    return t;
+}
+
+// Create the graph input tensors of one compressed-cache plan. Every tensor is
+// named "dsv4_<name>_<field>". 1D inputs whose plan list is empty are left as
+// nullptr, and the mask is only created when plan.n_kv > 0.
+static void dsv4_build_comp_inputs(
+        ggml_context * ctx,
+        llm_graph_input_dsv4::comp_input & inp,
+        const llama_kv_cache_dsv4_context::comp_plan & plan,
+        const char * name,
+        int64_t n_stream) {
+    const std::string prefix = std::string("dsv4_") + name;
+
+    const auto make_1d = [&](ggml_type type, size_t n, const char * suffix) {
+        return dsv4_build_input_1d(ctx, type, n, prefix + suffix);
+    };
+
+    inp.state_pos              = make_1d(GGML_TYPE_I32, plan.state_pos.size(),              "_state_pos");
+    inp.state_persist_src_idxs = make_1d(GGML_TYPE_I32, plan.state_persist_src_idxs.size(), "_state_persist_src_idxs");
+    inp.state_persist_dst_idxs = make_1d(GGML_TYPE_I32, plan.state_persist_dst_idxs.size(), "_state_persist_dst_idxs");
+    inp.state_read_idxs        = make_1d(GGML_TYPE_I32, plan.state_read_idxs.size(),        "_state_read_idxs");
+    inp.state_write_idxs       = make_1d(GGML_TYPE_I64, plan.state_write_idxs.size(),       "_state_write_idxs");
+    inp.state_write_pos        = make_1d(GGML_TYPE_I32, plan.state_write_pos.size(),        "_state_write_pos");
+
+    if (plan.n_kv > 0) {
+        // one mask row per token of the plan, split evenly across the streams
+        const int64_t n_tokens = (int64_t) plan.n_visible.size();
+
+        GGML_ASSERT(n_stream > 0);
+        GGML_ASSERT(n_tokens%n_stream == 0);
+
+        const std::string mask_name = prefix + "_kq_mask";
+
+        inp.kq_mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, plan.n_kv, n_tokens/n_stream, 1, n_stream);
+        ggml_set_input(inp.kq_mask);
+        ggml_set_name(inp.kq_mask, mask_name.c_str());
+    }
+}
+
+// Upload the raw-cache inputs (K indices, attention mask, K rotation) for the
+// given ubatch. Each input is skipped when its tensor was not created or has
+// no backing buffer.
+void llm_graph_input_dsv4_raw::set_input(const llama_ubatch * ubatch) {
+    if (self_k_idxs && self_k_idxs->buffer) {
+        mctx->set_input_k_idxs(self_k_idxs);
+    }
+
+    if (self_kq_mask && self_kq_mask->buffer) {
+        mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
+
+    // guard on the buffer as well, same as the other inputs here and the lid k_rot input
+    if (self_k_rot && self_k_rot->buffer) {
+        mctx->set_input_k_rot(self_k_rot);
+    }
+}
+
+// Upload all DSV4 inputs for the given ubatch: first the raw-cache inputs, then
+// the csa, hca and lid plan inputs, and finally the lid K rotation (only when
+// its tensor exists and has a backing buffer).
+void llm_graph_input_dsv4::set_input(const llama_ubatch * ubatch) {
+    const auto & plan_csa = mctx->get_csa_plan(*ubatch);
+    const auto & plan_hca = mctx->get_hca_plan(*ubatch);
+    const auto & plan_lid = mctx->get_lid_plan(*ubatch);
+
+    // the stream count of the csa plan is used for all three plans
+    const int64_t n_stream = plan_csa.n_stream;
+    const auto    n_tokens = ubatch->n_tokens;
+    const bool    verbose  = debug > 0;
+
+    // point the raw inputs at the raw context of the current memory context before uploading
+    inp_raw->mctx = mctx->get_raw();
+    inp_raw->set_input(ubatch);
+
+    dsv4_set_comp_inputs(inp_csa, plan_csa, "csa", verbose, n_tokens, n_stream);
+    dsv4_set_comp_inputs(inp_hca, plan_hca, "hca", verbose, n_tokens, n_stream);
+    dsv4_set_comp_inputs(inp_lid, plan_lid, "lid", verbose, n_tokens, n_stream);
+
+    ggml_tensor * lid_k_rot = inp_lid.k_rot;
+    if (lid_k_rot != nullptr && lid_k_rot->buffer != nullptr) {
+        mctx->get_lid()->set_input_k_rot(lid_k_rot);
+    }
+}
+
+// Decide whether the previously built input tensors can be reused for the
+// graph described by `params`. The stored memory-context pointers are updated
+// to the ones from `params` in any case; the result is true only when the raw
+// inputs and all three plan inputs still have the required shapes.
+bool llm_graph_input_dsv4::can_reuse(const llm_graph_params & params) {
+    const auto * mctx    = static_cast<const llama_kv_cache_dsv4_context *>(params.mctx);
+    const auto * raw_ctx = mctx->get_raw();
+
+    // the local shadows the member on purpose: adopt the new contexts first
+    this->mctx    = mctx;
+    inp_raw->mctx = raw_ctx;
+
+    bool res = true;
+
+    const auto & plan_csa = mctx->get_csa_plan(params.ubatch);
+    const auto & plan_hca = mctx->get_hca_plan(params.ubatch);
+    const auto & plan_lid = mctx->get_lid_plan(params.ubatch);
+    const int64_t n_stream = plan_csa.n_stream;
+
+    // raw inputs are only checked when they are actually allocated
+    if (inp_raw->self_k_idxs && inp_raw->self_k_idxs->buffer) {
+        res &= inp_raw->self_k_idxs->ne[0] == raw_ctx->get_n_write();
+    }
+    if (inp_raw->self_kq_mask && inp_raw->self_kq_mask->buffer) {
+        res &= dsv4_can_reuse_raw_kq_mask(inp_raw->self_kq_mask, raw_ctx, params.ubatch, n_stream);
+    }
+
+    res &= dsv4_can_reuse_comp_input(inp_csa, plan_csa, params.ubatch.n_tokens, n_stream);
+    res &= dsv4_can_reuse_comp_input(inp_hca, plan_hca, params.ubatch.n_tokens, n_stream);
+    res &= dsv4_can_reuse_comp_input(inp_lid, plan_lid, params.ubatch.n_tokens, n_stream);
+
+    return res;
+}
+
void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
GGML_ASSERT(cross_kq_mask);
@@ -1351,20 +1634,24 @@ ggml_tensor * llm_graph_context::build_ffn(
switch (type_op) {
case LLM_FFN_SILU:
if (gate && type_gate == LLM_FFN_PAR) {
- // Step35: HF clamps gate (after SiLU) and up before multiplication
- if (arch == LLM_ARCH_STEP35 && il >= 0) {
+ if (il >= 0) {
const float limit = hparams.swiglu_clamp_shexp[il];
constexpr float eps = 1e-6f;
if (limit > eps) {
- ggml_tensor * gate_act = ggml_silu(ctx0, cur);
- cb(gate_act, "ffn_silu", il);
- gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
- cb(gate_act, "ffn_silu_clamped", il);
-
tmp = ggml_clamp(ctx0, tmp, -limit, limit);
cb(tmp, "ffn_up_clamped", il);
- cur = ggml_mul(ctx0, gate_act, tmp);
+ if (arch == LLM_ARCH_DEEPSEEK4) {
+ cur = ggml_clamp(ctx0, cur, -INFINITY, limit);
+ cb(cur, "ffn_gate_clamped", il);
+ cur = ggml_swiglu_split(ctx0, cur, tmp);
+ } else {
+ ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+ cb(gate_act, "ffn_silu", il);
+ gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
+ cb(gate_act, "ffn_silu_clamped", il);
+ cur = ggml_mul(ctx0, gate_act, tmp);
+ }
cb(cur, "ffn_swiglu_limited", il);
type_gate = LLM_FFN_SEQ;
break;
@@ -1474,7 +1761,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * gate_up_exps,
ggml_tensor * up_exps_s,
ggml_tensor * gate_exps_s,
- ggml_tensor * down_exps_s) const {
+ ggml_tensor * down_exps_s,
+ ggml_tensor * selected_experts_in) const {
return build_moe_ffn(
cur,
gate_inp, /* gate_inp_b */ nullptr,
@@ -1494,7 +1782,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
/* gate_up_exps_b */ nullptr,
up_exps_s,
gate_exps_s,
- down_exps_s
+ down_exps_s,
+ selected_experts_in
);
}
@@ -1521,7 +1810,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * gate_up_exps_b,
ggml_tensor * up_exps_s,
ggml_tensor * gate_exps_s,
- ggml_tensor * down_exps_s) const {
+ ggml_tensor * down_exps_s,
+ ggml_tensor * selected_experts_in) const {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
@@ -1530,6 +1820,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
if (probs_in == nullptr) {
logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
+ if (gating_op == LLAMA_EXPERT_GATING_FUNC_TYPE_SQRT_SOFTPLUS) {
+ ggml_mul_mat_set_prec(logits, GGML_PREC_F32);
+ }
cb(logits, "ffn_moe_logits", il);
} else {
logits = probs_in;
@@ -1554,6 +1847,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
{
probs = logits; // [n_expert, n_tokens]
} break;
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SQRT_SOFTPLUS:
+ {
+ probs = ggml_sqrt(ctx0, ggml_softplus(ctx0, logits)); // [n_expert, n_tokens]
+ } break;
default:
GGML_ABORT("fatal error");
}
@@ -1604,8 +1901,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
}
// select experts
- ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ ggml_tensor * selected_experts = selected_experts_in;
+ if (selected_experts == nullptr) {
+ selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ }
cb(selected_experts, "ffn_moe_topk", il);
if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
@@ -1718,20 +2018,24 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
switch (type_op) {
case LLM_FFN_SILU:
if (gate_exps) {
- // Step35: per-layer clamp for routed experts
- if (arch == LLM_ARCH_STEP35 && il >= 0) {
+ if (il >= 0) {
const float limit = hparams.swiglu_clamp_exp[il];
constexpr float eps = 1e-6f;
if (limit > eps) {
- ggml_tensor * gate_act = ggml_silu(ctx0, cur);
- cb(gate_act, "ffn_moe_silu", il);
- gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
- cb(gate_act, "ffn_moe_silu_clamped", il);
-
up = ggml_clamp(ctx0, up, -limit, limit);
cb(up, "ffn_moe_up_clamped", il);
- cur = ggml_mul(ctx0, gate_act, up);
+ if (arch == LLM_ARCH_DEEPSEEK4) {
+ cur = ggml_clamp(ctx0, cur, -INFINITY, limit);
+ cb(cur, "ffn_moe_gate_clamped", il);
+ cur = ggml_swiglu_split(ctx0, cur, up);
+ } else {
+ ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+ cb(gate_act, "ffn_moe_silu", il);
+ gate_act = ggml_clamp(ctx0, gate_act, -INFINITY, limit);
+ cb(gate_act, "ffn_moe_silu_clamped", il);
+ cur = ggml_mul(ctx0, gate_act, up);
+ }
cb(cur, "ffn_moe_swiglu_limited", il);
break;
}
@@ -2760,6 +3064,31 @@ llm_graph_input_attn_kv_iswa * llm_graph_context::build_attn_inp_kv_iswa() const
return (llm_graph_input_attn_kv_iswa *) res->add_input(std::move(inp));
}
+// Build all DSV4 graph inputs for the current ubatch: the raw-cache inputs
+// (K indices, attention mask, K rotation) and the csa, hca and lid plan inputs.
+// The input object is registered with the graph result, which takes ownership;
+// the returned pointer is a non-owning handle to it.
+llm_graph_input_dsv4 * llm_graph_context::build_inp_dsv4() const {
+    const auto * mctx_cur = static_cast<const llama_kv_cache_dsv4_context *>(mctx);
+    const auto * raw_ctx  = mctx_cur->get_raw();
+
+    auto inp_raw = std::make_unique<llm_graph_input_dsv4_raw>(cparams, raw_ctx);
+
+    // the stream count of the csa plan is used for the raw mask and all three plans
+    const int64_t n_stream = mctx_cur->get_csa_plan(ubatch).n_stream;
+
+    GGML_ASSERT(hparams.swa_type != LLAMA_SWA_TYPE_NONE && "DSV4 expects SWA raw cache");
+
+    inp_raw->self_k_idxs = raw_ctx->build_input_k_idxs(ctx0, ubatch);
+    inp_raw->self_kq_mask = dsv4_build_raw_kq_mask(ctx0, raw_ctx, ubatch, cparams, n_stream);
+    // the mask is created in its final type, so no converted copy is needed
+    inp_raw->self_kq_mask_cnv = inp_raw->self_kq_mask;
+
+    inp_raw->self_k_rot = raw_ctx->build_input_k_rot(ctx0);
+
+    auto inp = std::make_unique<llm_graph_input_dsv4>(cparams, std::move(inp_raw), mctx_cur);
+
+    dsv4_build_comp_inputs(ctx0, inp->inp_csa, mctx_cur->get_csa_plan(ubatch), "csa", n_stream);
+    dsv4_build_comp_inputs(ctx0, inp->inp_hca, mctx_cur->get_hca_plan(ubatch), "hca", n_stream);
+    dsv4_build_comp_inputs(ctx0, inp->inp_lid, mctx_cur->get_lid_plan(ubatch), "lid", n_stream);
+    inp->inp_lid.k_rot = mctx_cur->get_lid()->build_input_k_rot(ctx0);
+
+    return (llm_graph_input_dsv4 *) res->add_input(std::move(inp));
+}
+
+
ggml_tensor * llm_graph_context::build_rs(
ggml_tensor * s,
ggml_tensor * state_copy_main,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index a6e8c3985ba5..4b5b75c632ab 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -23,6 +23,8 @@ struct llama_memory_context_i;
class llama_kv_cache_context;
class llama_kv_cache_dsa_context;
+class llama_kv_cache_dsv4_raw_context;
+class llama_kv_cache_dsv4_context;
class llama_kv_cache_iswa_context;
class llama_memory_recurrent_context;
class llama_memory_hybrid_context;
@@ -459,6 +461,79 @@ class llm_graph_input_attn_kv_iswa : public llm_graph_input_i {
const llama_kv_cache_iswa_context * mctx;
};
+// DSV4 raw graph inputs are SWA-only, but their mask may be stream-shaped
+// so raw K can be concatenated with DSV4 compressed K in one attention op.
+//
+// This class does not derive from llm_graph_input_i; it is held by
+// llm_graph_input_dsv4, which forwards set_input() to it.
+class llm_graph_input_dsv4_raw {
+public:
+ // cparams is copied into the object; mctx is stored as a plain pointer
+ llm_graph_input_dsv4_raw(
+ const llama_cparams & cparams,
+ const llama_kv_cache_dsv4_raw_context * mctx) :
+ cparams(cparams),
+ mctx(mctx) {
+ }
+
+ // upload the K indices, attention mask and K rotation for the given ubatch
+ void set_input(const llama_ubatch * ubatch);
+
+ ggml_tensor * get_k_idxs() const { return self_k_idxs; }
+ // note: returns the converted mask (self_kq_mask_cnv), not self_kq_mask
+ ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+ ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
+
+ ggml_tensor * self_kq_mask = nullptr; // F32/F16 [n_kv, n_batch/n_stream, 1, n_stream]
+ ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
+
+ // K rotation input; presumably may stay nullptr when no rotation is used - TODO confirm
+ ggml_tensor * self_k_rot = nullptr;
+
+ // by-value copy of the context parameters taken at construction
+ const llama_cparams cparams;
+
+ // raw-cache memory context; re-pointed by llm_graph_input_dsv4 in set_input()/can_reuse()
+ const llama_kv_cache_dsv4_raw_context * mctx;
+};
+
+// Graph inputs for the DSV4 KV cache: the raw-cache inputs plus one comp_input
+// set for each of the csa, hca and lid plans of the memory context.
+class llm_graph_input_dsv4 : public llm_graph_input_i {
+public:
+    // input tensors of a single plan; a tensor is nullptr when the plan does not need it
+    struct comp_input {
+        ggml_tensor * state_pos              = nullptr; // I32 [n_state]
+        ggml_tensor * state_persist_src_idxs = nullptr; // I32 [n_state_persist]
+        ggml_tensor * state_persist_dst_idxs = nullptr; // I32 [n_state_persist]
+        ggml_tensor * state_read_idxs        = nullptr; // I32 [ratio*n_state_write]
+        ggml_tensor * state_write_idxs       = nullptr; // I64 [n_state_write]
+        ggml_tensor * state_write_pos        = nullptr; // I32 [n_state_write]
+
+        ggml_tensor * kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
+
+        ggml_tensor * k_rot = nullptr;
+    };
+
+    // takes ownership of inp_raw; cparams is copied, mctx is stored as a plain pointer
+    llm_graph_input_dsv4(
+            const llama_cparams & cparams,
+            std::unique_ptr<llm_graph_input_dsv4_raw> inp_raw,
+            const llama_kv_cache_dsv4_context * mctx) :
+        inp_raw(std::move(inp_raw)),
+        cparams(cparams),
+        mctx(mctx) {
+    }
+    ~llm_graph_input_dsv4() = default;
+
+    // upload the raw inputs and the csa/hca/lid plan inputs for the given ubatch
+    void set_input(const llama_ubatch * ubatch) override;
+
+    // adopt the memory context from params and report whether the tensor shapes still match
+    bool can_reuse(const llm_graph_params & params) override;
+
+    llm_graph_input_dsv4_raw * get_raw() const { return inp_raw.get(); }
+    const comp_input & get_csa() const { return inp_csa; }
+    const comp_input & get_hca() const { return inp_hca; }
+    const comp_input & get_lid() const { return inp_lid; }
+
+    // owned raw-cache inputs
+    std::unique_ptr<llm_graph_input_dsv4_raw> inp_raw;
+
+    comp_input inp_csa;
+    comp_input inp_hca;
+    comp_input inp_lid;
+
+    // by-value copy of the context parameters taken at construction
+    const llama_cparams cparams;
+
+    // DSV4 memory context; replaced in can_reuse()
+    const llama_kv_cache_dsv4_context * mctx;
+};
+
class llm_graph_input_attn_cross : public llm_graph_input_i {
public:
llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
@@ -920,7 +995,8 @@ struct llm_graph_context {
ggml_tensor * gate_up_exps = nullptr,
ggml_tensor * up_exps_s = nullptr,
ggml_tensor * gate_exps_s = nullptr,
- ggml_tensor * down_exps_s = nullptr) const;
+ ggml_tensor * down_exps_s = nullptr,
+ ggml_tensor * selected_experts_in = nullptr) const;
ggml_tensor * build_moe_ffn(
ggml_tensor * cur,
@@ -945,7 +1021,8 @@ struct llm_graph_context {
ggml_tensor * gate_up_exps_b = nullptr,
ggml_tensor * up_exps_s = nullptr,
ggml_tensor * gate_exps_s = nullptr,
- ggml_tensor * down_exps_s = nullptr) const;
+ ggml_tensor * down_exps_s = nullptr,
+ ggml_tensor * selected_experts_in = nullptr) const;
//
// inputs
@@ -1045,6 +1122,8 @@ struct llm_graph_context {
llm_graph_input_attn_kv_iswa * build_attn_inp_kv_iswa() const;
+ llm_graph_input_dsv4 * build_inp_dsv4() const;
+
// note: if k_cur or v_cur are not provided, they will not be stored in the memory
ggml_tensor * build_attn(
llm_graph_input_attn_kv_iswa * inp,
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 2eadeb214811..8be5f28f39e6 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -14,6 +14,7 @@ enum llama_expert_gating_func_type {
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SQRT_SOFTPLUS = 4,
};
enum llama_swa_type {
@@ -226,6 +227,16 @@ struct llama_hparams {
uint32_t indexer_head_size = 0;
uint32_t indexer_top_k = 0;
+ // DeepSeek-V4
+ uint32_t dsv4_o_group_count = 0;
+ uint32_t dsv4_o_lora_rank = 0;
+ uint32_t dsv4_hc_mult = 0;
+ uint32_t dsv4_hc_sinkhorn_iters = 0;
+ uint32_t dsv4_hash_layer_count = 0;
+ float dsv4_compress_rope_base = 0.0f;
+ float dsv4_hc_eps = 0.0f;
+ std::array dsv4_compress_ratios;
+
// qwen3vl deepstack
// When parsed from GGUF, this implies the first N layers consume the first
// N deepstack embeddings. Use deepstack_mapping_arr if you need a more
diff --git a/src/llama-kv-cache-dsv4.cpp b/src/llama-kv-cache-dsv4.cpp
new file mode 100644
index 000000000000..dfb2fc2620a8
--- /dev/null
+++ b/src/llama-kv-cache-dsv4.cpp
@@ -0,0 +1,1841 @@
+#include "llama-kv-cache-dsv4.h"
+
+#include "ggml-backend.h"
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-io.h"
+#include "llama-model.h"
+
+#include
+#include
+#include
+#include
+#include
+#include