diff --git a/common/chat.cpp b/common/chat.cpp
index 70b9f5dc2c58..38f7a2ed744a 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1661,6 +1661,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
static common_chat_params common_chat_params_init_deepseek_v3_2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
+ const auto & src = tmpl.source();
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
@@ -1681,8 +1682,9 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
const std::string DSML = "|DSML|";
const std::string THINK_START = "";
const std::string THINK_END = "";
- const std::string FC_START = "<" + DSML + "function_calls>";
- const std::string FC_END = "" + DSML + "function_calls>";
+ const std::string FC_NAME = src.find("function_calls") != std::string::npos ? "function_calls" : "tool_calls";
+ const std::string FC_START = "<" + DSML + FC_NAME + ">";
+ const std::string FC_END = "" + DSML + FC_NAME + ">";
const std::string INVOKE_START = "<" + DSML + "invoke";
const std::string INVOKE_END = "" + DSML + "invoke>";
const std::string PARAM_START = "<" + DSML + "parameter";
@@ -2093,12 +2095,12 @@ std::optional common_chat_try_specialized_template(
return common_chat_params_init_gigachat_v3(tmpl, params);
}
- // DeepSeek V3.2 format detection: template defines dsml_token and uses it for tool calls.
+ // DeepSeek DSML format detection: template defines dsml_token and uses it for tool calls.
// The template source contains the token as a variable assignment, not as a literal in markup.
if (src.find("dsml_token") != std::string::npos &&
- src.find("function_calls") != std::string::npos &&
+ (src.find("function_calls") != std::string::npos || src.find("tool_calls") != std::string::npos) &&
src.find("DSML") != std::string::npos) {
- LOG_DBG("Using specialized template: DeepSeek V3.2\n");
+ LOG_DBG("Using specialized template: DeepSeek DSML\n");
return common_chat_params_init_deepseek_v3_2(tmpl, params);
}
diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2c38123dff8d..bba37a5cbbc7 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -47,6 +47,7 @@
"DeepseekForCausalLM": "deepseek",
"DeepseekV2ForCausalLM": "deepseek",
"DeepseekV3ForCausalLM": "deepseek",
+ "DeepseekV4ForCausalLM": "deepseek",
"DistilBertForMaskedLM": "bert",
"DistilBertForSequenceClassification": "bert",
"DistilBertModel": "bert",
diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index e149fcbf752e..86a3046b9e98 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -1,18 +1,26 @@
from __future__ import annotations
+import concurrent.futures
+import ctypes
+import math
+import os
import re
-from typing import Any, Callable, Iterable, TYPE_CHECKING
+from pathlib import Path
+from typing import Any, Callable, Iterable, Sequence, TYPE_CHECKING
+import numpy as np
import torch
if TYPE_CHECKING:
from torch import Tensor
-from .base import MmprojModel, ModelBase, TextModel, gguf, logger
+from .base import LazyTorchTensor, MmprojModel, ModelBase, TextModel, gguf, logger
from .qwen import QwenModel
+TORCH_FLOAT8_E8M0FNU = getattr(torch, "float8_e8m0fnu", None)
+
@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
@@ -386,3 +394,648 @@ def prepare_tensors(self):
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("DeepseekV4ForCausalLM")
+class DeepseekV4Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK4
+
+ # Optional DeepSeek V4 debug / expert-quant knobs. In the pre-#17114
+ # monolithic convert_hf_to_gguf.py these were ModelBase.__init__ params
+ # wired to --deepseek4-* CLI flags. The refactored conversion/base.py
+ # ModelBase.__init__ does not accept them, so they default here; the
+ # standard DeepseekV4ForCausalLM conversion path does not require them.
+ # deepseek4_max_layers: when not None, __init__ truncates block_count to
+ # this value and tensors of later layers are skipped.
+ deepseek4_max_layers: int | None = None
+ # deepseek4_expert_outtypes: comma-separated spec parsed by
+ # _parse_expert_outtype_spec(); each item is either "<qtype>" (applies to
+ # w1, w2 and w3) or "<selector>=<qtype>".
+ deepseek4_expert_outtypes: str | None = None
+ # deepseek4_expert_workers: number of threads used when converting routed
+ # experts; values below 1 are treated as 1.
+ deepseek4_expert_workers: int = 1
+
+ # Per-layer staging dicts for routed expert tensors, created lazily by
+ # modify_tensors().
+ _experts: list[dict[str, Tensor]] | None = None
+
+ # Lookup table indexed by a 4-bit code; entries 8..15 are the negated
+ # counterparts of entries 0..7.
+ _fp4_table = torch.tensor([
+ 0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
+ 0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,
+ ], dtype=torch.float32)
+
+ # Lower-case names accepted by _parse_qtype_name() for expert out-types.
+ _qtype_aliases: dict[str, gguf.GGMLQuantizationType] = {
+ "q8_0": gguf.GGMLQuantizationType.Q8_0,
+ "q2_k": gguf.GGMLQuantizationType.Q2_K,
+ "iq2_xxs": gguf.GGMLQuantizationType.IQ2_XXS,
+ "iq2_xs": gguf.GGMLQuantizationType.IQ2_XS,
+ "tq1_0": gguf.GGMLQuantizationType.TQ1_0,
+ "tq2_0": gguf.GGMLQuantizationType.TQ2_0,
+ }
+
+    def __init__(self, *args, **kwargs):
+        """Initialise the converter and apply the optional layer cap.
+
+        All arguments are forwarded unchanged to the parent constructor. When
+        ``deepseek4_max_layers`` is set, the block count, the matching hparams
+        entries and the tensor-name map are rebuilt for the truncated model.
+        The expert out-type spec is parsed last.
+
+        Raises:
+            ValueError: if the layer cap is non-positive or larger than the
+                model's layer count, or if the expert out-type spec is invalid.
+        """
+        super().__init__(*args, **kwargs)
+
+        total_layers = self.block_count
+        self._deepseek4_original_block_count = total_layers
+
+        layer_cap = self.deepseek4_max_layers
+        if layer_cap is not None:
+            if layer_cap <= 0:
+                raise ValueError("--deepseek4-max-layers must be positive")
+            if layer_cap > total_layers:
+                raise ValueError(
+                    f"--deepseek4-max-layers={layer_cap} exceeds model layer count {total_layers}"
+                )
+
+            self.block_count = layer_cap
+            # Both spellings of the layer-count hyperparameter are kept in sync.
+            for hparam_key in ("num_hidden_layers", "n_layers"):
+                self.hparams[hparam_key] = self.block_count
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+            logger.warning(
+                "DeepSeek V4 debug export: writing only the first %d/%d transformer layers",
+                self.block_count,
+                self._deepseek4_original_block_count,
+            )
+
+        self._deepseek4_expert_qtypes = self._parse_expert_outtype_spec(self.deepseek4_expert_outtypes)
+
+    def set_vocab(self):
+        """Write the vocabulary via the shared GPT-2 style helper."""
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        """Write the DeepSeek V4 hyperparameters to the GGUF header.
+
+        ``num_key_value_heads`` defaults to 1 when the checkpoint config does
+        not define it, then the parent writes its common keys and this method
+        adds the attention, MoE, indexer and hyper-connection keys. Keys that
+        are read with ``hparams[...]`` are mandatory (``KeyError`` otherwise);
+        keys read with ``.get`` are written only when present.
+        """
+        self.hparams.setdefault("num_key_value_heads", 1)
+
+        super().set_gguf_parameters()
+        cfg = self.hparams
+        writer = self.gguf_writer
+
+        # Attention geometry.
+        writer.add_vocab_size(cfg["vocab_size"])
+        writer.add_rope_dimension_count(cfg["qk_rope_head_dim"])
+        writer.add_q_lora_rank(cfg["q_lora_rank"])
+        writer.add_attention_output_lora_rank(cfg["o_lora_rank"])
+        writer.add_attention_output_group_count(cfg["o_groups"])
+        writer.add_attention_compress_ratios(cfg["compress_ratios"])
+        writer.add_attention_compress_rope_freq_base(cfg["compress_rope_theta"])
+
+        # Mixture-of-experts configuration.
+        writer.add_expert_feed_forward_length(cfg["moe_intermediate_size"])
+        writer.add_expert_count(cfg["n_routed_experts"])
+        writer.add_expert_shared_count(cfg["n_shared_experts"])
+        writer.add_expert_weights_scale(cfg.get("routed_scaling_factor", 1.0))
+        # The hash-layer count can never exceed the (possibly truncated) layer count.
+        writer.add_hash_layer_count(min(cfg["num_hash_layers"], self.block_count))
+
+        norm_topk_prob = cfg.get("norm_topk_prob")
+        if norm_topk_prob is not None:
+            writer.add_expert_weights_norm(norm_topk_prob)
+
+        swiglu_limit = cfg.get("swiglu_limit")
+        if swiglu_limit is not None and float(swiglu_limit) > 0.0:
+            # One clamp value per layer.
+            writer.add_swiglu_clamp_exp([float(swiglu_limit)] * self.block_count)
+
+        sliding_window = cfg.get("sliding_window")
+        if sliding_window is not None:
+            writer.add_sliding_window(sliding_window)
+
+        # Indexer configuration.
+        writer.add_indexer_head_count(cfg["index_n_heads"])
+        writer.add_indexer_key_length(cfg["index_head_dim"])
+        writer.add_indexer_top_k(cfg["index_topk"])
+
+        # The next-N prediction layer count is only recorded for full exports.
+        if self.deepseek4_max_layers is None:
+            nextn_layers = cfg.get("num_nextn_predict_layers")
+            if nextn_layers is not None:
+                writer.add_nextn_predict_layers(nextn_layers)
+
+        # Hyper-connection configuration.
+        writer.add_hyper_connection_count(cfg["hc_mult"])
+        writer.add_hyper_connection_sinkhorn_iters(cfg["hc_sinkhorn_iters"])
+        writer.add_hyper_connection_eps(cfg["hc_eps"])
+
+    @staticmethod
+    def _strip_model_prefix(name: str) -> str:
+        """Return ``name`` without a leading ``model.`` component, if present."""
+        prefix = "model."
+        if name.startswith(prefix):
+            return name[len(prefix):]
+        return name
+
+    def _skip_layer_tensor(self, stripped_name: str) -> bool:
+        """Tell whether a tensor belongs to a layer removed by the layer cap.
+
+        Args:
+            stripped_name: tensor name with the ``model.`` prefix already removed.
+
+        Returns:
+            True only when a layer cap is active and the name starts with
+            ``layers.<i>.`` where ``i`` is at or beyond ``block_count``.
+        """
+        if self.deepseek4_max_layers is None:
+            return False
+        layer_match = re.match(r"layers\.(\d+)\.", stripped_name)
+        if layer_match is None:
+            return False
+        return int(layer_match.group(1)) >= self.block_count
+
+    @staticmethod
+    def _is_low_bit_ftype(ftype: gguf.LlamaFileType) -> bool:
+        """Return True when ``ftype`` is one of the ternary / 2-bit file types listed here."""
+        file_type = gguf.LlamaFileType
+        low_bit_ftypes = (
+            file_type.MOSTLY_TQ1_0,
+            file_type.MOSTLY_TQ2_0,
+            file_type.MOSTLY_Q2_K,
+            file_type.MOSTLY_IQ2_XXS,
+            file_type.MOSTLY_IQ2_XS,
+        )
+        return ftype in low_bit_ftypes
+
+    @staticmethod
+    def _qtype_for_ftype(ftype: gguf.LlamaFileType) -> gguf.GGMLQuantizationType | None:
+        """Map an output file type to the quantization type used for routed experts.
+
+        Returns:
+            The matching quantization type, or None when ``ftype`` has no entry.
+        """
+        file_type = gguf.LlamaFileType
+        quant_type = gguf.GGMLQuantizationType
+        ftype_to_qtype = {
+            file_type.MOSTLY_TQ1_0: quant_type.TQ1_0,
+            file_type.MOSTLY_TQ2_0: quant_type.TQ2_0,
+            file_type.MOSTLY_Q2_K: quant_type.Q2_K,
+            file_type.MOSTLY_IQ2_XXS: quant_type.IQ2_XXS,
+            file_type.MOSTLY_IQ2_XS: quant_type.IQ2_XS,
+            file_type.MOSTLY_Q8_0: quant_type.Q8_0,
+        }
+        return ftype_to_qtype.get(ftype)
+
+    @classmethod
+    def _parse_qtype_name(cls, name: str) -> gguf.GGMLQuantizationType:
+        """Resolve a user-supplied expert out-type name.
+
+        The lookup ignores case and surrounding whitespace.
+
+        Raises:
+            ValueError: if the name is not a key of ``_qtype_aliases``.
+        """
+        alias = name.strip().lower()
+        if alias in cls._qtype_aliases:
+            return cls._qtype_aliases[alias]
+
+        allowed = ", ".join(sorted(cls._qtype_aliases))
+        raise ValueError(f"unknown DeepSeek V4 expert outtype {name!r}; expected one of: {allowed}")
+
+    @classmethod
+    def _parse_expert_outtype_spec(cls, spec: str | None) -> dict[str, gguf.GGMLQuantizationType]:
+        """Parse the comma-separated expert out-type specification.
+
+        Each non-empty item is either a bare type name, which applies to
+        ``w1``, ``w2`` and ``w3``, or ``selector=type`` where the selector is
+        one of ``w1``/``w2``/``w3`` or the aliases ``gate``/``down``/``up``.
+        Later items override earlier ones.
+
+        Returns:
+            A mapping from ``w1``/``w2``/``w3`` to the chosen quantization
+            type; empty when ``spec`` is None.
+
+        Raises:
+            ValueError: on an unknown selector or type name.
+        """
+        if spec is None:
+            return {}
+
+        weight_ids = ("w1", "w2", "w3")
+        selector_alias = {"gate": "w1", "down": "w2", "up": "w3"}
+        chosen: dict[str, gguf.GGMLQuantizationType] = {}
+
+        for raw_item in spec.split(","):
+            entry = raw_item.strip()
+            if not entry:
+                continue
+
+            if "=" not in entry:
+                # A bare type name applies to all three expert matrices.
+                qtype = cls._parse_qtype_name(entry)
+                for wid in weight_ids:
+                    chosen[wid] = qtype
+                continue
+
+            selector, _, type_name = entry.partition("=")
+            selector = selector.strip().lower()
+            type_name = type_name.strip().lower()
+            if selector not in weight_ids and selector not in selector_alias:
+                raise ValueError(f"unknown DeepSeek V4 expert tensor selector {selector!r}")
+            chosen[selector_alias.get(selector, selector)] = cls._parse_qtype_name(type_name)
+
+        return chosen
+
+    @staticmethod
+    def _scale_to_float(scale: Tensor) -> Tensor:
+        """Convert a block-scale tensor to float32.
+
+        uint8/int8 tensors are treated as raw exponent bytes: byte ``e`` is
+        shifted into the float32 exponent field, and ``e == 0`` is mapped to
+        the bit pattern ``0x00400000``. Every other dtype, including
+        ``torch.float8_e8m0fnu`` when this torch build provides it, is
+        converted with ``Tensor.float()``.
+        """
+        if scale.dtype not in (torch.uint8, torch.int8):
+            return scale.float()
+
+        exponent = scale.view(torch.uint8).to(torch.int32)
+        zero_exponent_bits = torch.full_like(exponent, 0x00400000)
+        float_bits = torch.where(exponent == 0, zero_exponent_bits, exponent << 23)
+        return float_bits.view(torch.float32)
+
+    @staticmethod
+    def _scale_to_e8m0_bytes(scale: Tensor) -> Tensor:
+        """Return the block scales as uint8 exponent bytes.
+
+        Tensors that are already byte-sized (uint8, int8 or, when available,
+        ``torch.float8_e8m0fnu``) are reinterpreted in place. Other tensors
+        are converted numerically: positive values become
+        ``floor(log2(x)) + 127`` clamped to ``[0, 255]``, and non-positive
+        values become 0.
+        """
+        byte_dtypes = [torch.uint8, torch.int8]
+        if TORCH_FLOAT8_E8M0FNU is not None:
+            byte_dtypes.append(TORCH_FLOAT8_E8M0FNU)
+        if scale.dtype in byte_dtypes:
+            return scale.view(torch.uint8)
+
+        as_float = scale.float()
+        biased_exponent = torch.floor(torch.log2(as_float)).to(torch.int32) + 127
+        no_exponent = torch.zeros_like(as_float, dtype=torch.int32)
+        exponent = torch.where(as_float > 0, biased_exponent, no_exponent)
+        return torch.clamp(exponent, 0, 255).to(torch.uint8)
+
+    @classmethod
+    def _dequant_fp8_weight(cls, weight: Tensor, scale: Tensor, block_size: Sequence[int]) -> Tensor:
+        """Dequantize a block-scaled 2D weight to float32.
+
+        Args:
+            weight: 2D tensor of shape ``(out_dim, in_dim)``.
+            scale: one scale per block, of shape
+                ``(out_dim // block_out, in_dim // block_in)``.
+            block_size: ``(block_out, block_in)``.
+
+        Returns:
+            A float32 tensor of the same shape as ``weight`` in which every
+            element is multiplied by the scale of its block.
+
+        Raises:
+            ValueError: if ``block_size`` is not 2D, the weight shape is not a
+                multiple of it, or the scale shape does not match.
+        """
+        if len(block_size) != 2:
+            raise ValueError(f"DeepSeek V4 expects 2D FP8 block scales, got block size {block_size}")
+
+        block_rows, block_cols = block_size
+        rows, cols = weight.shape
+        if rows % block_rows != 0 or cols % block_cols != 0:
+            raise ValueError(f"FP8 tensor shape {tuple(weight.shape)} is not divisible by block size {block_size}")
+
+        scale_f32 = cls._scale_to_float(scale)
+        grid = (rows // block_rows, cols // block_cols)
+        if tuple(scale_f32.shape) != grid:
+            raise ValueError(f"FP8 scale shape {tuple(scale_f32.shape)} does not match expected {grid}")
+
+        # Expose the block grid as separate axes so the scales broadcast per block.
+        blocked = weight.reshape(grid[0], block_rows, grid[1], block_cols)
+        scaled = blocked.float() * scale_f32[:, None, :, None]
+        return scaled.reshape(rows, cols)
+
+    @classmethod
+    def _dequant_fp4_weight(cls, weight: Tensor, scale: Tensor) -> Tensor:
+        """Dequantize a packed FP4 weight to float32.
+
+        Each byte of ``weight`` holds two 4-bit codes: the low nibble is the
+        earlier value and the high nibble the later one. Codes are looked up
+        in ``_fp4_table`` and multiplied by one scale per block of 32 values.
+
+        Args:
+            weight: 2D tensor of shape ``(out_dim, in_dim // 2)``, viewed as uint8.
+            scale: tensor of shape ``(out_dim, in_dim // 32)``.
+
+        Raises:
+            ValueError: if the unpacked row length is not a multiple of 32 or
+                the scale shape does not match.
+        """
+        packed = weight.view(torch.uint8)
+        rows, packed_cols = packed.shape
+        cols = packed_cols * 2
+        if cols % 32 != 0:
+            raise ValueError(f"FP4 packed tensor shape {tuple(packed.shape)} does not contain 32-value blocks")
+
+        blocks_per_row = cols // 32
+        scale_f32 = cls._scale_to_float(scale)
+        expected = (rows, blocks_per_row)
+        if tuple(scale_f32.shape) != expected:
+            raise ValueError(f"FP4 scale shape {tuple(scale_f32.shape)} does not match expected {expected}")
+
+        lookup = cls._fp4_table.to(packed.device)
+        block_bytes = packed.reshape(rows, blocks_per_row, 16)
+        low_nibble = block_bytes & 0x0F
+        high_nibble = (block_bytes >> 4) & 0x0F
+        # Interleave low/high so the 32 codes of a block are in element order.
+        codes = torch.stack((low_nibble, high_nibble), dim=-1).reshape(rows, blocks_per_row, 32)
+        values = lookup[codes.long()] * scale_f32.unsqueeze(-1)
+        return values.reshape(rows, cols)
+
+    @classmethod
+    def _pack_fp4_as_mxfp4(cls, weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
+        """Repack a packed FP4 weight into 17-byte blocks without dequantizing.
+
+        For every block of 32 codes the output holds one exponent byte
+        followed by 16 bytes in which byte ``j`` carries code ``j`` in its low
+        nibble and code ``j + 16`` in its high nibble.
+
+        Returns:
+            ``(raw, shape)`` where ``raw`` is a uint8 array of shape
+            ``(out_dim, n_blocks * 17)`` and ``shape`` is the logical
+            ``[out_dim, in_dim]``.
+
+        Raises:
+            ValueError: if the unpacked row length is not a multiple of 32 or
+                the scale shape does not match.
+        """
+        packed = weight.view(torch.uint8)
+        rows, packed_cols = packed.shape
+        cols = packed_cols * 2
+        if cols % 32 != 0:
+            raise ValueError(f"FP4 packed tensor shape {tuple(packed.shape)} does not contain 32-value blocks")
+
+        blocks_per_row = cols // 32
+        exponent_bytes = cls._scale_to_e8m0_bytes(scale)
+        expected = (rows, blocks_per_row)
+        if tuple(exponent_bytes.shape) != expected:
+            raise ValueError(f"FP4 scale shape {tuple(exponent_bytes.shape)} does not match expected {expected}")
+
+        block_bytes = packed.reshape(rows, blocks_per_row, 16)
+        low_nibble = block_bytes & 0x0F
+        high_nibble = (block_bytes >> 4) & 0x0F
+        # First restore element order, then pair element j with element j + 16.
+        codes = torch.stack((low_nibble, high_nibble), dim=-1).reshape(rows, blocks_per_row, 32)
+        first_half = codes[:, :, :16]
+        second_half = codes[:, :, 16:]
+        repacked = first_half | (second_half << 4)
+        blocks = torch.cat((exponent_bytes.unsqueeze(-1), repacked), dim=-1)
+        return blocks.reshape(rows, blocks_per_row * 17).numpy(), [rows, cols]
+
+    _ggml_quant_lib: Any = None
+
+    @classmethod
+    def _load_ggml_quant_lib(cls):
+        """Load (once) the ggml shared library used for native quantization.
+
+        The first existing file among these candidates is loaded with
+        ``ctypes.CDLL``: the path in ``LLAMA_CPP_LIBGGML``, then
+        ``build/bin/{libggml.dylib,libggml.so,ggml.dll}`` under the directory
+        two levels above this file, then the same names one level above it.
+        The handle is cached on the class.
+
+        Raises:
+            RuntimeError: if none of the candidates is an existing file.
+        """
+        cached = cls._ggml_quant_lib
+        if cached is not None:
+            return cached
+
+        # This module lives in the conversion/ package; the repo root (where
+        # build/bin/libggml.* lands) is its parent's parent. In the pre-#17114
+        # monolithic convert_hf_to_gguf.py, __file__ was the repo-root script,
+        # so .parent alone was the repo root -- search both so the lookup is
+        # correct regardless of package layout.
+        this_file = Path(__file__).resolve()
+        search_roots = (this_file.parent.parent, this_file.parent)
+        library_names = ("libggml.dylib", "libggml.so", "ggml.dll")
+
+        candidates: list[Any] = [os.environ.get("LLAMA_CPP_LIBGGML")]
+        candidates.extend(
+            root / "build" / "bin" / library_name
+            for root in search_roots
+            for library_name in library_names
+        )
+
+        float_ptr = ctypes.POINTER(ctypes.c_float)
+        for candidate in candidates:
+            if candidate is None:
+                continue
+            path = Path(candidate)
+            if not path.is_file():
+                continue
+
+            lib = ctypes.CDLL(str(path))
+            # Declare the prototypes so ctypes converts arguments and results correctly.
+            lib.ggml_quantize_chunk.restype = ctypes.c_size_t
+            lib.ggml_quantize_chunk.argtypes = (
+                ctypes.c_int,
+                float_ptr,
+                ctypes.c_void_p,
+                ctypes.c_int64,
+                ctypes.c_int64,
+                ctypes.c_int64,
+                float_ptr,
+            )
+            lib.ggml_quantize_requires_imatrix.restype = ctypes.c_bool
+            lib.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,)
+            cls._ggml_quant_lib = lib
+            return lib
+
+        raise RuntimeError(
+            "DeepSeek V4 low-bit expert conversion needs llama.cpp's libggml. "
+            "Build llama.cpp first or set LLAMA_CPP_LIBGGML to libggml."
+        )
+
+    @classmethod
+    def _quantize_deepseek4_expert(cls, data: np.ndarray, qtype: gguf.GGMLQuantizationType) -> np.ndarray:
+        """Quantize one expert matrix to ``qtype``.
+
+        Q2_K, IQ2_XXS and IQ2_XS go through ``ggml_quantize_chunk`` of the
+        native library; every other type is delegated to
+        ``gguf.quants.quantize``. When the native library reports that the
+        type requires an importance matrix, a stand-in is built from the
+        per-column sum of squared weights.
+
+        Returns:
+            The quantized data (a uint8 byte array on the native path).
+
+        Raises:
+            RuntimeError: if the native call reports a byte count different
+                from the size of the output buffer.
+        """
+        quant_type = gguf.GGMLQuantizationType
+        if qtype not in (quant_type.Q2_K, quant_type.IQ2_XXS, quant_type.IQ2_XS):
+            return gguf.quants.quantize(data, qtype)
+
+        float_ptr = ctypes.POINTER(ctypes.c_float)
+        source = np.ascontiguousarray(data, dtype=np.float32)
+        target = np.zeros(gguf.quant_shape_to_byte_shape(source.shape, qtype), dtype=np.uint8, order="C")
+        lib = cls._load_ggml_quant_lib()
+        row_count = math.prod(source.shape[:-1])
+        row_length = source.shape[-1]
+
+        # Null pointer unless the quantizer insists on an importance matrix.
+        importance = ctypes.cast(0, float_ptr)
+        if lib.ggml_quantize_requires_imatrix(qtype.value):
+            # Keep the array bound to a local so its buffer outlives the call below.
+            column_energy = np.ascontiguousarray(
+                np.sum(source.reshape(-1, row_length) ** 2, axis=0),
+                dtype=np.float32,
+            )
+            importance = column_energy.ctypes.data_as(float_ptr)
+
+        bytes_written = lib.ggml_quantize_chunk(
+            qtype.value,
+            source.ctypes.data_as(float_ptr),
+            target.ctypes.data_as(ctypes.c_void_p),
+            0,
+            row_count,
+            row_length,
+            importance,
+        )
+        if bytes_written != target.size:
+            raise RuntimeError(f"ggml_quantize_chunk wrote {bytes_written} bytes, expected {target.size}")
+        return target
+
+    def _write_deepseek4_tid2eid_tensors(self) -> set[str]:
+        """Write every ``layers.<i>.ffn.gate.tid2eid`` tensor as int32.
+
+        Returns:
+            The original names of the tensors handled here: the written
+            tid2eid tensors plus every tensor of a layer excluded by the
+            layer cap.
+        """
+        tid2eid_pattern = re.compile(r"layers\.\d+\.ffn\.gate\.tid2eid$")
+        handled: set[str] = set()
+
+        for tensor_name in list(self.model_tensors.keys()):
+            local_name = self._strip_model_prefix(tensor_name)
+            if self._skip_layer_tensor(local_name):
+                handled.add(tensor_name)
+                continue
+            if tid2eid_pattern.match(local_name) is None:
+                continue
+
+            table = LazyTorchTensor.to_eager(self.model_tensors[tensor_name]()).to(torch.int32).numpy()
+            gguf_name = self.map_tensor_name(local_name)
+            dims = ", ".join(str(n) for n in reversed(table.shape))
+            logger.info(f"{gguf_name}, int32 --> I32, shape = {{{dims}}}")
+            self.gguf_writer.add_tensor(gguf_name, table)
+            handled.add(tensor_name)
+
+        return handled
+
+ def _write_deepseek4_expert_tensors(self) -> set[str]:
+ # Convert the routed expert weights (layers.<b>.ffn.experts.<x>.w[123].weight
+ # together with their .scale tensors) and write one merged tensor per
+ # (layer, w1/w2/w3) directly to the GGUF writer.
+ #
+ # Returns the original names of all tensors handled here (weights, scales
+ # and tensors of layers excluded by the layer cap) so the caller can drop
+ # them from model_tensors.
+ #
+ # Raises NotImplementedError when routed experts exist but neither the
+ # file type nor an override selects a quantization type, ValueError when a
+ # scale or an expert is missing, and RuntimeError when no type is selected
+ # for a particular w1/w2/w3.
+ default_qtype = self._qtype_for_ftype(self.ftype)
+ if default_qtype is None and not self._deepseek4_expert_qtypes:
+ if any(re.match(r"(?:model\.)?layers\.\d+\.ffn\.experts\.\d+\.w[123]\.weight$", name) for name in self.model_tensors):
+ raise NotImplementedError(
+ "DeepSeek V4 routed FP4 experts must be converted directly to a compact GGUF type. "
+ "Use --outtype iq2_xxs, iq2_xs, q2_k, tq2_0, tq1_0, or q8_0."
+ )
+ return set()
+
+ n_experts = self.hparams["n_routed_experts"]
+ consumed: set[str] = set()
+ # (layer index, "w1"/"w2"/"w3") -> {expert index: (weight name, scale name)}
+ groups: dict[tuple[int, str], dict[int, tuple[str, str]]] = {}
+
+ # Pass 1: collect the weight/scale pairs; nothing is loaded yet.
+ for name in list(self.model_tensors.keys()):
+ stripped = self._strip_model_prefix(name)
+ if self._skip_layer_tensor(stripped):
+ consumed.add(name)
+ continue
+ match = re.match(r"layers\.(\d+)\.ffn\.experts\.(\d+)\.(w[123])\.weight$", stripped)
+ if match is None:
+ continue
+
+ bid = int(match.group(1))
+ xid = int(match.group(2))
+ wid = match.group(3)
+ # Fail early if this weight kind has neither an override nor a default type.
+ qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype)
+ if qtype is None:
+ raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}")
+ scale_name = f"{stripped.removesuffix('.weight')}.scale"
+ # The scale may be stored with or without the "model." prefix.
+ model_scale_name = scale_name if scale_name in self.model_tensors else f"model.{scale_name}"
+ if model_scale_name not in self.model_tensors:
+ raise ValueError(f"Missing DeepSeek V4 FP4 scale tensor for {stripped}")
+
+ groups.setdefault((bid, wid), {})[xid] = (name, model_scale_name)
+ consumed.update((name, model_scale_name))
+
+ # Load one expert's weight and scale and convert it to qtype.
+ def convert_one(name: str, model_scale_name: str, qtype: gguf.GGMLQuantizationType) -> np.ndarray:
+ weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
+ scale = LazyTorchTensor.to_eager(self.model_tensors[model_scale_name]())
+
+ # MXFP4 is repacked bit-for-bit; every other type goes through float32.
+ if qtype == gguf.GGMLQuantizationType.MXFP4:
+ data, _ = self._pack_fp4_as_mxfp4(weight, scale)
+ return data
+
+ data = self._dequant_fp4_weight(weight, scale).numpy()
+ return self._quantize_deepseek4_expert(data, qtype)
+
+ # Stack the converted experts of one (layer, wid) along a new leading axis
+ # and write the result.
+ def add_merged_tensor(bid: int, wid: str, qtype: gguf.GGMLQuantizationType, experts: dict[int, np.ndarray]) -> None:
+ missing = sorted(set(range(n_experts)).difference(experts))
+ if missing:
+ raise ValueError(f"Missing DeepSeek V4 expert tensors for layer {bid} {wid}: {missing[:8]}")
+
+ merged = np.stack([experts[i] for i in range(n_experts)], axis=0)
+ merged_name = f"layers.{bid}.ffn.experts.{wid}.weight"
+ new_name = self.map_tensor_name(merged_name)
+ # uint8 data is a quantized byte layout; recover the logical shape for the log line.
+ shape = gguf.quant_shape_from_byte_shape(merged.shape, qtype) if merged.dtype == np.uint8 else merged.shape
+ shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
+ logger.info(f"{new_name}, DeepSeek FP4 --> {qtype.name}, shape = {shape_str}")
+ self.gguf_writer.add_tensor(new_name, merged, raw_dtype=qtype)
+
+ # Pass 2: convert group by group, in sorted (layer, wid) order.
+ worker_count = max(1, self.deepseek4_expert_workers)
+ for bid, wid in sorted(groups):
+ qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype)
+ if qtype is None:
+ raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}")
+ group = groups[(bid, wid)]
+ experts: dict[int, np.ndarray] = {}
+ logger.info(
+ "DeepSeek V4: quantizing blk.%d %s experts to %s with %d worker%s",
+ bid,
+ wid,
+ qtype.name,
+ worker_count,
+ "" if worker_count == 1 else "s",
+ )
+
+ if worker_count == 1:
+ # Sequential path: convert the experts in index order.
+ for done, xid in enumerate(sorted(group), start=1):
+ name, model_scale_name = group[xid]
+ experts[xid] = convert_one(name, model_scale_name, qtype)
+ if done % 32 == 0 or done == n_experts:
+ logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts)
+ else:
+ # Threaded path: at most max_pending conversions are submitted at a time.
+ max_pending = worker_count * 2
+ pending: dict[concurrent.futures.Future[np.ndarray], int] = {}
+ xids = iter(sorted(group))
+ done = 0
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
+ # Submit the next expert, if any; returns False once the iterator is exhausted.
+ def submit_next() -> bool:
+ try:
+ xid = next(xids)
+ except StopIteration:
+ return False
+ name, model_scale_name = group[xid]
+ future = executor.submit(convert_one, name, model_scale_name, qtype)
+ pending[future] = xid
+ return True
+
+ # Prime the queue.
+ while len(pending) < max_pending and submit_next():
+ pass
+
+ # Collect finished conversions and refill one slot per completed future.
+ while pending:
+ finished, _ = concurrent.futures.wait(
+ pending,
+ return_when=concurrent.futures.FIRST_COMPLETED,
+ )
+ for future in finished:
+ xid = pending.pop(future)
+ # result() re-raises any exception raised inside the worker.
+ experts[xid] = future.result()
+ done += 1
+ if done % 32 == 0 or done == n_experts:
+ logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts)
+ submit_next()
+
+ add_merged_tensor(bid, wid, qtype, experts)
+
+ return consumed
+
+    def _prepare_deepseek4_scaled_tensors(self) -> None:
+        """Resolve every ``.scale`` tensor before the generic tensor pass runs.
+
+        Steps, in order:
+          1. mark ``mtp.*`` tensors and tensors of layers excluded by the
+             layer cap as consumed;
+          2. write the tid2eid tables and the routed expert tensors directly;
+          3. for each remaining ``<x>.scale``, replace the loader of
+             ``<x>.weight`` with one that dequantizes the weight with that
+             scale on demand;
+          4. remove every consumed name from ``model_tensors``.
+
+        The block size comes from ``quantization_config.weight_block_size``
+        and defaults to ``[128, 128]`` when the config, the key, or the value
+        is missing or null.
+
+        Raises:
+            ValueError: if a scale tensor has no matching weight tensor.
+        """
+        quant_config = self.hparams.get("quantization_config") or {}
+        block_size = quant_config.get("weight_block_size")
+        if block_size is None:
+            # An explicit null is treated like an absent key instead of failing
+            # later with a TypeError when the block size is measured.
+            block_size = [128, 128]
+
+        consumed: set[str] = set()
+
+        for name in list(self.model_tensors.keys()):
+            stripped = self._strip_model_prefix(name)
+            if stripped.startswith("mtp.") or self._skip_layer_tensor(stripped):
+                consumed.add(name)
+
+        consumed.update(self._write_deepseek4_tid2eid_tensors())
+        consumed.update(self._write_deepseek4_expert_tensors())
+
+        expert_scale_pattern = re.compile(r"layers\.\d+\.ffn\.experts\.\d+\.w[123]\.scale$")
+        for name in list(self.model_tensors.keys()):
+            if name in consumed:
+                continue
+            stripped = self._strip_model_prefix(name)
+            if not stripped.endswith(".scale"):
+                continue
+            # Routed expert scales are not FP8 block scales; they are left for
+            # the expert writer.
+            if expert_scale_pattern.match(stripped) is not None:
+                continue
+
+            weight_name = f"{stripped.removesuffix('.scale')}.weight"
+            # The weight may be stored with or without the "model." prefix.
+            model_weight_name = weight_name if weight_name in self.model_tensors else f"model.{weight_name}"
+            if model_weight_name not in self.model_tensors:
+                raise ValueError(f"Missing DeepSeek V4 FP8 weight tensor for scale {stripped}")
+
+            load_weight = self.model_tensors[model_weight_name]
+            load_scale = self.model_tensors[name]
+            # Default arguments bind the current loaders; a plain closure would
+            # see the values of the last loop iteration.
+            self.model_tensors[model_weight_name] = (
+                lambda w=load_weight, s=load_scale, bs=block_size: self._dequant_fp8_weight(
+                    LazyTorchTensor.to_eager(w()),
+                    LazyTorchTensor.to_eager(s()),
+                    bs,
+                )
+            )
+            consumed.add(name)
+
+        for name in consumed:
+            self.model_tensors.pop(name, None)
+
+    def prepare_tensors(self):
+        """Resolve scaled tensors, then run the generic tensor pass.
+
+        Raises:
+            NotImplementedError: if any ``.scale`` tensor is still present
+                after the DeepSeek V4 preparation step.
+            ValueError: if routed experts staged by ``modify_tensors`` were
+                never merged.
+        """
+        self._prepare_deepseek4_scaled_tensors()
+
+        for tensor_name in self.model_tensors:
+            if tensor_name.endswith(".scale"):
+                raise NotImplementedError("Unhandled DeepSeek V4 scale tensors remain after conversion preparation")
+
+        super().prepare_tensors()
+
+        if self._experts is None:
+            return
+        experts = [key for layer_experts in self._experts for key in layer_experts.keys()]
+        if experts:
+            raise ValueError(f"Unprocessed experts: {experts}")
+
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+        """Force F16 for multi-dimensional tensors when the file type is low-bit.
+
+        Returns:
+            ``GGMLQuantizationType.F16`` when the output file type is one of
+            the low-bit types and the tensor has more than one dimension;
+            otherwise False.
+        """
+        del name, new_name, bid
+
+        # DeepSeek V4 routed experts are handled in _write_deepseek4_expert_tensors(),
+        # where each expert is converted directly from FP4 to the requested compact
+        # GGUF type. Keep the rest of the model in float form so attention,
+        # hyper-connections, indexers, compressors, shared experts and logits do not
+        # inherit the global low-bit file type.
+        if self._is_low_bit_ftype(self.ftype) and n_dims > 1:
+            return gguf.GGMLQuantizationType.F16
+        return False
+
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+        """Map a checkpoint tensor name to its GGUF name.
+
+        The DeepSeek V4 specific table is consulted first; names it does not
+        know are passed to the parent implementation.
+        """
+        custom = self._map_tensor_name_deepseek4(name)
+        return custom if custom is not None else super().map_tensor_name(name, try_suffixes)
+
+    def _map_tensor_name_deepseek4(self, name: str) -> str | None:
+        """Translate a DeepSeek V4 checkpoint tensor name to its GGUF name.
+
+        A leading ``model.`` is ignored. Top-level names are looked up
+        directly; ``layers.<i>.<rest>`` names are looked up by ``<rest>`` and
+        formatted with block index ``i``.
+
+        Returns:
+            The GGUF tensor name, or None when the name is in neither table.
+        """
+        tensor = gguf.MODEL_TENSOR
+        name = name.removeprefix("model.")
+
+        top_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+            "embed.weight": (tensor.TOKEN_EMBD, ".weight"),
+            "norm.weight": (tensor.OUTPUT_NORM, ".weight"),
+            "head.weight": (tensor.OUTPUT, ".weight"),
+            "hc_head_base": (tensor.OUTPUT_HC_BASE, ".weight"),
+            "hc_head_fn": (tensor.OUTPUT_HC_FN, ".weight"),
+            "hc_head_scale": (tensor.OUTPUT_HC_SCALE, ".weight"),
+        }
+        top_entry = top_level.get(name)
+        if top_entry is not None:
+            return self.format_tensor_name(top_entry[0], suffix=top_entry[1])
+
+        layer_match = re.match(r"layers\.(\d+)\.(.+)", name)
+        if layer_match is None:
+            return None
+        block_index = int(layer_match.group(1))
+        layer_key = layer_match.group(2)
+
+        layer_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+            # hyper-connections
+            "hc_attn_base": (tensor.HC_ATTN_BASE, ".weight"),
+            "hc_attn_fn": (tensor.HC_ATTN_FN, ".weight"),
+            "hc_attn_scale": (tensor.HC_ATTN_SCALE, ".weight"),
+            "hc_ffn_base": (tensor.HC_FFN_BASE, ".weight"),
+            "hc_ffn_fn": (tensor.HC_FFN_FN, ".weight"),
+            "hc_ffn_scale": (tensor.HC_FFN_SCALE, ".weight"),
+            # attention
+            "attn.attn_sink": (tensor.ATTN_SINKS, ".weight"),
+            "attn.wq_a.weight": (tensor.ATTN_Q_A, ".weight"),
+            "attn.wq_b.weight": (tensor.ATTN_Q_B, ".weight"),
+            "attn.q_norm.weight": (tensor.ATTN_Q_A_NORM, ".weight"),
+            "attn.wkv.weight": (tensor.ATTN_KV, ".weight"),
+            "attn.kv_norm.weight": (tensor.ATTN_KV_A_NORM, ".weight"),
+            "attn.wo_a.weight": (tensor.ATTN_OUT_A, ".weight"),
+            "attn.wo_b.weight": (tensor.ATTN_OUT_B, ".weight"),
+            # attention compressor
+            "attn.compressor.ape": (tensor.ATTN_COMPRESSOR_APE, ".weight"),
+            "attn.compressor.wkv.weight": (tensor.ATTN_COMPRESSOR_KV, ".weight"),
+            "attn.compressor.wgate.weight": (tensor.ATTN_COMPRESSOR_GATE, ".weight"),
+            "attn.compressor.norm.weight": (tensor.ATTN_COMPRESSOR_NORM, ".weight"),
+            # indexer and its compressor
+            "attn.indexer.wq_b.weight": (tensor.INDEXER_ATTN_Q_B, ".weight"),
+            "attn.indexer.weights_proj.weight": (tensor.INDEXER_PROJ, ".weight"),
+            "attn.indexer.compressor.ape": (tensor.INDEXER_COMPRESSOR_APE, ".weight"),
+            "attn.indexer.compressor.wkv.weight": (tensor.INDEXER_COMPRESSOR_KV, ".weight"),
+            "attn.indexer.compressor.wgate.weight": (tensor.INDEXER_COMPRESSOR_GATE, ".weight"),
+            "attn.indexer.compressor.norm.weight": (tensor.INDEXER_COMPRESSOR_NORM, ".weight"),
+            # norms
+            "attn_norm.weight": (tensor.ATTN_NORM, ".weight"),
+            "ffn_norm.weight": (tensor.FFN_NORM, ".weight"),
+            # shared experts
+            "ffn.shared_experts.w1.weight": (tensor.FFN_GATE_SHEXP, ".weight"),
+            "ffn.shared_experts.w3.weight": (tensor.FFN_UP_SHEXP, ".weight"),
+            "ffn.shared_experts.w2.weight": (tensor.FFN_DOWN_SHEXP, ".weight"),
+            # router
+            "ffn.gate.weight": (tensor.FFN_GATE_INP, ".weight"),
+            "ffn.gate.bias": (tensor.FFN_EXP_PROBS_B, ".bias"),
+            "ffn.gate.tid2eid": (tensor.FFN_GATE_TID2EID, ".weight"),
+            # merged routed experts
+            "ffn.experts.w1.weight": (tensor.FFN_GATE_EXP, ".weight"),
+            "ffn.experts.w3.weight": (tensor.FFN_UP_EXP, ".weight"),
+            "ffn.experts.w2.weight": (tensor.FFN_DOWN_EXP, ".weight"),
+        }
+        layer_entry = layer_level.get(layer_key)
+        if layer_entry is None:
+            return None
+        return self.format_tensor_name(layer_entry[0], block_index, suffix=layer_entry[1])
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # Generator yielding (gguf_name, tensor) pairs for one checkpoint tensor.
+ # mtp.* tensors yield nothing; routed expert weights are staged per layer
+ # and yielded as merged tensors once a layer is complete; every other
+ # tensor is yielded under its mapped name.
+ if name.startswith("model."):
+ name = name.removeprefix("model.")
+
+ # TODO: llama.cpp does not have Multi-Token Prediction for DeepSeek yet.
+ if name.startswith("mtp."):
+ return
+
+ # process the experts separately
+ match = re.match(r"layers\.(\d+)\.ffn\.experts\.(\d+)\.(w[123])\.weight", name)
+ if match is not None:
+ # The layer index parsed from the name replaces the bid argument.
+ bid = int(match.group(1))
+ xid = int(match.group(2))
+ wid = match.group(3)
+ n_experts = self.hparams["n_routed_experts"]
+
+ # Lazily create one staging dict per layer.
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ # A layer is complete once w1, w2 and w3 of every expert have been staged.
+ if len(self._experts[bid]) >= n_experts * 3:
+ for w_name in ["w1", "w3", "w2"]:
+ datas: list[Tensor] = []
+
+ for expert_id in range(n_experts):
+ ename = f"layers.{bid}.ffn.experts.{expert_id}.{w_name}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ # Stack the experts along a new leading axis.
+ data_torch = torch.stack(datas, dim=0)
+ merged_name = f"layers.{bid}.ffn.experts.{w_name}.weight"
+ yield self.map_tensor_name(merged_name), data_torch
+ # NOTE(review): indentation is not recoverable in this view; presumably this
+ # return follows the w_name loop, so all three merged tensors are yielded -- confirm.
+ return
+
+ # Layer not complete yet: nothing to yield for this expert tensor.
+ del xid, wid
+ return
+
+ yield self.map_tensor_name(name), data_torch
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 41566d41aef3..f71943ed33aa 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -567,6 +567,11 @@ extern "C" {
GGML_OP_RWKV_WKV7,
GGML_OP_SOLVE_TRI,
GGML_OP_GATED_DELTA_NET,
+ GGML_OP_DSV4_HC_SPLIT_SINKHORN,
+ GGML_OP_DSV4_HC_WEIGHTED_SUM,
+ GGML_OP_DSV4_HC_EXPAND,
+ GGML_OP_DSV4_FP8_KV_QUANTIZE,
+ GGML_OP_DSV4_ROPE_TAIL,
GGML_OP_UNARY,
@@ -2555,6 +2560,61 @@ extern "C" {
struct ggml_tensor * beta,
struct ggml_tensor * state);
+ // DeepSeek V4 hyperconnection helper.
+ // Splits [mix, tokens] into pre/post/comb regions and applies the
+ // Sinkhorn normalization used by the reference implementation.
+ GGML_API struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * mixes,
+ struct ggml_tensor * scale,
+ struct ggml_tensor * base,
+ int n_hc,
+ int sinkhorn_iters,
+ float eps);
+
+ // DeepSeek V4 hyperconnection weighted-sum helper.
+ // Computes sum_hc weights[hc, token] * x[embd, hc, token].
+ GGML_API struct ggml_tensor * ggml_dsv4_hc_weighted_sum(
+ struct ggml_context * ctx,
+ struct ggml_tensor * x,
+ struct ggml_tensor * weights);
+
+ // DeepSeek V4 hyperconnection expand helper.
+ // Computes post * block_out + comb^T @ residual for each token.
+ GGML_API struct ggml_tensor * ggml_dsv4_hc_expand(
+ struct ggml_context * ctx,
+ struct ggml_tensor * block_out,
+ struct ggml_tensor * residual,
+ struct ggml_tensor * post,
+ struct ggml_tensor * comb);
+
+ // DeepSeek V4 FP8 KV-cache simulation helper.
+ // Quantizes/dequantizes the non-RoPE prefix in E4M3FN blocks and leaves
+ // the RoPE tail unchanged, matching the reference inference path.
+ GGML_API struct ggml_tensor * ggml_dsv4_fp8_kv_quantize(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_rot);
+
+ // DeepSeek V4 partial RoPE helper.
+ // Leaves the non-RoPE prefix unchanged and applies RoPE to the tail,
+ // matching ggml_concat(prefix, ggml_rope_ext(tail)).
+ GGML_API struct ggml_tensor * ggml_dsv4_rope_tail(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pos,
+ struct ggml_tensor * freq_factors,
+ int n_dims,
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow,
+ bool inverse);
+
// custom operators
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index cd5c61a81879..70f8def3a742 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2047,6 +2047,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_gated_delta_net(params, tensor);
} break;
+ case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+ {
+ ggml_compute_forward_dsv4_hc_split_sinkhorn(params, tensor);
+ } break;
+ case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+ {
+ ggml_compute_forward_dsv4_hc_weighted_sum(params, tensor);
+ } break;
+ case GGML_OP_DSV4_HC_EXPAND:
+ {
+ ggml_compute_forward_dsv4_hc_expand(params, tensor);
+ } break;
+ case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+ {
+ ggml_compute_forward_dsv4_fp8_kv_quantize(params, tensor);
+ } break;
+ case GGML_OP_DSV4_ROPE_TAIL:
+ {
+ ggml_compute_forward_dsv4_rope_tail(params, tensor);
+ } break;
case GGML_OP_MAP_CUSTOM1:
{
ggml_compute_forward_map_custom1(params, tensor);
@@ -2227,6 +2247,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_COUNT_EQUAL:
case GGML_OP_SOLVE_TRI:
case GGML_OP_GATED_DELTA_NET:
+ case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+ case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+ case GGML_OP_DSV4_HC_EXPAND:
+ case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+ case GGML_OP_DSV4_ROPE_TAIL:
{
n_tasks = n_threads;
} break;
@@ -2847,6 +2872,7 @@ struct ggml_cplan ggml_graph_plan(
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
+ case GGML_OP_DSV4_ROPE_TAIL:
{
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
} break;
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7485ba4fc861..f473cb724725 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5970,6 +5970,127 @@ void ggml_compute_forward_rope_back(
}
}
+// ggml_compute_forward_dsv4_rope_tail: copies the leading ne0 - n_dims elements of every row unchanged and applies RoPE to the trailing n_dims elements
+
+template<typename T> // T: element type shared by src0 and dst; instantiated below with ggml_fp16_t and float
+static void ggml_compute_forward_dsv4_rope_tail_flt(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0]; // input rows
+ const ggml_tensor * src1 = dst->src[1]; // I32 positions, indexed by i2
+ const ggml_tensor * src2 = dst->src[2]; // optional F32 frequency factors, may be NULL
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+
+ const int n_dims = ((int32_t *) dst->op_params)[0]; // op_params[0..3] are integers written by ggml_dsv4_rope_tail
+ const int mode = ((int32_t *) dst->op_params)[1];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[2];
+ const bool inverse = ((int32_t *) dst->op_params)[3] != 0;
+
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; // op_params[4..9] hold these floats bit-for-bit
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float));
+
+ GGML_TENSOR_UNARY_OP_LOCALS // provides the ne*/nb* (dst) and ne0*/nb0* (src0) locals used below
+
+ GGML_ASSERT(nb0 == nb00);
+ GGML_ASSERT(nb0 == sizeof(T)); // rows must be densely packed along dim 0
+ GGML_ASSERT(n_dims <= ne0);
+ GGML_ASSERT(n_dims % 2 == 0);
+ GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX);
+
+ const int64_t n_nope = ne0 - n_dims; // length of the prefix that is copied without rotation
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(dst);
+ const int dr = (nr + nth - 1)/nth; // rows per thread, rounded up
+ const int ir0 = dr*ith; // this thread handles rows [ir0, ir1)
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+ float corr_dims[2];
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+ const float * freq_factors = NULL;
+ if (src2 != NULL) {
+ GGML_ASSERT(src2->type == GGML_TYPE_F32);
+ GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+ freq_factors = (const float *) src2->data;
+ }
+
+ const float sin_sign = inverse ? -1.0f : 1.0f; // inverse only flips the sign passed to the cache init
+ const int32_t * pos = (const int32_t *) src1->data;
+
+ int ir = 0; // running row counter over the (i3, i2, i1) loops
+ int64_t last_i2 = -1; // i2 for which this thread's cache was last initialized
+
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ for (int64_t i2 = 0; i2 < ne2; i2++) {
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
+ if (ir++ < ir0) continue; // row belongs to an earlier thread
+ if (ir > ir1) break; // past this thread's chunk
+
+ float * cache = (float *) params->wdata + (n_dims + CACHE_LINE_SIZE_F32)*ith; // per-thread slice of the work buffer
+ if (last_i2 != i2) { // rebuild the cache only when i2 (and thus pos[i2]) changes
+ const int64_t p = pos[i2];
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, n_dims, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ last_i2 = i2;
+ }
+
+ const T * src = (const T *)((const char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ T * dst_data = (T *)(( char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+
+ for (int64_t i0 = 0; i0 < n_nope; ++i0) { // pass the non-RoPE prefix through unchanged
+ dst_data[i0] = src[i0];
+ }
+
+ const T * src_tail = src + n_nope; // the last n_dims elements of the row
+ T * dst_tail = dst_data + n_nope;
+
+ switch (mode) {
+ case GGML_ROPE_TYPE_NORMAL:
+ rotate_pairs(n_dims, 1, cache, src_tail, dst_tail, 1); // offset 1 and last argument 1, as passed to rotate_pairs (defined outside this hunk)
+ break;
+ case GGML_ROPE_TYPE_NEOX:
+ rotate_pairs(n_dims, n_dims/2, cache, src_tail, dst_tail); // offset n_dims/2, last argument left at its default
+ break;
+ default:
+ GGML_ABORT("rope type not supported");
+ }
+ }
+ }
+ }
+}
+
+void ggml_compute_forward_dsv4_rope_tail( // dispatches on src0->type; only F16 and F32 are supported
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_dsv4_rope_tail_flt<ggml_fp16_t>(params, dst);
+ } break;
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_dsv4_rope_tail_flt<float>(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+}
+
// ggml_compute_forward_conv_transpose_1d
static void ggml_compute_forward_conv_transpose_1d_f16_f32(
@@ -10903,6 +11024,343 @@ void ggml_compute_forward_rwkv_wkv7(
}
}
+// ggml_compute_forward_dsv4_hc_split_sinkhorn: for each row of mixes, writes n_hc "pre" weights, n_hc "post" weights and an n_hc x n_hc Sinkhorn-normalized comb matrix
+
+void ggml_compute_forward_dsv4_hc_split_sinkhorn(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * mixes = dst->src[0];
+ const ggml_tensor * scale = dst->src[1];
+ const ggml_tensor * base = dst->src[2];
+
+ GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+ GGML_ASSERT(scale->type == GGML_TYPE_F32);
+ GGML_ASSERT(base->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(mixes->nb[0] == sizeof(float)); // rows are indexed as plain float arrays below
+ GGML_ASSERT(scale->nb[0] == sizeof(float));
+ GGML_ASSERT(base->nb[0] == sizeof(float));
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+ const int n_hc = ggml_get_op_params_i32(dst, 0); // op_params 0..2 are written by ggml_dsv4_hc_split_sinkhorn
+ const int sinkhorn_iters = ggml_get_op_params_i32(dst, 1);
+ const float eps = ggml_get_op_params_f32(dst, 2);
+ const int64_t mix_hc = mixes->ne[0]; // row length: n_hc pre + n_hc post + n_hc*n_hc comb
+ const int64_t n_rows = ggml_nrows(mixes);
+
+ GGML_ASSERT(n_hc > 0 && n_hc <= 16); // upper bound matches the fixed c[16*16] scratch below
+ GGML_ASSERT(sinkhorn_iters > 0);
+ GGML_ASSERT(mix_hc == (2 + n_hc) * n_hc);
+ GGML_ASSERT(ggml_nrows(dst) == n_rows);
+
+ const float * scale_data = (const float *) scale->data;
+ const float * base_data = (const float *) base->data;
+
+ const float pre_scale = scale_data[0]; // one scalar per output section
+ const float post_scale = scale_data[1];
+ const float comb_scale = scale_data[2];
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int64_t dr = (n_rows + nth - 1) / nth; // rows per thread, rounded up
+ const int64_t r0 = dr * ith; // this thread handles rows [r0, r1)
+ const int64_t r1 = std::min(r0 + dr, n_rows);
+
+ for (int64_t r = r0; r < r1; ++r) {
+ const float * mix = (const float *) ((const char *) mixes->data + r*mixes->nb[1]);
+ float * out = (float *) ((char *) dst->data + r*dst->nb[1]);
+
+ for (int i = 0; i < n_hc; ++i) { // pre section: sigmoid(z) + eps
+ const float z = mix[i] * pre_scale + base_data[i];
+ out[i] = 1.0f / (1.0f + expf(-z)) + eps;
+ }
+
+ for (int i = 0; i < n_hc; ++i) { // post section: 2 * sigmoid(z), no eps added
+ const int off = n_hc + i;
+ const float z = mix[off] * post_scale + base_data[off];
+ out[off] = 2.0f / (1.0f + expf(-z));
+ }
+
+ float c[16*16]; // per-row scratch for the comb matrix; only the first n_hc*n_hc entries are used
+
+ // comb is laid out as a flat [n_hc*n_hc] block per token, written as
+ // c[src_hc + dst_hc*n_hc]; after the graph's reshape_3d this is ggml
+ // tensor comb[ne0=src_hc, ne1=dst_hc, ne2=token]. The Sinkhorn pass
+ // below softmaxes over src_hc, then alternates row/col normalization.
+ // ggml_dsv4_hc_expand intentionally reads ggml-dim0 as dst_hc, which
+ // transposes this matrix on read so it computes comb^T @ residual
+ // (the V4 hyperconnection contract). CPU/Metal/CUDA use the identical
+ // flat write + transposed read; do not "fix" one side in isolation.
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ float row_max = -INFINITY;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) { // scaled + biased logits, tracking the max over src_hc
+ const int idx = src_hc + dst_hc*n_hc;
+ const int off = 2*n_hc + idx;
+ const float v = mix[off] * comb_scale + base_data[off];
+ c[idx] = v;
+ row_max = std::max(row_max, v);
+ }
+
+ float row_sum = 0.0f;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) { // exponentiate after subtracting the max
+ const int idx = src_hc + dst_hc*n_hc;
+ const float v = expf(c[idx] - row_max);
+ c[idx] = v;
+ row_sum += v;
+ }
+
+ const float inv_sum = 1.0f / row_sum;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) { // softmax value plus eps
+ const int idx = src_hc + dst_hc*n_hc;
+ c[idx] = c[idx] * inv_sum + eps;
+ }
+ }
+
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) { // first normalization over dst_hc for each fixed src_hc
+ float sum = 0.0f;
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ sum += c[src_hc + dst_hc*n_hc];
+ }
+
+ const float inv_denom = 1.0f / (sum + eps);
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ c[src_hc + dst_hc*n_hc] *= inv_denom;
+ }
+ }
+
+ for (int iter = 1; iter < sinkhorn_iters; ++iter) { // starts at 1: the softmax + normalization above presumably count as the first iteration
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { // normalize over src_hc for each fixed dst_hc
+ float sum = 0.0f;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ sum += c[src_hc + dst_hc*n_hc];
+ }
+
+ const float inv_denom = 1.0f / (sum + eps);
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ c[src_hc + dst_hc*n_hc] *= inv_denom;
+ }
+ }
+
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) { // normalize over dst_hc for each fixed src_hc
+ float sum = 0.0f;
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ sum += c[src_hc + dst_hc*n_hc];
+ }
+
+ const float inv_denom = 1.0f / (sum + eps);
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ c[src_hc + dst_hc*n_hc] *= inv_denom;
+ }
+ }
+ }
+
+ for (int i = 0; i < n_hc*n_hc; ++i) { // comb section follows the pre and post sections in the output row
+ out[2*n_hc + i] = c[i];
+ }
+ }
+}
+
+// ggml_compute_forward_dsv4_hc_weighted_sum: dst[d, t] = sum over h of x[d, h, t] * weights[h, t] (F32 only)
+
+void ggml_compute_forward_dsv4_hc_weighted_sum(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * x = dst->src[0];
+ const ggml_tensor * weights = dst->src[1];
+
+ GGML_ASSERT(x->type == GGML_TYPE_F32);
+ GGML_ASSERT(weights->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(x->ne[0] == dst->ne[0]); // x is [n_embd, n_hc, n_tokens]
+ GGML_ASSERT(x->ne[1] == weights->ne[0]); // weights is [n_hc, n_tokens]
+ GGML_ASSERT(x->ne[2] == dst->ne[1]); // dst is [n_embd, n_tokens]
+ GGML_ASSERT(weights->ne[1] == dst->ne[1]);
+ GGML_ASSERT(x->ne[3] == 1);
+ GGML_ASSERT(weights->ne[2] == 1);
+ GGML_ASSERT(weights->ne[3] == 1);
+ GGML_ASSERT(dst->ne[2] == 1);
+ GGML_ASSERT(dst->ne[3] == 1);
+
+ const int64_t n_embd = dst->ne[0];
+ const int64_t n_hc = x->ne[1];
+ const int64_t n_tokens = dst->ne[1];
+ const int64_t n_elem = n_embd * n_tokens; // total number of output elements
+
+ const int64_t i0 = (n_elem * params->ith) / params->nth; // this thread handles flattened outputs [i0, i1)
+ const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth;
+
+ const char * x_data = (const char *) x->data; // byte pointers: all addressing below uses nb[] strides
+ const char * w_data = (const char *) weights->data;
+ char * y_data = ( char *) dst->data;
+
+ for (int64_t i = i0; i < i1; ++i) {
+ const int64_t d = i % n_embd; // embedding index
+ const int64_t t = i / n_embd; // token index
+
+ float acc = 0.0f;
+ for (int64_t h = 0; h < n_hc; ++h) {
+ const float xv = *(const float *) (x_data + d*x->nb[0] + h*x->nb[1] + t*x->nb[2]);
+ const float wv = *(const float *) (w_data + h*weights->nb[0] + t*weights->nb[1]);
+ acc += xv * wv;
+ }
+
+ *(float *) (y_data + d*dst->nb[0] + t*dst->nb[1]) = acc;
+ }
+}
+
+// ggml_compute_forward_dsv4_hc_expand: dst[d, dst_hc, t] = block_out[d, t] * post[dst_hc, t] + sum over src_hc of comb(dim0=dst_hc, dim1=src_hc, t) * residual[d, src_hc, t]
+
+void ggml_compute_forward_dsv4_hc_expand(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * block_out = dst->src[0];
+ const ggml_tensor * residual = dst->src[1];
+ const ggml_tensor * post = dst->src[2];
+ const ggml_tensor * comb = dst->src[3];
+
+ GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+ GGML_ASSERT(residual->type == GGML_TYPE_F32);
+ GGML_ASSERT(post->type == GGML_TYPE_F32);
+ GGML_ASSERT(comb->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(block_out->ne[0] == dst->ne[0]); // block_out is [n_embd, n_tokens]
+ GGML_ASSERT(block_out->ne[1] == dst->ne[2]);
+ GGML_ASSERT(residual->ne[0] == dst->ne[0]); // residual has the same [n_embd, n_hc, n_tokens] shape as dst
+ GGML_ASSERT(residual->ne[1] == dst->ne[1]);
+ GGML_ASSERT(residual->ne[2] == dst->ne[2]);
+ GGML_ASSERT(post->ne[0] == dst->ne[1]); // post is [n_hc, n_tokens]
+ GGML_ASSERT(post->ne[1] == dst->ne[2]);
+ GGML_ASSERT(comb->ne[0] == dst->ne[1]); // comb is [n_hc, n_hc, n_tokens]
+ GGML_ASSERT(comb->ne[1] == dst->ne[1]);
+ GGML_ASSERT(comb->ne[2] == dst->ne[2]);
+ GGML_ASSERT(block_out->ne[3] == 1);
+ GGML_ASSERT(residual->ne[3] == 1);
+ GGML_ASSERT(post->ne[2] == 1);
+ GGML_ASSERT(post->ne[3] == 1);
+ GGML_ASSERT(comb->ne[3] == 1);
+ GGML_ASSERT(dst->ne[3] == 1);
+
+ const int64_t n_embd = dst->ne[0];
+ const int64_t n_hc = dst->ne[1];
+ const int64_t n_tokens = dst->ne[2];
+ const int64_t n_elem = n_embd * n_hc * n_tokens; // total number of output elements
+
+ const int64_t i0 = (n_elem * params->ith) / params->nth; // this thread handles flattened outputs [i0, i1)
+ const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth;
+
+ const char * block_data = (const char *) block_out->data; // byte pointers: all addressing below uses nb[] strides
+ const char * res_data = (const char *) residual->data;
+ const char * post_data = (const char *) post->data;
+ const char * comb_data = (const char *) comb->data;
+ char * dst_data = ( char *) dst->data;
+
+ for (int64_t i = i0; i < i1; ++i) {
+ const int64_t d = i % n_embd; // embedding index
+ const int64_t tmp = i / n_embd;
+ const int64_t dst_hc = tmp % n_hc; // output hyperconnection index
+ const int64_t t = tmp / n_hc; // token index
+
+ const float block_v = *(const float *) (block_data + d*block_out->nb[0] + t*block_out->nb[1]);
+ const float post_v = *(const float *) (post_data + dst_hc*post->nb[0] + t*post->nb[1]);
+
+ float acc = block_v * post_v;
+ // comb arrives as comb[ne0=src_hc, ne1=dst_hc, ne2=t] from
+ // dsv4_hc_split_sinkhorn (flat write src_hc + dst_hc*n_hc). Reading
+ // ne0 as dst_hc and ne1 as src_hc here transposes it, giving
+ // (comb^T @ residual)[d, dst_hc] = sum_src_hc comb[src_hc,dst_hc,t]
+ // * residual[d, src_hc, t]. This transpose is deliberate and matches
+ // the Metal/CUDA expand kernels (validated 19/19 vs this CPU oracle).
+ for (int64_t src_hc = 0; src_hc < n_hc; ++src_hc) {
+ const float comb_v = *(const float *) (comb_data + dst_hc*comb->nb[0] + src_hc*comb->nb[1] + t*comb->nb[2]);
+ const float res_v = *(const float *) (res_data + d*residual->nb[0] + src_hc*residual->nb[1] + t*residual->nb[2]);
+ acc += comb_v * res_v;
+ }
+
+ *(float *) (dst_data + d*dst->nb[0] + dst_hc*dst->nb[1] + t*dst->nb[2]) = acc;
+ }
+}
+
+static float ggml_dsv4_e4m3fn_dequant(float x) { // rounds x to the nearest value decodable from a 7-bit magnitude code (4 exponent bits, 3 mantissa bits) and returns it as a float
+ const float sign = x < 0.0f ? -1.0f : 1.0f;
+ const float ax = std::min(std::fabs(x), 448.0f); // magnitude, saturated at 448 (the value decoded from code 126)
+
+ int best = 0; // code 0 decodes to 0.0f, so its distance from ax is ax itself
+ float best_diff = ax;
+
+ for (int i = 1; i < 127; ++i) { // exhaustive search over codes 1..126; code 127 is never considered
+ const int exp = (i >> 3) & 0x0f;
+ const int mant = i & 0x07;
+ const float val = exp == 0
+ ? std::ldexp(float(mant), -9) // exp == 0: mant * 2^-9
+ : std::ldexp(1.0f + float(mant) / 8.0f, exp - 7); // otherwise: (1 + mant/8) * 2^(exp - 7)
+ const float diff = std::fabs(ax - val);
+ if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) { // on an exact tie prefer the even code
+ best = i;
+ best_diff = diff;
+ }
+ }
+
+ const int exp = (best >> 3) & 0x0f; // decode the winning code again
+ const int mant = best & 0x07;
+ const float val = exp == 0
+ ? std::ldexp(float(mant), -9)
+ : std::ldexp(1.0f + float(mant) / 8.0f, exp - 7);
+
+ return sign * val;
+}
+
+void ggml_compute_forward_dsv4_fp8_kv_quantize( // per row: quantize/dequantize the first head_dim - n_rot elements in blocks of 64, copy the last n_rot unchanged
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+ const int64_t n_rot = ggml_get_op_params_i32(dst, 0); // written by ggml_dsv4_fp8_kv_quantize
+ const int64_t head_dim = src0->ne[0];
+ const int64_t n_nope = head_dim - n_rot; // length of the quantized prefix
+
+ GGML_ASSERT(n_rot >= 0);
+ GGML_ASSERT(n_nope > 0);
+ GGML_ASSERT(n_nope % 64 == 0); // the prefix is processed in whole 64-element blocks
+
+ const int64_t n_rows = src0->ne[1] * src0->ne[2] * src0->ne[3];
+ const int64_t row_start = (n_rows * params->ith) / params->nth; // this thread handles rows [row_start, row_end)
+ const int64_t row_end = (n_rows * (params->ith + 1)) / params->nth;
+
+ for (int64_t row = row_start; row < row_end; ++row) {
+ const int64_t i1 = row % src0->ne[1]; // unflatten the row index into (i1, i2, i3)
+ const int64_t i2 = (row / src0->ne[1]) % src0->ne[2];
+ const int64_t i3 = row / (src0->ne[1] * src0->ne[2]);
+
+ const char * src_base = (const char *) src0->data + i1*src0->nb[1] + i2*src0->nb[2] + i3*src0->nb[3];
+ char * dst_base = ( char *) dst->data + i1*dst->nb[1] + i2*dst->nb[2] + i3*dst->nb[3];
+
+ for (int64_t off = 0; off < n_nope; off += 64) {
+ float amax = 0.0f;
+ for (int64_t i = 0; i < 64; ++i) { // largest magnitude in this block
+ const float v = *(const float *) (src_base + (off + i)*src0->nb[0]);
+ amax = std::max(amax, std::fabs(v));
+ }
+
+ amax = std::max(amax, 1.0e-4f); // floor keeps log2 finite for an all-zero block
+ const float scale = std::ldexp(1.0f, int(std::ceil(std::log2(amax / 448.0f)))); // smallest power of two >= amax / 448
+ for (int64_t i = 0; i < 64; ++i) {
+ const float v = *(const float *) (src_base + (off + i)*src0->nb[0]);
+ *(float *) (dst_base + (off + i)*dst->nb[0]) =
+ ggml_dsv4_e4m3fn_dequant(std::clamp(v / scale, -448.0f, 448.0f)) * scale; // round in the scaled domain, then scale back
+ }
+ }
+
+ for (int64_t i = n_nope; i < head_dim; ++i) { // tail of n_rot elements is copied through
+ *(float *) (dst_base + i*dst->nb[0]) = *(const float *) (src_base + i*src0->nb[0]);
+ }
+ }
+}
+
// ggml_compute_forward_map_custom1
void ggml_compute_forward_map_custom1(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 7398e5618948..4da4db62aa4e 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -104,6 +104,11 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s
void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_split_sinkhorn(const struct ggml_compute_params * params, struct ggml_tensor * dst); // CPU forward for GGML_OP_DSV4_HC_SPLIT_SINKHORN
+void ggml_compute_forward_dsv4_hc_weighted_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst); // CPU forward for GGML_OP_DSV4_HC_WEIGHTED_SUM
+void ggml_compute_forward_dsv4_hc_expand(const struct ggml_compute_params * params, struct ggml_tensor * dst); // CPU forward for GGML_OP_DSV4_HC_EXPAND
+void ggml_compute_forward_dsv4_fp8_kv_quantize(const struct ggml_compute_params * params, struct ggml_tensor * dst); // CPU forward for GGML_OP_DSV4_FP8_KV_QUANTIZE
+void ggml_compute_forward_dsv4_rope_tail(const struct ggml_compute_params * params, struct ggml_tensor * dst); // CPU forward for GGML_OP_DSV4_ROPE_TAIL
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 476c30797956..8b06c0bd5a49 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1063,6 +1063,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"RWKV_WKV7",
"SOLVE_TRI",
"GATED_DELTA_NET",
+ "DSV4_HC_SPLIT_SINKHORN",
+ "DSV4_HC_WEIGHTED_SUM",
+ "DSV4_HC_EXPAND",
+ "DSV4_FP8_KV_QUANTIZE",
+ "DSV4_ROPE_TAIL",
"UNARY",
@@ -1080,7 +1085,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU",
};
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1173,6 +1178,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"rwkv_wkv7(r, w, k, v, a, b, s)",
"A X = B, A triangular, solve X",
"gated_delta_net(q, k, v, g, beta, s)",
+ "dsv4_hc_split_sinkhorn(x)",
+ "dsv4_hc_weighted_sum(x)",
+ "dsv4_hc_expand(x)",
+ "dsv4_fp8_kv_quantize(x)",
+ "dsv4_rope_tail(x)",
"unary(x)",
@@ -1190,7 +1200,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"glu(x)",
};
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -6230,6 +6240,180 @@ struct ggml_tensor * ggml_gated_delta_net(
return result;
}
+// ggml_dsv4_hc_split_sinkhorn: validates the inputs and builds a GGML_OP_DSV4_HC_SPLIT_SINKHORN node created with ggml_dup_tensor(mixes)
+
+struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * mixes,
+ struct ggml_tensor * scale,
+ struct ggml_tensor * base,
+ int n_hc,
+ int sinkhorn_iters,
+ float eps) {
+ GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+ GGML_ASSERT(scale->type == GGML_TYPE_F32);
+ GGML_ASSERT(base->type == GGML_TYPE_F32);
+
+ GGML_ASSERT(ggml_is_contiguous_rows(mixes));
+ GGML_ASSERT(ggml_is_contiguous(scale));
+ GGML_ASSERT(ggml_is_contiguous(base));
+
+ GGML_ASSERT(n_hc > 0);
+ GGML_ASSERT(n_hc <= 16); // CPU forward uses a fixed float c[16*16] scratch
+ GGML_ASSERT(sinkhorn_iters > 0);
+ GGML_ASSERT(mixes->ne[0] == (2 + n_hc) * n_hc); // n_hc pre + n_hc post + n_hc*n_hc comb values per row
+ GGML_ASSERT(mixes->ne[2] == 1);
+ GGML_ASSERT(mixes->ne[3] == 1);
+ GGML_ASSERT(ggml_nelements(scale) >= 3); // the CPU forward reads scale[0..2]
+ GGML_ASSERT(ggml_nelements(base) >= mixes->ne[0]); // the CPU forward reads one base value per mixes column
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, mixes);
+
+ ggml_set_op_params_i32(result, 0, n_hc); // slots 0..2 are read back in the same order by the CPU forward
+ ggml_set_op_params_i32(result, 1, sinkhorn_iters);
+ ggml_set_op_params_f32(result, 2, eps);
+
+ result->op = GGML_OP_DSV4_HC_SPLIT_SINKHORN;
+ result->src[0] = mixes;
+ result->src[1] = scale;
+ result->src[2] = base;
+
+ return result;
+}
+
+// ggml_dsv4_hc_weighted_sum: validates the inputs and builds a GGML_OP_DSV4_HC_WEIGHTED_SUM node of shape [x->ne[0], x->ne[2]]
+
+struct ggml_tensor * ggml_dsv4_hc_weighted_sum(
+ struct ggml_context * ctx,
+ struct ggml_tensor * x,
+ struct ggml_tensor * weights) {
+ GGML_ASSERT(x->type == GGML_TYPE_F32);
+ GGML_ASSERT(weights->type == GGML_TYPE_F32);
+
+ GGML_ASSERT(x->ne[1] == weights->ne[0]); // one weight per ne[1] slice of x ...
+ GGML_ASSERT(x->ne[2] == weights->ne[1]); // ... for every ne[2] index
+ GGML_ASSERT(x->ne[3] == 1);
+ GGML_ASSERT(weights->ne[2] == 1);
+ GGML_ASSERT(weights->ne[3] == 1);
+
+ struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, x->ne[0], x->ne[2]); // dim 1 of x is summed away
+
+ result->op = GGML_OP_DSV4_HC_WEIGHTED_SUM;
+ result->src[0] = x;
+ result->src[1] = weights;
+
+ return result;
+}
+
+// ggml_dsv4_hc_expand: validates the inputs and builds a GGML_OP_DSV4_HC_EXPAND node created with ggml_dup_tensor(residual)
+
+struct ggml_tensor * ggml_dsv4_hc_expand(
+ struct ggml_context * ctx,
+ struct ggml_tensor * block_out,
+ struct ggml_tensor * residual,
+ struct ggml_tensor * post,
+ struct ggml_tensor * comb) {
+ GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+ GGML_ASSERT(residual->type == GGML_TYPE_F32);
+ GGML_ASSERT(post->type == GGML_TYPE_F32);
+ GGML_ASSERT(comb->type == GGML_TYPE_F32);
+
+ GGML_ASSERT(block_out->ne[0] == residual->ne[0]); // block_out: [residual->ne[0], residual->ne[2]]
+ GGML_ASSERT(block_out->ne[1] == residual->ne[2]);
+ GGML_ASSERT(block_out->ne[2] == 1);
+ GGML_ASSERT(block_out->ne[3] == 1);
+ GGML_ASSERT(post->ne[0] == residual->ne[1]); // post: [residual->ne[1], residual->ne[2]]
+ GGML_ASSERT(post->ne[1] == residual->ne[2]);
+ GGML_ASSERT(post->ne[2] == 1);
+ GGML_ASSERT(post->ne[3] == 1);
+ GGML_ASSERT(comb->ne[0] == residual->ne[1]); // comb: square in its first two dims, one matrix per residual->ne[2] index
+ GGML_ASSERT(comb->ne[1] == residual->ne[1]);
+ GGML_ASSERT(comb->ne[2] == residual->ne[2]);
+ GGML_ASSERT(comb->ne[3] == 1); // NOTE(review): residual->ne[3] is not checked here, but the CPU forward asserts residual->ne[3] == 1
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, residual);
+
+ result->op = GGML_OP_DSV4_HC_EXPAND;
+ result->src[0] = block_out; // source order matches what the CPU forward reads from dst->src[0..3]
+ result->src[1] = residual;
+ result->src[2] = post;
+ result->src[3] = comb;
+
+ return result;
+}
+
+// ggml_dsv4_fp8_kv_quantize: validates the input and builds a GGML_OP_DSV4_FP8_KV_QUANTIZE node created with ggml_dup_tensor(a)
+
+struct ggml_tensor * ggml_dsv4_fp8_kv_quantize(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_rot) {
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
+ GGML_ASSERT(n_rot >= 0);
+ GGML_ASSERT(a->ne[0] > n_rot); // at least one element must remain in the quantized prefix
+ GGML_ASSERT((a->ne[0] - n_rot) % 64 == 0); // the CPU forward walks the prefix in 64-element blocks
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+ ggml_set_op_params_i32(result, 0, n_rot); // read back as op_params slot 0 by the CPU forward
+
+ result->op = GGML_OP_DSV4_FP8_KV_QUANTIZE;
+ result->src[0] = a;
+
+ return result;
+}
+
+// ggml_dsv4_rope_tail: validates the inputs and builds a GGML_OP_DSV4_ROPE_TAIL node created with ggml_dup_tensor(a)
+
+struct ggml_tensor * ggml_dsv4_rope_tail(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pos,
+ struct ggml_tensor * freq_factors,
+ int n_dims,
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow,
+ bool inverse) {
+ GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+ GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX); // only these two modes are handled by the CPU forward
+ GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
+ GGML_ASSERT(pos->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_is_vector(pos));
+ GGML_ASSERT(a->ne[2] == pos->ne[0]); // one position per index along dim 2 of a
+ GGML_ASSERT(n_dims > 0);
+ GGML_ASSERT(n_dims <= a->ne[0]); // the rotated tail cannot be longer than a row
+ GGML_ASSERT(n_dims % 2 == 0);
+
+ if (freq_factors) { // optional input; NULL is stored in src[2] when absent
+ GGML_ASSERT(freq_factors->type == GGML_TYPE_F32);
+ GGML_ASSERT(freq_factors->ne[0] >= n_dims / 2);
+ }
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+ int32_t params[16] = { n_dims, mode, n_ctx_orig, inverse ? 1 : 0 }; // slots 0..3 are integers; the remaining entries start zero-initialized
+ memcpy(params + 4, &freq_base, sizeof(float)); // slots 4..9 carry the float parameters bit-for-bit
+ memcpy(params + 5, &freq_scale, sizeof(float));
+ memcpy(params + 6, &ext_factor, sizeof(float));
+ memcpy(params + 7, &attn_factor, sizeof(float));
+ memcpy(params + 8, &beta_fast, sizeof(float));
+ memcpy(params + 9, &beta_slow, sizeof(float));
+ ggml_set_op_params(result, params, sizeof(params));
+
+ result->op = GGML_OP_DSV4_ROPE_TAIL;
+ result->src[0] = a;
+ result->src[1] = pos;
+ result->src[2] = freq_factors;
+
+ return result;
+}
+
////////////////////////////////////////////////////////////////////////////////
struct ggml_hash_set ggml_hash_set_new(size_t size) {
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c25f217f990e..8f44c7965e87 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -145,6 +145,10 @@ class LLM:
INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
FULL_ATTENTION_INTERVAL = "{arch}.full_attention_interval"
ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
+ HASH_LAYER_COUNT = "{arch}.hash_layer_count"
+ HYPER_CONNECTION_COUNT = "{arch}.hyper_connection.count"
+ HYPER_CONNECTION_SINKHORN_ITERS = "{arch}.hyper_connection.sinkhorn_iterations"
+ HYPER_CONNECTION_EPS = "{arch}.hyper_connection.epsilon"
ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
@@ -184,6 +188,10 @@ class Attention:
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
TEMPERATURE_SCALE = "{arch}.attention.temperature_scale"
+ COMPRESS_RATIOS = "{arch}.attention.compress_ratios"
+ COMPRESS_ROPE_FREQ_BASE = "{arch}.attention.compress_rope_freq_base"
+ OUTPUT_LORA_RANK = "{arch}.attention.output_lora_rank"
+ OUTPUT_GROUP_COUNT = "{arch}.attention.output_group_count"
class Indexer:
HEAD_COUNT = "{arch}.attention.indexer.head_count"
@@ -451,6 +459,7 @@ class MODEL_ARCH(IntEnum):
DEEPSEEK = auto()
DEEPSEEK2 = auto()
DEEPSEEK2OCR = auto()
+ DEEPSEEK4 = auto()
CHATGLM = auto()
GLM4 = auto()
GLM4_MOE = auto()
@@ -527,6 +536,9 @@ class MODEL_TENSOR(IntEnum):
TOKEN_TYPES = auto()
POS_EMBD = auto()
OUTPUT = auto()
+ OUTPUT_HC_BASE = auto() # deepseek4 hyper-connection output
+ OUTPUT_HC_FN = auto() # deepseek4 hyper-connection output
+ OUTPUT_HC_SCALE = auto() # deepseek4 hyper-connection output
DENSE_2_OUT = auto() # embeddinggemma 2_Dense
DENSE_3_OUT = auto() # embeddinggemma 3_Dense
OUTPUT_NORM = auto()
@@ -650,12 +662,19 @@ class MODEL_TENSOR(IntEnum):
CHANNEL_MIX_VALUE = auto()
ATTN_Q_A = auto()
ATTN_Q_B = auto()
+ ATTN_KV = auto() # deepseek4 single-tensor combined KV projection
ATTN_KV_A_MQA = auto()
ATTN_KV_B = auto()
ATTN_K_B = auto()
ATTN_V_B = auto()
+ ATTN_OUT_A = auto() # deepseek4 attention output LoRA
+ ATTN_OUT_B = auto() # deepseek4 attention output LoRA
ATTN_Q_A_NORM = auto()
ATTN_KV_A_NORM = auto()
+ ATTN_COMPRESSOR_APE = auto() # deepseek4 attention compressor
+ ATTN_COMPRESSOR_KV = auto() # deepseek4 attention compressor
+ ATTN_COMPRESSOR_GATE = auto() # deepseek4 attention compressor
+ ATTN_COMPRESSOR_NORM = auto() # deepseek4 attention compressor
FFN_SUB_NORM = auto()
ATTN_SUB_NORM = auto()
DEC_ATTN_NORM = auto()
@@ -717,6 +736,17 @@ class MODEL_TENSOR(IntEnum):
INDEXER_PROJ = auto()
INDEXER_ATTN_K = auto()
INDEXER_ATTN_Q_B = auto()
+ INDEXER_COMPRESSOR_APE = auto() # deepseek4 indexer compressor
+ INDEXER_COMPRESSOR_KV = auto() # deepseek4 indexer compressor
+ INDEXER_COMPRESSOR_GATE = auto() # deepseek4 indexer compressor
+ INDEXER_COMPRESSOR_NORM = auto() # deepseek4 indexer compressor
+ HC_ATTN_BASE = auto() # deepseek4 hyper-connection attention
+ HC_ATTN_FN = auto() # deepseek4 hyper-connection attention
+ HC_ATTN_SCALE = auto() # deepseek4 hyper-connection attention
+ HC_FFN_BASE = auto() # deepseek4 hyper-connection ffn
+ HC_FFN_FN = auto() # deepseek4 hyper-connection ffn
+ HC_FFN_SCALE = auto() # deepseek4 hyper-connection ffn
+ FFN_GATE_TID2EID = auto() # deepseek4 token-id-to-expert-id gating
# vision
V_MMPROJ = auto()
V_MMPROJ_FC = auto()
@@ -966,6 +996,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr",
+ MODEL_ARCH.DEEPSEEK4: "deepseek4",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.GLM4: "glm4",
MODEL_ARCH.GLM4_MOE: "glm4moe",
@@ -1042,6 +1073,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
+ MODEL_TENSOR.OUTPUT_HC_BASE: "output_hc_base",
+ MODEL_TENSOR.OUTPUT_HC_FN: "output_hc_fn",
+ MODEL_TENSOR.OUTPUT_HC_SCALE: "output_hc_scale",
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
@@ -1164,12 +1198,19 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
+ MODEL_TENSOR.ATTN_KV: "blk.{bid}.attn_kv",
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b",
MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b",
+ MODEL_TENSOR.ATTN_OUT_A: "blk.{bid}.attn_output_a",
+ MODEL_TENSOR.ATTN_OUT_B: "blk.{bid}.attn_output_b",
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
+ MODEL_TENSOR.ATTN_COMPRESSOR_APE: "blk.{bid}.attn_compressor_ape",
+ MODEL_TENSOR.ATTN_COMPRESSOR_KV: "blk.{bid}.attn_compressor_kv",
+ MODEL_TENSOR.ATTN_COMPRESSOR_GATE: "blk.{bid}.attn_compressor_gate",
+ MODEL_TENSOR.ATTN_COMPRESSOR_NORM: "blk.{bid}.attn_compressor_norm",
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
@@ -1231,6 +1272,17 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.INDEXER_PROJ: "blk.{bid}.indexer.proj",
MODEL_TENSOR.INDEXER_ATTN_K: "blk.{bid}.indexer.attn_k",
MODEL_TENSOR.INDEXER_ATTN_Q_B: "blk.{bid}.indexer.attn_q_b",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_APE: "blk.{bid}.indexer_compressor_ape",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_KV: "blk.{bid}.indexer_compressor_kv",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_GATE: "blk.{bid}.indexer_compressor_gate",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_NORM: "blk.{bid}.indexer_compressor_norm",
+ MODEL_TENSOR.HC_ATTN_BASE: "blk.{bid}.hc_attn_base",
+ MODEL_TENSOR.HC_ATTN_FN: "blk.{bid}.hc_attn_fn",
+ MODEL_TENSOR.HC_ATTN_SCALE: "blk.{bid}.hc_attn_scale",
+ MODEL_TENSOR.HC_FFN_BASE: "blk.{bid}.hc_ffn_base",
+ MODEL_TENSOR.HC_FFN_FN: "blk.{bid}.hc_ffn_fn",
+ MODEL_TENSOR.HC_FFN_SCALE: "blk.{bid}.hc_ffn_scale",
+ MODEL_TENSOR.FFN_GATE_TID2EID: "blk.{bid}.ffn_gate_tid2eid",
# vision
MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
@@ -2928,6 +2980,49 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
+ MODEL_ARCH.DEEPSEEK4: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.OUTPUT_HC_BASE,
+ MODEL_TENSOR.OUTPUT_HC_FN,
+ MODEL_TENSOR.OUTPUT_HC_SCALE,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_SINKS,
+ MODEL_TENSOR.ATTN_Q_A,
+ MODEL_TENSOR.ATTN_Q_B,
+ MODEL_TENSOR.ATTN_Q_A_NORM,
+ MODEL_TENSOR.ATTN_KV,
+ MODEL_TENSOR.ATTN_KV_A_NORM,
+ MODEL_TENSOR.ATTN_OUT_A,
+ MODEL_TENSOR.ATTN_OUT_B,
+ MODEL_TENSOR.ATTN_COMPRESSOR_APE,
+ MODEL_TENSOR.ATTN_COMPRESSOR_KV,
+ MODEL_TENSOR.ATTN_COMPRESSOR_GATE,
+ MODEL_TENSOR.ATTN_COMPRESSOR_NORM,
+ MODEL_TENSOR.INDEXER_PROJ,
+ MODEL_TENSOR.INDEXER_ATTN_Q_B,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_APE,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_KV,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_GATE,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_NORM,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
+ MODEL_TENSOR.FFN_GATE_TID2EID,
+ MODEL_TENSOR.HC_ATTN_BASE,
+ MODEL_TENSOR.HC_ATTN_FN,
+ MODEL_TENSOR.HC_ATTN_SCALE,
+ MODEL_TENSOR.HC_FFN_BASE,
+ MODEL_TENSOR.HC_FFN_FN,
+ MODEL_TENSOR.HC_FFN_SCALE,
+ ],
MODEL_ARCH.ERNIE4_5_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -4147,6 +4242,8 @@ class GGMLQuantizationType(IntEnum):
class ExpertGatingFuncType(IntEnum):
SOFTMAX = 1
SIGMOID = 2
+ SOFTMAX_WEIGHT = 3
+ SQRTSOFTPLUS = 4
# TODO: add GGMLFileType from ggml_ftype in ggml.h
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index a101382719d0..4e3c94de12b9 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -868,6 +868,26 @@ def add_moe_latent_size(self, value: int) -> None:
def add_nextn_predict_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
+    def add_hash_layer_count(self, count: int) -> None:
+        # Written as a uint32 KV under the arch-formatted HASH_LAYER_COUNT key.
+        kv_name = Keys.LLM.HASH_LAYER_COUNT.format(arch=self.arch)
+        self.add_uint32(kv_name, count)
+
+    def add_hyper_connection_count(self, count: int) -> None:
+        # Written as a uint32 KV under the arch-formatted HYPER_CONNECTION_COUNT key.
+        kv_name = Keys.LLM.HYPER_CONNECTION_COUNT.format(arch=self.arch)
+        self.add_uint32(kv_name, count)
+
+    def add_hyper_connection_sinkhorn_iters(self, count: int) -> None:
+        # Written as a uint32 KV under the arch-formatted HYPER_CONNECTION_SINKHORN_ITERS key.
+        kv_name = Keys.LLM.HYPER_CONNECTION_SINKHORN_ITERS.format(arch=self.arch)
+        self.add_uint32(kv_name, count)
+
+    def add_hyper_connection_eps(self, value: float) -> None:
+        # Written as a float32 KV under the arch-formatted HYPER_CONNECTION_EPS key.
+        kv_name = Keys.LLM.HYPER_CONNECTION_EPS.format(arch=self.arch)
+        self.add_float32(kv_name, value)
+
def add_swin_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
@@ -952,6 +964,26 @@ def add_attn_temperature_length(self, value: int) -> None:
def add_attn_temperature_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.TEMPERATURE_SCALE.format(arch=self.arch), value)
+    def add_attention_compress_ratios(self, values: Sequence[int]) -> None:
+        # Written as an array KV under the arch-formatted COMPRESS_RATIOS key.
+        kv_name = Keys.Attention.COMPRESS_RATIOS.format(arch=self.arch)
+        self.add_array(kv_name, values)
+
+    def add_attention_compress_rope_freq_base(self, value: float) -> None:
+        # Written as a float32 KV under the arch-formatted COMPRESS_ROPE_FREQ_BASE key.
+        kv_name = Keys.Attention.COMPRESS_ROPE_FREQ_BASE.format(arch=self.arch)
+        self.add_float32(kv_name, value)
+
+    def add_attention_output_lora_rank(self, value: int) -> None:
+        # Written as a uint32 KV under the arch-formatted OUTPUT_LORA_RANK key.
+        kv_name = Keys.Attention.OUTPUT_LORA_RANK.format(arch=self.arch)
+        self.add_uint32(kv_name, value)
+
+    def add_attention_output_group_count(self, value: int) -> None:
+        # Written as a uint32 KV under the arch-formatted OUTPUT_GROUP_COUNT key.
+        kv_name = Keys.Attention.OUTPUT_GROUP_COUNT.format(arch=self.arch)
+        self.add_uint32(kv_name, value)
+
def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
diff --git a/models/templates/deepseek-ai-DeepSeek-V4.jinja b/models/templates/deepseek-ai-DeepSeek-V4.jinja
new file mode 100644
index 000000000000..44d5b785ec04
--- /dev/null
+++ b/models/templates/deepseek-ai-DeepSeek-V4.jinja
@@ -0,0 +1,104 @@
+{#- DeepSeek V4 chat template: merged system text and tool schemas first, then User/Assistant turns. -#}
+{#- Tool calls are written as DSML blocks; reasoning is wrapped in the thinking tokens when enabled. -#}
+{%- if not add_generation_prompt is defined -%}
+    {%- set add_generation_prompt = false -%}
+{%- endif -%}
+{#- `thinking` wins when supplied; otherwise fall back to `enable_thinking`, then to false. -#}
+{%- if not thinking is defined -%}
+    {%- if enable_thinking is defined -%}
+        {%- set thinking = enable_thinking -%}
+    {%- else -%}
+        {%- set thinking = false -%}
+    {%- endif -%}
+{%- endif -%}
+{%- set dsml_token = '|DSML|' -%}
+{%- set thinking_start_token = '<think>' -%}
+{%- set thinking_end_token = '</think>' -%}
+{%- set tools_header = '## Tools\n\nYou have access to a set of tools to help answer the user question. You can invoke tools by writing a "<' + dsml_token + 'tool_calls>" block like the following:\n\n<' + dsml_token + 'tool_calls>\n<' + dsml_token + 'invoke name="$TOOL_NAME">\n<' + dsml_token + 'parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</' + dsml_token + 'parameter>\n...\n</' + dsml_token + 'invoke>\n<' + dsml_token + 'invoke name="$TOOL_NAME2">\n...\n</' + dsml_token + 'invoke>\n</' + dsml_token + 'tool_calls>\n\nString parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string="false"`.\n\nIf thinking_mode is enabled (triggered by ' + thinking_start_token + '), you MUST output your complete reasoning inside ' + thinking_start_token + '...' + thinking_end_token + ' BEFORE any tool calls or final response.\n\nOtherwise, output directly after ' + thinking_end_token + ' with tool calls or final response.\n\n### Available Tool Schemas\n\n' -%}
+{%- set tools_footer = '\n\nYou MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.' -%}
+{%- set ns = namespace(system_prompt='', is_first_system=true, pending_assistant=false, pending_tool_result=false) -%}
+{#- Merge every system message into one prompt, separated by blank lines. -#}
+{%- for message in messages -%}
+    {%- if message['role'] == 'system' -%}
+        {%- if ns.is_first_system -%}
+            {%- set ns.system_prompt = ns.system_prompt + (message['content'] or '') -%}
+            {%- set ns.is_first_system = false -%}
+        {%- else -%}
+            {%- set ns.system_prompt = ns.system_prompt + '\n\n' + (message['content'] or '') -%}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+{#- Append the tool instructions plus one JSON schema per line for each tool of type 'function'. -#}
+{%- if tools is defined and tools -%}
+    {%- set ts = namespace(schemas='') -%}
+    {%- for tool in tools -%}
+        {%- if tool['type'] == 'function' -%}
+            {%- set ts.schemas = ts.schemas + (tool['function'] | tojson) + '\n' -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {%- if ns.system_prompt -%}
+        {%- set ns.system_prompt = ns.system_prompt + '\n\n' + tools_header + ts.schemas + tools_footer -%}
+    {%- else -%}
+        {%- set ns.system_prompt = tools_header + ts.schemas + tools_footer -%}
+    {%- endif -%}
+{%- endif -%}
+{{- bos_token -}}
+{{- ns.system_prompt -}}
+{#- Render the conversation; system messages were already consumed above. -#}
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' -%}
+        {{- '<|User|>' + (message['content'] or '') -}}
+        {%- set ns.pending_assistant = true -%}
+        {%- set ns.pending_tool_result = true -%}
+    {%- elif message['role'] == 'tool' -%}
+        {%- if not ns.pending_tool_result -%}
+            {{- '<|User|>' -}}
+        {%- endif -%}
+        {#- NOTE(review): the strings wrapped around the tool output are empty; confirm the intended result delimiters. -#}
+        {{- '' + (message['content'] or '') + '' -}}
+        {%- set ns.pending_assistant = true -%}
+        {%- set ns.pending_tool_result = true -%}
+    {%- elif message['role'] == 'assistant' -%}
+        {%- if ns.pending_assistant -%}
+            {{- '<|Assistant|>' -}}
+            {%- if thinking and message['reasoning_content'] is defined and message['reasoning_content'] -%}
+                {{- thinking_start_token + message['reasoning_content'] + thinking_end_token -}}
+            {%- else -%}
+                {{- thinking_end_token -}}
+            {%- endif -%}
+        {%- endif -%}
+        {{- (message['content'] or '') -}}
+        {%- if message['tool_calls'] -%}
+            {{- '\n\n<' + dsml_token + 'tool_calls>\n' -}}
+            {%- for tool in message['tool_calls'] -%}
+                {%- set func = tool['function'] -%}
+                {{- '<' + dsml_token + 'invoke name="' + func['name'] + '">\n' -}}
+                {%- set args = func['arguments'] -%}
+                {%- if args is string -%}
+                    {%- set args = args | from_json -%}
+                {%- endif -%}
+                {%- for key, val in args.items() -%}
+                    {%- if val is string -%}
+                        {{- '<' + dsml_token + 'parameter name="' + key + '" string="true">' + val + '</' + dsml_token + 'parameter>\n' -}}
+                    {%- else -%}
+                        {{- '<' + dsml_token + 'parameter name="' + key + '" string="false">' + (val | tojson) + '</' + dsml_token + 'parameter>\n' -}}
+                    {%- endif -%}
+                {%- endfor -%}
+                {{- '</' + dsml_token + 'invoke>\n' -}}
+            {%- endfor -%}
+            {{- '</' + dsml_token + 'tool_calls>' -}}
+        {%- endif -%}
+        {{- '<|end▁of▁sentence|>' -}}
+        {%- set ns.pending_assistant = false -%}
+        {%- set ns.pending_tool_result = false -%}
+    {%- endif -%}
+{%- endfor -%}
+{#- Open a new assistant turn only when a user or tool message is still unanswered. -#}
+{%- if add_generation_prompt and ns.pending_assistant -%}
+    {{- '<|Assistant|>' -}}
+    {%- if thinking -%}
+        {{- thinking_start_token -}}
+    {%- else -%}
+        {{- thinking_end_token -}}
+    {%- endif -%}
+{%- endif -%}
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c9eead18aa39..e789e5a681ae 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -75,6 +75,7 @@ static const std::map LLM_ARCH_NAMES = {
{ LLM_ARCH_DEEPSEEK, "deepseek" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" },
+ { LLM_ARCH_DEEPSEEK4, "deepseek4" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_GLM4, "glm4" },
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
@@ -209,6 +210,10 @@ static const std::map LLM_KV_NAMES = {
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
{ LLM_KV_FULL_ATTENTION_INTERVAL, "%s.full_attention_interval" },
+ { LLM_KV_HASH_LAYER_COUNT, "%s.hash_layer_count" },
+ { LLM_KV_HYPER_CONNECTION_COUNT, "%s.hyper_connection.count" },
+ { LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS, "%s.hyper_connection.sinkhorn_iterations" },
+ { LLM_KV_HYPER_CONNECTION_EPS, "%s.hyper_connection.epsilon" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -243,6 +248,10 @@ static const std::map LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
{ LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" },
+ { LLM_KV_ATTENTION_COMPRESS_RATIOS, "%s.attention.compress_ratios" },
+ { LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE, "%s.attention.compress_rope_freq_base" },
+ { LLM_KV_ATTENTION_OUTPUT_LORA_RANK, "%s.attention.output_lora_rank" },
+ { LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT, "%s.attention.output_group_count" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
@@ -346,6 +355,9 @@ static const std::map LLM_TENSOR_NAMES = {
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT_NORM_LFM2, "token_embd_norm" }, // fix for wrong tensor name
{ LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_OUTPUT_HC_BASE, "output_hc_base" },
+ { LLM_TENSOR_OUTPUT_HC_FN, "output_hc_fn" },
+ { LLM_TENSOR_OUTPUT_HC_SCALE, "output_hc_scale" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
@@ -422,8 +434,15 @@ static const std::map LLM_TENSOR_NAMES = {
{ LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
{ LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+ { LLM_TENSOR_ATTN_KV, "blk.%d.attn_kv" },
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_ATTN_OUT_A, "blk.%d.attn_output_a" },
+ { LLM_TENSOR_ATTN_OUT_B, "blk.%d.attn_output_b" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_APE, "blk.%d.attn_compressor_ape" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_KV, "blk.%d.attn_compressor_kv" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_GATE, "blk.%d.attn_compressor_gate" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_NORM, "blk.%d.attn_compressor_norm" },
{ LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" },
{ LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" },
{ LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" },
@@ -548,6 +567,17 @@ static const std::map LLM_TENSOR_NAMES = {
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_APE, "blk.%d.indexer_compressor_ape" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_KV, "blk.%d.indexer_compressor_kv" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_GATE, "blk.%d.indexer_compressor_gate" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_NORM, "blk.%d.indexer_compressor_norm" },
+ { LLM_TENSOR_HC_ATTN_BASE, "blk.%d.hc_attn_base" },
+ { LLM_TENSOR_HC_ATTN_FN, "blk.%d.hc_attn_fn" },
+ { LLM_TENSOR_HC_ATTN_SCALE, "blk.%d.hc_attn_scale" },
+ { LLM_TENSOR_HC_FFN_BASE, "blk.%d.hc_ffn_base" },
+ { LLM_TENSOR_HC_FFN_FN, "blk.%d.hc_ffn_fn" },
+ { LLM_TENSOR_HC_FFN_SCALE, "blk.%d.hc_ffn_scale" },
+ { LLM_TENSOR_FFN_GATE_TID2EID, "blk.%d.ffn_gate_tid2eid" },
};
// declare information about the model weight tensors:
@@ -566,6 +596,9 @@ static const std::map LLM_TENSOR_INFOS = {
{LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
{LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // do the norms on the first layer (not the input layer)
{LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_OUTPUT_HC_BASE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}},
+ {LLM_TENSOR_OUTPUT_HC_FN, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_OUTPUT_HC_SCALE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_SCALE}},
{LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CLS_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
@@ -592,10 +625,15 @@ static const std::map LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_OUT_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_OUT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -757,6 +795,19 @@ static const std::map LLM_TENSOR_INFOS = {
{LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_HC_ATTN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_HC_ATTN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_HC_ATTN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
+ {LLM_TENSOR_HC_FFN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_HC_FFN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_HC_FFN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
+ {LLM_TENSOR_FFN_GATE_TID2EID, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
// last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
// the model loader doesn't fault on the block index.
@@ -902,6 +953,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
case LLM_ARCH_OLMO2:
case LLM_ARCH_OLMOE:
case LLM_ARCH_DEEPSEEK2:
+ case LLM_ARCH_DEEPSEEK4:
case LLM_ARCH_GLM_DSA:
case LLM_ARCH_BITNET:
case LLM_ARCH_T5:
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 89cf16cc37cf..a1dcb037c7a2 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -79,6 +79,7 @@ enum llm_arch {
LLM_ARCH_DEEPSEEK,
LLM_ARCH_DEEPSEEK2,
LLM_ARCH_DEEPSEEK2OCR,
+ LLM_ARCH_DEEPSEEK4,
LLM_ARCH_CHATGLM,
LLM_ARCH_GLM4,
LLM_ARCH_GLM4_MOE,
@@ -213,6 +214,10 @@ enum llm_kv {
LLM_KV_TOKEN_SHIFT_COUNT,
LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
LLM_KV_FULL_ATTENTION_INTERVAL,
+ LLM_KV_HASH_LAYER_COUNT,
+ LLM_KV_HYPER_CONNECTION_COUNT,
+ LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS,
+ LLM_KV_HYPER_CONNECTION_EPS,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -247,6 +252,10 @@ enum llm_kv {
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
LLM_KV_ATTENTION_INDEXER_TOP_K,
LLM_KV_ATTENTION_SHARED_KV_LAYERS,
+ LLM_KV_ATTENTION_COMPRESS_RATIOS,
+ LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE,
+ LLM_KV_ATTENTION_OUTPUT_LORA_RANK,
+ LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
@@ -354,6 +363,9 @@ enum llm_tensor {
LLM_TENSOR_DENSE_2_OUT,
LLM_TENSOR_DENSE_3_OUT,
LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_OUTPUT_HC_BASE,
+ LLM_TENSOR_OUTPUT_HC_FN,
+ LLM_TENSOR_OUTPUT_HC_SCALE,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
LLM_TENSOR_ROPE_FREQS,
@@ -482,12 +494,19 @@ enum llm_tensor {
LLM_TENSOR_CHANNEL_MIX_VALUE,
LLM_TENSOR_ATTN_Q_A,
LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV,
LLM_TENSOR_ATTN_KV_A_MQA,
LLM_TENSOR_ATTN_KV_B,
LLM_TENSOR_ATTN_K_B,
LLM_TENSOR_ATTN_V_B,
+ LLM_TENSOR_ATTN_OUT_A,
+ LLM_TENSOR_ATTN_OUT_B,
LLM_TENSOR_ATTN_Q_A_NORM,
LLM_TENSOR_ATTN_KV_A_NORM,
+ LLM_TENSOR_ATTN_COMPRESSOR_APE,
+ LLM_TENSOR_ATTN_COMPRESSOR_KV,
+ LLM_TENSOR_ATTN_COMPRESSOR_GATE,
+ LLM_TENSOR_ATTN_COMPRESSOR_NORM,
LLM_TENSOR_ATTN_SUB_NORM,
LLM_TENSOR_FFN_SUB_NORM,
LLM_TENSOR_DEC_ATTN_NORM,
@@ -549,6 +568,17 @@ enum llm_tensor {
LLM_TENSOR_INDEXER_PROJ,
LLM_TENSOR_INDEXER_ATTN_K,
LLM_TENSOR_INDEXER_ATTN_Q_B,
+ LLM_TENSOR_INDEXER_COMPRESSOR_APE,
+ LLM_TENSOR_INDEXER_COMPRESSOR_KV,
+ LLM_TENSOR_INDEXER_COMPRESSOR_GATE,
+ LLM_TENSOR_INDEXER_COMPRESSOR_NORM,
+ LLM_TENSOR_HC_ATTN_BASE,
+ LLM_TENSOR_HC_ATTN_FN,
+ LLM_TENSOR_HC_ATTN_SCALE,
+ LLM_TENSOR_HC_FFN_BASE,
+ LLM_TENSOR_HC_FFN_FN,
+ LLM_TENSOR_HC_FFN_SCALE,
+ LLM_TENSOR_FFN_GATE_TID2EID,
LLM_TENSOR_NEXTN_EH_PROJ,
LLM_TENSOR_NEXTN_EMBED_TOKENS,
LLM_TENSOR_NEXTN_ENORM,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index d62abc4009b8..b1b6aa350735 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -420,7 +420,7 @@ void llama_context::sched_reserve() {
const int64_t t_start_us = ggml_time_us();
- const uint32_t n_seqs = cparams.n_seq_max;
+ const uint32_t n_seqs = model.arch == LLM_ARCH_DEEPSEEK4 ? 1 : cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
const size_t max_nodes = this->graph_max_nodes(n_tokens);
@@ -596,6 +596,22 @@ void llama_context::sched_reserve() {
n_nodes_pp = ggml_graph_n_nodes(gf);
}
+ // DeepSeek V4 resumed-prompt chunks use the compressed-attention decode
+ // graph, which is larger than the position-zero prefill graph.
+ if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) {
+ const llama_pos reserve_pos0 = std::min(
+ cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens,
+ std::max(cparams.n_batch, 8u*n_tokens));
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+ model.hparams.no_alloc, nullptr, reserve_pos0);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate DeepSeek V4 resumed pp buffers");
+ }
+
+ n_splits_pp = std::max(n_splits_pp, ggml_backend_sched_get_n_splits(sched.get()));
+ n_nodes_pp = std::max(n_nodes_pp, ggml_graph_n_nodes(gf));
+ }
+
// reserve with tg (token generation) graph to get the number of splits and nodes
{
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
@@ -2171,6 +2187,15 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
return std::max(n_tokens * 40, 32u * model.n_tensors());
}
+ if (model.arch == LLM_ARCH_DEEPSEEK4) {
+ // DeepSeek V4 has a position-dependent compressed-attention decode path
+ // that creates many temporary tensor objects, especially when a long
+ // prompt is split into non-prefill ubatches. The visible graph node
+ // count is much smaller than the number of GGML objects allocated while
+ // building those graphs, so reserve a larger metadata arena than the
+ // generic tensor-count heuristic would provide.
+ return std::max(524288u, n_tokens * 192 + 64u * model.n_tensors());
+ }
uint32_t res = std::max(1024u, 8u*model.n_tensors());
for (const auto & lora : model.loras) {
res += lora->get_n_nodes();
@@ -2183,7 +2208,7 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
}
ggml_cgraph * llama_context::graph_reserve(
- uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
+ uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes, llama_pos pos0) {
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
GGML_ASSERT(n_outputs >= 1);
@@ -2207,6 +2232,14 @@ ggml_cgraph * llama_context::graph_reserve(
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
+ if (pos0 != 0 && ubatch.pos != nullptr) {
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+ ubatch.pos[i*ubatch.n_pos] = pos0 + i;
+ for (uint32_t j = 1; j < ubatch.n_pos; ++j) {
+ ubatch.pos[i*ubatch.n_pos + j] = 0;
+ }
+ }
+ }
// set one output token per sequence in order to activate all backend samplers
std::vector seq_ids(n_seqs);
@@ -3357,6 +3390,29 @@ llama_context * llama_init_from_model(
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
}
+ // V4 (DeepSeek4) requires fp16 KV cache: V4's standard SWA K cache,
+ // compressed-attention K cache (cache.attn_k), and indexer K cache
+ // (cache.index_k) all share the same `type_k` and must agree in dtype
+ // because src/models/deepseek4.cpp concatenates the SWA K view with the
+ // compressed K view via ggml_concat (which asserts a->type == b->type).
+ // Furthermore, V4's K activations are post-fp8-quantized
+ // (ggml_dsv4_fp8_kv_quantize), and q8_0's single fp16 scale per 32-element
+ // block cannot faithfully reproduce fp8-quantized value distributions --
+ // pinning to q8_0 corrupts decode silently ("=" loops, "Mirror ..."
+ // garbage). Coerce here, before the SPLIT_MODE_TENSOR / FA / V-quant
+ // shared validations below and before the constructor's flash_attn check,
+ // so those validations see the effective fp16 types and won't reject V4
+ // requests with --cache-type-k|v q8_0. See
+ // docs/plans/v4-port-kv-q8-completion.md.
+ if (model->arch == LLM_ARCH_DEEPSEEK4) {
+ if (params.type_k != GGML_TYPE_F16 || params.type_v != GGML_TYPE_F16) {
+ LLAMA_LOG_WARN("DeepSeek4: forcing fp16 KV cache (--cache-type-k|v are ignored for V4 because compressed/indexer K caches require fp16; "
+ "see docs/plans/v4-port-kv-q8-completion.md)\n");
+ params.type_k = GGML_TYPE_F16;
+ params.type_v = GGML_TYPE_F16;
+ }
+ }
+
if (model->split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
LLAMA_LOG_INFO("%s: enabling flash_attn since it is required for SPLIT_MODE_TENSOR\n", __func__);
diff --git a/src/llama-context.h b/src/llama-context.h
index e16ac4c618ba..999ba5a800c5 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -240,7 +240,8 @@ struct llama_context {
// reserve a graph with a dummy ubatch of the specified size
ggml_cgraph * graph_reserve(
- uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
+ uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr,
+ llama_pos pos0 = 0);
bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 858c297dd762..bbb74a0661b4 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -500,29 +500,41 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
}
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
- mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
- mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+ if (self_k_idxs && self_k_idxs->buffer) {
+ mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
+ }
+ if (self_v_idxs && self_v_idxs->buffer) {
+ mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+ }
- mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+ if (self_kq_mask && self_kq_mask->buffer) {
+ mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+ }
- mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
- mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+ if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+ mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
+ }
+ if (self_v_idxs_swa && self_v_idxs_swa->buffer) {
+ mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+ }
- mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+ if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
+ mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+ }
- if (self_k_rot) {
+ if (self_k_rot && self_k_rot->buffer) {
mctx->get_base()->set_input_k_rot(self_k_rot);
}
- if (self_v_rot) {
+ if (self_v_rot && self_v_rot->buffer) {
mctx->get_base()->set_input_v_rot(self_v_rot);
}
- if (self_k_rot_swa) {
+ if (self_k_rot_swa && self_k_rot_swa->buffer) {
mctx->get_swa()->set_input_k_rot(self_k_rot_swa);
}
- if (self_v_rot_swa) {
+ if (self_v_rot_swa && self_v_rot_swa->buffer) {
mctx->get_swa()->set_input_v_rot(self_v_rot_swa);
}
}
@@ -534,14 +546,19 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
bool res = true;
- res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
- //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+ if (self_k_idxs && self_k_idxs->buffer) {
+ res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+ //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
+ }
- res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
- //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+ if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+ res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+ //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
- res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
- res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
+ res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
+ }
return res;
}
@@ -591,7 +608,7 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
const int64_t n_rs = mctx->get_recr()->get_n_rs();
- if (inp_rs->s_copy) {
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
@@ -614,10 +631,12 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
- res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
- res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
- res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ }
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -635,7 +654,7 @@ void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) {
const int64_t n_rs = mctx->get_recr()->get_n_rs();
- if (inp_rs->s_copy) {
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
@@ -657,10 +676,12 @@ bool llm_graph_input_mem_hybrid_k::can_reuse(const llm_graph_params & params) {
res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
- res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
- res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
- res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ }
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -674,38 +695,46 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
// base tensors may not be allocated if there are no non-SWA attention layers
if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
- attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+ if (inp_attn->self_v_idxs && inp_attn->self_v_idxs->buffer) {
+ attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+ }
- attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+ if (inp_attn->self_kq_mask && inp_attn->self_kq_mask->buffer) {
+ attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+ }
}
// swa tensors may not be allocated if there are no SWA attention layers
if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
- attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+ if (inp_attn->self_v_idxs_swa && inp_attn->self_v_idxs_swa->buffer) {
+ attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+ }
- attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+ if (inp_attn->self_kq_mask_swa && inp_attn->self_kq_mask_swa->buffer) {
+ attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+ }
}
- if (inp_attn->self_k_rot) {
+ if (inp_attn->self_k_rot && inp_attn->self_k_rot->buffer) {
attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot);
}
- if (inp_attn->self_v_rot) {
+ if (inp_attn->self_v_rot && inp_attn->self_v_rot->buffer) {
attn_ctx->get_base()->set_input_v_rot(inp_attn->self_v_rot);
}
- if (inp_attn->self_k_rot_swa) {
+ if (inp_attn->self_k_rot_swa && inp_attn->self_k_rot_swa->buffer) {
attn_ctx->get_swa()->set_input_k_rot(inp_attn->self_k_rot_swa);
}
- if (inp_attn->self_v_rot_swa) {
+ if (inp_attn->self_v_rot_swa && inp_attn->self_v_rot_swa->buffer) {
attn_ctx->get_swa()->set_input_v_rot(inp_attn->self_v_rot_swa);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
- if (inp_rs->s_copy) {
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
@@ -741,10 +770,12 @@ bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params)
res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams);
}
- res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
- res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
- res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ }
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -1325,7 +1356,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * gate_up_exps,
ggml_tensor * up_exps_s,
ggml_tensor * gate_exps_s,
- ggml_tensor * down_exps_s) const {
+ ggml_tensor * down_exps_s,
+ ggml_tensor * selected_experts_in) const {
return build_moe_ffn(
cur,
gate_inp, /* gate_inp_b */ nullptr,
@@ -1345,7 +1377,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
/* gate_up_exps_b */ nullptr,
up_exps_s,
gate_exps_s,
- down_exps_s
+ down_exps_s,
+ selected_experts_in
);
}
@@ -1372,10 +1405,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * gate_up_exps_b,
ggml_tensor * up_exps_s,
ggml_tensor * gate_exps_s,
- ggml_tensor * down_exps_s) const {
+ ggml_tensor * down_exps_s,
+ ggml_tensor * selected_experts_in) const {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
+ const bool weight_before_down = arch == LLM_ARCH_DEEPSEEK4; // DeepSeek V4 applies routed weights after SwiGLU and before w2
ggml_tensor * logits = nullptr;
@@ -1401,6 +1436,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
{
probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
} break;
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS:
+ {
+ probs = ggml_sqrt(ctx0, ggml_softplus(ctx0, logits)); // [n_expert, n_tokens]
+ } break;
case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT:
{
probs = logits; // [n_expert, n_tokens]
@@ -1455,8 +1494,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
}
// select experts
- ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ ggml_tensor * selected_experts = selected_experts_in;
+ if (selected_experts == nullptr) {
+ selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ }
cb(selected_experts, "ffn_moe_topk", il);
if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
@@ -1584,6 +1626,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
switch (type_op) {
case LLM_FFN_SILU:
if (gate_exps) {
+ if (arch == LLM_ARCH_DEEPSEEK4 && il >= 0) {
+ const float limit = hparams.swiglu_clamp_exp[il];
+ constexpr float eps = 1e-6f;
+ if (limit > eps) {
+ cur = ggml_clamp(ctx0, cur, -INFINITY, limit);
+ cb(cur, "ffn_moe_gate_clamped", il);
+
+ ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+ cb(gate_act, "ffn_moe_silu", il);
+
+ up = ggml_clamp(ctx0, up, -limit, limit);
+ cb(up, "ffn_moe_up_clamped", il);
+
+ cur = ggml_mul(ctx0, gate_act, up);
+ cb(cur, "ffn_moe_swiglu_limited", il);
+ break;
+ }
+ }
+
// Step35: per-layer clamp for routed experts
if (arch == LLM_ARCH_STEP35 && il >= 0) {
const float limit = hparams.swiglu_clamp_exp[il];
@@ -1648,6 +1709,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
GGML_ABORT("fatal error");
}
+ if (weight_before_down) {
+ cur = ggml_mul(ctx0, cur, weights);
+ cb(cur, "ffn_moe_weighted_swiglu", il);
+ }
+
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);
@@ -1665,7 +1731,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(experts, "ffn_moe_down_scaled", il);
}
- if (!weight_before_ffn) {
+ if (!weight_before_ffn && !weight_before_down) {
experts = ggml_mul(ctx0, experts, weights);
cb(experts, "ffn_moe_weighted", il);
}
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 9e55d0a675e0..260334f7302f 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -849,7 +849,8 @@ struct llm_graph_context {
ggml_tensor * gate_up_exps = nullptr,
ggml_tensor * up_exps_s = nullptr,
ggml_tensor * gate_exps_s = nullptr,
- ggml_tensor * down_exps_s = nullptr) const;
+ ggml_tensor * down_exps_s = nullptr,
+ ggml_tensor * selected_experts_in = nullptr) const;
ggml_tensor * build_moe_ffn(
ggml_tensor * cur,
@@ -874,7 +875,8 @@ struct llm_graph_context {
ggml_tensor * gate_up_exps_b = nullptr,
ggml_tensor * up_exps_s = nullptr,
ggml_tensor * gate_exps_s = nullptr,
- ggml_tensor * down_exps_s = nullptr) const;
+ ggml_tensor * down_exps_s = nullptr,
+ ggml_tensor * selected_experts_in = nullptr) const;
//
// inputs
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 2239309c8fb4..44eaf501f7dc 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -153,6 +153,10 @@ uint32_t llama_hparams::n_embd_v_gqa_max() const {
}
uint32_t llama_hparams::n_embd_r() const {
+ if (dsv4_state_size != 0) {
+ return dsv4_state_size;
+ }
+
if (wkv_head_size != 0) {
// for RWKV models
return token_shift_count * n_embd;
@@ -177,6 +181,10 @@ uint32_t llama_hparams::n_embd_r() const {
}
uint32_t llama_hparams::n_embd_s() const {
+ if (dsv4_state_size != 0) {
+ return dsv4_state_size;
+ }
+
if (wkv_head_size != 0) {
// corresponds to RWKV's wkv_states size
return n_embd * wkv_head_size;
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index e2d051edc6cd..3a0438283e77 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -14,6 +14,7 @@ enum llama_expert_gating_func_type {
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS = 4,
};
enum llama_swa_type {
@@ -75,6 +76,8 @@ struct llama_hparams {
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
+ uint32_t n_lora_o = 0;
+ uint32_t n_attn_out_groups = 0;
uint32_t n_ff_exp = 0;
uint32_t n_ff_shexp = 0;
uint32_t n_ff_chexp = 0;
@@ -91,6 +94,7 @@ struct llama_hparams {
uint32_t moe_every_n_layers = 0;
uint32_t moe_latent_size = 0;
uint32_t nextn_predict_layers = 0;
+ uint32_t n_hash_layers = 0;
bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)
@@ -211,6 +215,14 @@ struct llama_hparams {
uint32_t indexer_head_size = 0;
uint32_t indexer_top_k = 0;
+ // DeepSeek V4 hyper-connections and sparse KV compression
+ uint32_t n_hc = 1;
+ uint32_t hc_sinkhorn_iters = 0;
+ float hc_eps = 0.0f;
+ float compress_rope_freq_base = 0.0f;
+ uint32_t dsv4_state_size = 0;
+ std::array attn_compress_ratio;
+
// qwen3vl deepstack
uint32_t n_deepstack_layers = 0;
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
index 26e2cb4270b0..9b9f17903637 100644
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
kv_base = std::make_unique<llama_kv_cache>(
- model, type_k, type_v,
+ model, hparams, type_k, type_v,
v_trans, offload, unified, size_base, n_seq_max, n_pad,
0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
kv_swa = std::make_unique<llama_kv_cache>(
- model, type_k, type_v,
+ model, hparams, type_k, type_v,
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
hparams.n_swa, hparams.swa_type, filter_swa, reuse);
}
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a49a055a6304..92585b671b55 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -79,6 +79,7 @@ static ggml_tensor * ggml_mul_mat_aux(
llama_kv_cache::llama_kv_cache(
const llama_model & model,
+ const llama_hparams & hparams,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
@@ -91,7 +92,7 @@ llama_kv_cache::llama_kv_cache(
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse) :
- model(model), hparams(model.hparams), v_trans(v_trans),
+ model(model), hparams(hparams), v_trans(v_trans),
n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
GGML_ASSERT(kv_size % n_pad == 0);
@@ -205,7 +206,7 @@ llama_kv_cache::llama_kv_cache(
}
const bool has_k = true;
- const bool has_v = !is_mla;
+ const bool has_v = !is_mla && model.arch != LLM_ARCH_DEEPSEEK4;
ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
@@ -253,7 +254,7 @@ llama_kv_cache::llama_kv_cache(
// allocate tensors and initialize the buffers to avoid NaNs in the padding
for (auto & [buft, ctx] : ctx_map) {
ggml_backend_buffer_t buf;
- if (model.hparams.no_alloc) {
+ if (hparams.no_alloc) {
buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 0b62dc7b2320..0b0a56ce92f4 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -95,6 +95,7 @@ class llama_kv_cache : public llama_memory_i {
llama_kv_cache(
const llama_model & model,
+ const llama_hparams & hparams,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp
index a59561ea54dd..58dadabc9f62 100644
--- a/src/llama-memory-hybrid-iswa.cpp
+++ b/src/llama-memory-hybrid-iswa.cpp
@@ -1,9 +1,113 @@
#include "llama-memory-hybrid-iswa.h"
+#include "ggml-backend.h"
+
#include "llama-impl.h"
+#include "llama-io.h"
#include "llama-model.h"
#include "llama-context.h"
+#include
+#include
+#include
+#include