diff --git a/common/chat.cpp b/common/chat.cpp
index 70b9f5dc2c58..38f7a2ed744a 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1661,6 +1661,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
static common_chat_params common_chat_params_init_deepseek_v3_2(const common_chat_template & tmpl,
const autoparser::generation_params & inputs) {
common_chat_params data;
+ const auto & src = tmpl.source();
data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs);
data.format = COMMON_CHAT_FORMAT_PEG_NATIVE;
@@ -1681,8 +1682,9 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
const std::string DSML = "|DSML|";
const std::string THINK_START = "";
const std::string THINK_END = "";
- const std::string FC_START = "<" + DSML + "function_calls>";
- const std::string FC_END = "" + DSML + "function_calls>";
+ const std::string FC_NAME = src.find("function_calls") != std::string::npos ? "function_calls" : "tool_calls";
+ const std::string FC_START = "<" + DSML + FC_NAME + ">";
+ const std::string FC_END = "" + DSML + FC_NAME + ">";
const std::string INVOKE_START = "<" + DSML + "invoke";
const std::string INVOKE_END = "" + DSML + "invoke>";
const std::string PARAM_START = "<" + DSML + "parameter";
@@ -2093,12 +2095,12 @@ std::optional common_chat_try_specialized_template(
return common_chat_params_init_gigachat_v3(tmpl, params);
}
- // DeepSeek V3.2 format detection: template defines dsml_token and uses it for tool calls.
+ // DeepSeek DSML format detection: template defines dsml_token and uses it for tool calls.
// The template source contains the token as a variable assignment, not as a literal in markup.
if (src.find("dsml_token") != std::string::npos &&
- src.find("function_calls") != std::string::npos &&
+ (src.find("function_calls") != std::string::npos || src.find("tool_calls") != std::string::npos) &&
src.find("DSML") != std::string::npos) {
- LOG_DBG("Using specialized template: DeepSeek V3.2\n");
+ LOG_DBG("Using specialized template: DeepSeek DSML\n");
return common_chat_params_init_deepseek_v3_2(tmpl, params);
}
diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2c38123dff8d..bba37a5cbbc7 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -47,6 +47,7 @@
"DeepseekForCausalLM": "deepseek",
"DeepseekV2ForCausalLM": "deepseek",
"DeepseekV3ForCausalLM": "deepseek",
+ "DeepseekV4ForCausalLM": "deepseek",
"DistilBertForMaskedLM": "bert",
"DistilBertForSequenceClassification": "bert",
"DistilBertModel": "bert",
diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index e149fcbf752e..86a3046b9e98 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -1,18 +1,26 @@
from __future__ import annotations
+import concurrent.futures
+import ctypes
+import math
+import os
import re
-from typing import Any, Callable, Iterable, TYPE_CHECKING
+from pathlib import Path
+from typing import Any, Callable, Iterable, Sequence, TYPE_CHECKING
+import numpy as np
import torch
if TYPE_CHECKING:
from torch import Tensor
-from .base import MmprojModel, ModelBase, TextModel, gguf, logger
+from .base import LazyTorchTensor, MmprojModel, ModelBase, TextModel, gguf, logger
from .qwen import QwenModel
+TORCH_FLOAT8_E8M0FNU = getattr(torch, "float8_e8m0fnu", None)
+
@ModelBase.register("DeepseekOCRForCausalLM")
class DeepseekOCRVisionModel(MmprojModel):
@@ -386,3 +394,648 @@ def prepare_tensors(self):
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("DeepseekV4ForCausalLM")
+class DeepseekV4Model(TextModel):
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK4
+
+ # Optional DeepSeek V4 debug / expert-quant knobs. In the pre-#17114
+ # monolithic convert_hf_to_gguf.py these were ModelBase.__init__ params
+ # wired to --deepseek4-* CLI flags. The refactored conversion/base.py
+ # ModelBase.__init__ does not accept them, so they default here; the
+ # standard DeepseekV4ForCausalLM conversion path does not require them.
+ deepseek4_max_layers: int | None = None
+ deepseek4_expert_outtypes: str | None = None
+ deepseek4_expert_workers: int = 1
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ _fp4_table = torch.tensor([
+ 0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
+ 0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,
+ ], dtype=torch.float32)
+
+ _qtype_aliases: dict[str, gguf.GGMLQuantizationType] = {
+ "q8_0": gguf.GGMLQuantizationType.Q8_0,
+ "q2_k": gguf.GGMLQuantizationType.Q2_K,
+ "iq2_xxs": gguf.GGMLQuantizationType.IQ2_XXS,
+ "iq2_xs": gguf.GGMLQuantizationType.IQ2_XS,
+ "tq1_0": gguf.GGMLQuantizationType.TQ1_0,
+ "tq2_0": gguf.GGMLQuantizationType.TQ2_0,
+ }
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self._deepseek4_original_block_count = self.block_count
+ if self.deepseek4_max_layers is not None:
+ if self.deepseek4_max_layers <= 0:
+ raise ValueError("--deepseek4-max-layers must be positive")
+ if self.deepseek4_max_layers > self.block_count:
+ raise ValueError(
+ f"--deepseek4-max-layers={self.deepseek4_max_layers} exceeds model layer count {self.block_count}"
+ )
+ self.block_count = self.deepseek4_max_layers
+ self.hparams["num_hidden_layers"] = self.block_count
+ self.hparams["n_layers"] = self.block_count
+ self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+ logger.warning(
+ "DeepSeek V4 debug export: writing only the first %d/%d transformer layers",
+ self.block_count,
+ self._deepseek4_original_block_count,
+ )
+
+ self._deepseek4_expert_qtypes = self._parse_expert_outtype_spec(self.deepseek4_expert_outtypes)
+
+ def set_vocab(self):
+ self._set_vocab_gpt2()
+
+ def set_gguf_parameters(self):
+ self.hparams["num_key_value_heads"] = self.hparams.get("num_key_value_heads", 1)
+
+ super().set_gguf_parameters()
+ hparams = self.hparams
+
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+ self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+ self.gguf_writer.add_attention_output_lora_rank(hparams["o_lora_rank"])
+ self.gguf_writer.add_attention_output_group_count(hparams["o_groups"])
+ self.gguf_writer.add_attention_compress_ratios(hparams["compress_ratios"])
+ self.gguf_writer.add_attention_compress_rope_freq_base(hparams["compress_rope_theta"])
+
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+ self.gguf_writer.add_expert_weights_scale(hparams.get("routed_scaling_factor", 1.0))
+ self.gguf_writer.add_hash_layer_count(min(hparams["num_hash_layers"], self.block_count))
+ if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None:
+ self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+ if (swiglu_limit := hparams.get("swiglu_limit")) is not None and float(swiglu_limit) > 0.0:
+ self.gguf_writer.add_swiglu_clamp_exp([float(swiglu_limit)] * self.block_count)
+
+ if (sliding_window := hparams.get("sliding_window")) is not None:
+ self.gguf_writer.add_sliding_window(sliding_window)
+
+ self.gguf_writer.add_indexer_head_count(hparams["index_n_heads"])
+ self.gguf_writer.add_indexer_key_length(hparams["index_head_dim"])
+ self.gguf_writer.add_indexer_top_k(hparams["index_topk"])
+
+ if self.deepseek4_max_layers is None and (num_nextn_predict_layers := hparams.get("num_nextn_predict_layers")) is not None:
+ self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
+ self.gguf_writer.add_hyper_connection_count(hparams["hc_mult"])
+ self.gguf_writer.add_hyper_connection_sinkhorn_iters(hparams["hc_sinkhorn_iters"])
+ self.gguf_writer.add_hyper_connection_eps(hparams["hc_eps"])
+
+ @staticmethod
+ def _strip_model_prefix(name: str) -> str:
+ return name.removeprefix("model.")
+
+ def _skip_layer_tensor(self, stripped_name: str) -> bool:
+ if self.deepseek4_max_layers is None:
+ return False
+ match = re.match(r"layers\.(\d+)\.", stripped_name)
+ return match is not None and int(match.group(1)) >= self.block_count
+
+ @staticmethod
+ def _is_low_bit_ftype(ftype: gguf.LlamaFileType) -> bool:
+ return ftype in (
+ gguf.LlamaFileType.MOSTLY_TQ1_0,
+ gguf.LlamaFileType.MOSTLY_TQ2_0,
+ gguf.LlamaFileType.MOSTLY_Q2_K,
+ gguf.LlamaFileType.MOSTLY_IQ2_XXS,
+ gguf.LlamaFileType.MOSTLY_IQ2_XS,
+ )
+
+ @staticmethod
+ def _qtype_for_ftype(ftype: gguf.LlamaFileType) -> gguf.GGMLQuantizationType | None:
+ return {
+ gguf.LlamaFileType.MOSTLY_TQ1_0: gguf.GGMLQuantizationType.TQ1_0,
+ gguf.LlamaFileType.MOSTLY_TQ2_0: gguf.GGMLQuantizationType.TQ2_0,
+ gguf.LlamaFileType.MOSTLY_Q2_K: gguf.GGMLQuantizationType.Q2_K,
+ gguf.LlamaFileType.MOSTLY_IQ2_XXS: gguf.GGMLQuantizationType.IQ2_XXS,
+ gguf.LlamaFileType.MOSTLY_IQ2_XS: gguf.GGMLQuantizationType.IQ2_XS,
+ gguf.LlamaFileType.MOSTLY_Q8_0: gguf.GGMLQuantizationType.Q8_0,
+ }.get(ftype)
+
+ @classmethod
+ def _parse_qtype_name(cls, name: str) -> gguf.GGMLQuantizationType:
+ qtype = cls._qtype_aliases.get(name.strip().lower())
+ if qtype is None:
+ allowed = ", ".join(sorted(cls._qtype_aliases))
+ raise ValueError(f"unknown DeepSeek V4 expert outtype {name!r}; expected one of: {allowed}")
+ return qtype
+
+ @classmethod
+ def _parse_expert_outtype_spec(cls, spec: str | None) -> dict[str, gguf.GGMLQuantizationType]:
+ if spec is None:
+ return {}
+
+ result: dict[str, gguf.GGMLQuantizationType] = {}
+ for item in spec.split(","):
+ item = item.strip()
+ if not item:
+ continue
+ if "=" not in item:
+ qtype = cls._parse_qtype_name(item)
+ result.update({"w1": qtype, "w2": qtype, "w3": qtype})
+ continue
+ key, value = (part.strip().lower() for part in item.split("=", 1))
+ if key not in ("w1", "w2", "w3", "gate", "down", "up"):
+ raise ValueError(f"unknown DeepSeek V4 expert tensor selector {key!r}")
+ wid = {"gate": "w1", "down": "w2", "up": "w3"}.get(key, key)
+ result[wid] = cls._parse_qtype_name(value)
+ return result
+
+ @staticmethod
+ def _scale_to_float(scale: Tensor) -> Tensor:
+ if TORCH_FLOAT8_E8M0FNU is not None and scale.dtype == TORCH_FLOAT8_E8M0FNU:
+ return scale.float()
+
+ if scale.dtype in (torch.uint8, torch.int8):
+ e = scale.view(torch.uint8).to(torch.int32)
+ bits = torch.where(
+ e == 0,
+ torch.full_like(e, 0x00400000),
+ e << 23,
+ )
+ return bits.view(torch.float32)
+
+ return scale.float()
+
+ @staticmethod
+ def _scale_to_e8m0_bytes(scale: Tensor) -> Tensor:
+ if TORCH_FLOAT8_E8M0FNU is not None and scale.dtype == TORCH_FLOAT8_E8M0FNU:
+ return scale.view(torch.uint8)
+ if scale.dtype in (torch.uint8, torch.int8):
+ return scale.view(torch.uint8)
+
+ scale = scale.float()
+ e = torch.where(
+ scale > 0,
+ torch.floor(torch.log2(scale)).to(torch.int32) + 127,
+ torch.zeros_like(scale, dtype=torch.int32),
+ )
+ return torch.clamp(e, 0, 255).to(torch.uint8)
+
+ @classmethod
+ def _dequant_fp8_weight(cls, weight: Tensor, scale: Tensor, block_size: Sequence[int]) -> Tensor:
+ if len(block_size) != 2:
+ raise ValueError(f"DeepSeek V4 expects 2D FP8 block scales, got block size {block_size}")
+
+ block_out, block_in = block_size
+ out_dim, in_dim = weight.shape
+ if out_dim % block_out != 0 or in_dim % block_in != 0:
+ raise ValueError(f"FP8 tensor shape {tuple(weight.shape)} is not divisible by block size {block_size}")
+
+ scale = cls._scale_to_float(scale)
+ expected_scale = (out_dim // block_out, in_dim // block_in)
+ if tuple(scale.shape) != expected_scale:
+ raise ValueError(f"FP8 scale shape {tuple(scale.shape)} does not match expected {expected_scale}")
+
+ weight = weight.reshape(out_dim // block_out, block_out, in_dim // block_in, block_in)
+ weight = weight.float() * scale[:, None, :, None]
+ return weight.reshape(out_dim, in_dim)
+
+ @classmethod
+ def _dequant_fp4_weight(cls, weight: Tensor, scale: Tensor) -> Tensor:
+ weight = weight.view(torch.uint8)
+ out_dim, packed_in_dim = weight.shape
+ in_dim = packed_in_dim * 2
+ if in_dim % 32 != 0:
+ raise ValueError(f"FP4 packed tensor shape {tuple(weight.shape)} does not contain 32-value blocks")
+
+ n_blocks = in_dim // 32
+ scale = cls._scale_to_float(scale)
+ if tuple(scale.shape) != (out_dim, n_blocks):
+ raise ValueError(f"FP4 scale shape {tuple(scale.shape)} does not match expected {(out_dim, n_blocks)}")
+
+ fp4_table = cls._fp4_table.to(weight.device)
+ packed = weight.reshape(out_dim, n_blocks, 16)
+ low = packed & 0x0F
+ high = (packed >> 4) & 0x0F
+ vals = torch.stack((low, high), dim=-1).reshape(out_dim, n_blocks, 32)
+ vals = fp4_table[vals.long()] * scale.unsqueeze(-1)
+ return vals.reshape(out_dim, in_dim)
+
+ @classmethod
+ def _pack_fp4_as_mxfp4(cls, weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
+ weight = weight.view(torch.uint8)
+ out_dim, packed_in_dim = weight.shape
+ in_dim = packed_in_dim * 2
+ if in_dim % 32 != 0:
+ raise ValueError(f"FP4 packed tensor shape {tuple(weight.shape)} does not contain 32-value blocks")
+
+ n_blocks = in_dim // 32
+ scale_e = cls._scale_to_e8m0_bytes(scale)
+ if tuple(scale_e.shape) != (out_dim, n_blocks):
+ raise ValueError(f"FP4 scale shape {tuple(scale_e.shape)} does not match expected {(out_dim, n_blocks)}")
+
+ packed = weight.reshape(out_dim, n_blocks, 16)
+ low = packed & 0x0F
+ high = (packed >> 4) & 0x0F
+ vals = torch.stack((low, high), dim=-1).reshape(out_dim, n_blocks, 32)
+ qs = vals[:, :, :16] | (vals[:, :, 16:] << 4)
+ raw = torch.cat((scale_e.unsqueeze(-1), qs), dim=-1).reshape(out_dim, n_blocks * 17)
+ return raw.numpy(), [out_dim, in_dim]
+
+ _ggml_quant_lib: Any = None
+
+ @classmethod
+ def _load_ggml_quant_lib(cls):
+ if cls._ggml_quant_lib is not None:
+ return cls._ggml_quant_lib
+
+ # This module lives in the conversion/ package; the repo root (where
+ # build/bin/libggml.* lands) is its parent's parent. In the pre-#17114
+ # monolithic convert_hf_to_gguf.py, __file__ was the repo-root script,
+ # so .parent alone was the repo root -- search both so the lookup is
+ # correct regardless of package layout.
+ repo_root = Path(__file__).resolve().parent.parent
+ pkg_root = Path(__file__).resolve().parent
+ candidates = [
+ os.environ.get("LLAMA_CPP_LIBGGML"),
+ repo_root / "build" / "bin" / "libggml.dylib",
+ repo_root / "build" / "bin" / "libggml.so",
+ repo_root / "build" / "bin" / "ggml.dll",
+ pkg_root / "build" / "bin" / "libggml.dylib",
+ pkg_root / "build" / "bin" / "libggml.so",
+ pkg_root / "build" / "bin" / "ggml.dll",
+ ]
+ for candidate in candidates:
+ if candidate is None:
+ continue
+ path = Path(candidate)
+ if not path.is_file():
+ continue
+ lib = ctypes.CDLL(str(path))
+ lib.ggml_quantize_chunk.restype = ctypes.c_size_t
+ lib.ggml_quantize_chunk.argtypes = (
+ ctypes.c_int,
+ ctypes.POINTER(ctypes.c_float),
+ ctypes.c_void_p,
+ ctypes.c_int64,
+ ctypes.c_int64,
+ ctypes.c_int64,
+ ctypes.POINTER(ctypes.c_float),
+ )
+ lib.ggml_quantize_requires_imatrix.restype = ctypes.c_bool
+ lib.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,)
+ cls._ggml_quant_lib = lib
+ return lib
+
+ raise RuntimeError(
+ "DeepSeek V4 low-bit expert conversion needs llama.cpp's libggml. "
+ "Build llama.cpp first or set LLAMA_CPP_LIBGGML to libggml."
+ )
+
+ @classmethod
+ def _quantize_deepseek4_expert(cls, data: np.ndarray, qtype: gguf.GGMLQuantizationType) -> np.ndarray:
+ c_quantized_types = {
+ gguf.GGMLQuantizationType.Q2_K,
+ gguf.GGMLQuantizationType.IQ2_XXS,
+ gguf.GGMLQuantizationType.IQ2_XS,
+ }
+ if qtype not in c_quantized_types:
+ return gguf.quants.quantize(data, qtype)
+
+ data = np.ascontiguousarray(data, dtype=np.float32)
+ out = np.zeros(gguf.quant_shape_to_byte_shape(data.shape, qtype), dtype=np.uint8, order="C")
+ lib = cls._load_ggml_quant_lib()
+ nrows = math.prod(data.shape[:-1])
+ n_per_row = data.shape[-1]
+ imatrix = ctypes.cast(0, ctypes.POINTER(ctypes.c_float))
+ if lib.ggml_quantize_requires_imatrix(qtype.value):
+ qw = np.ascontiguousarray(np.sum(data.reshape(-1, n_per_row) ** 2, axis=0), dtype=np.float32)
+ imatrix = qw.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
+ result_size = lib.ggml_quantize_chunk(
+ qtype.value,
+ data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+ out.ctypes.data_as(ctypes.c_void_p),
+ 0,
+ nrows,
+ n_per_row,
+ imatrix,
+ )
+ if result_size != out.size:
+ raise RuntimeError(f"ggml_quantize_chunk wrote {result_size} bytes, expected {out.size}")
+ return out
+
+ def _write_deepseek4_tid2eid_tensors(self) -> set[str]:
+ consumed: set[str] = set()
+ for name in list(self.model_tensors.keys()):
+ stripped = self._strip_model_prefix(name)
+ if self._skip_layer_tensor(stripped):
+ consumed.add(name)
+ continue
+ if re.match(r"layers\.\d+\.ffn\.gate\.tid2eid$", stripped) is None:
+ continue
+
+ data = LazyTorchTensor.to_eager(self.model_tensors[name]()).to(torch.int32).numpy()
+ new_name = self.map_tensor_name(stripped)
+ logger.info(f"{new_name}, int32 --> I32, shape = {{{', '.join(str(n) for n in reversed(data.shape))}}}")
+ self.gguf_writer.add_tensor(new_name, data)
+ consumed.add(name)
+ return consumed
+
+ def _write_deepseek4_expert_tensors(self) -> set[str]:
+ default_qtype = self._qtype_for_ftype(self.ftype)
+ if default_qtype is None and not self._deepseek4_expert_qtypes:
+ if any(re.match(r"(?:model\.)?layers\.\d+\.ffn\.experts\.\d+\.w[123]\.weight$", name) for name in self.model_tensors):
+ raise NotImplementedError(
+ "DeepSeek V4 routed FP4 experts must be converted directly to a compact GGUF type. "
+ "Use --outtype iq2_xxs, iq2_xs, q2_k, tq2_0, tq1_0, or q8_0."
+ )
+ return set()
+
+ n_experts = self.hparams["n_routed_experts"]
+ consumed: set[str] = set()
+ groups: dict[tuple[int, str], dict[int, tuple[str, str]]] = {}
+
+ for name in list(self.model_tensors.keys()):
+ stripped = self._strip_model_prefix(name)
+ if self._skip_layer_tensor(stripped):
+ consumed.add(name)
+ continue
+ match = re.match(r"layers\.(\d+)\.ffn\.experts\.(\d+)\.(w[123])\.weight$", stripped)
+ if match is None:
+ continue
+
+ bid = int(match.group(1))
+ xid = int(match.group(2))
+ wid = match.group(3)
+ qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype)
+ if qtype is None:
+ raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}")
+ scale_name = f"{stripped.removesuffix('.weight')}.scale"
+ model_scale_name = scale_name if scale_name in self.model_tensors else f"model.{scale_name}"
+ if model_scale_name not in self.model_tensors:
+ raise ValueError(f"Missing DeepSeek V4 FP4 scale tensor for {stripped}")
+
+ groups.setdefault((bid, wid), {})[xid] = (name, model_scale_name)
+ consumed.update((name, model_scale_name))
+
+ def convert_one(name: str, model_scale_name: str, qtype: gguf.GGMLQuantizationType) -> np.ndarray:
+ weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
+ scale = LazyTorchTensor.to_eager(self.model_tensors[model_scale_name]())
+
+ if qtype == gguf.GGMLQuantizationType.MXFP4:
+ data, _ = self._pack_fp4_as_mxfp4(weight, scale)
+ return data
+
+ data = self._dequant_fp4_weight(weight, scale).numpy()
+ return self._quantize_deepseek4_expert(data, qtype)
+
+ def add_merged_tensor(bid: int, wid: str, qtype: gguf.GGMLQuantizationType, experts: dict[int, np.ndarray]) -> None:
+ missing = sorted(set(range(n_experts)).difference(experts))
+ if missing:
+ raise ValueError(f"Missing DeepSeek V4 expert tensors for layer {bid} {wid}: {missing[:8]}")
+
+ merged = np.stack([experts[i] for i in range(n_experts)], axis=0)
+ merged_name = f"layers.{bid}.ffn.experts.{wid}.weight"
+ new_name = self.map_tensor_name(merged_name)
+ shape = gguf.quant_shape_from_byte_shape(merged.shape, qtype) if merged.dtype == np.uint8 else merged.shape
+ shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
+ logger.info(f"{new_name}, DeepSeek FP4 --> {qtype.name}, shape = {shape_str}")
+ self.gguf_writer.add_tensor(new_name, merged, raw_dtype=qtype)
+
+ worker_count = max(1, self.deepseek4_expert_workers)
+ for bid, wid in sorted(groups):
+ qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype)
+ if qtype is None:
+ raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}")
+ group = groups[(bid, wid)]
+ experts: dict[int, np.ndarray] = {}
+ logger.info(
+ "DeepSeek V4: quantizing blk.%d %s experts to %s with %d worker%s",
+ bid,
+ wid,
+ qtype.name,
+ worker_count,
+ "" if worker_count == 1 else "s",
+ )
+
+ if worker_count == 1:
+ for done, xid in enumerate(sorted(group), start=1):
+ name, model_scale_name = group[xid]
+ experts[xid] = convert_one(name, model_scale_name, qtype)
+ if done % 32 == 0 or done == n_experts:
+ logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts)
+ else:
+ max_pending = worker_count * 2
+ pending: dict[concurrent.futures.Future[np.ndarray], int] = {}
+ xids = iter(sorted(group))
+ done = 0
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
+ def submit_next() -> bool:
+ try:
+ xid = next(xids)
+ except StopIteration:
+ return False
+ name, model_scale_name = group[xid]
+ future = executor.submit(convert_one, name, model_scale_name, qtype)
+ pending[future] = xid
+ return True
+
+ while len(pending) < max_pending and submit_next():
+ pass
+
+ while pending:
+ finished, _ = concurrent.futures.wait(
+ pending,
+ return_when=concurrent.futures.FIRST_COMPLETED,
+ )
+ for future in finished:
+ xid = pending.pop(future)
+ experts[xid] = future.result()
+ done += 1
+ if done % 32 == 0 or done == n_experts:
+ logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts)
+ submit_next()
+
+ add_merged_tensor(bid, wid, qtype, experts)
+
+ return consumed
+
+ def _prepare_deepseek4_scaled_tensors(self) -> None:
+ block_size = (self.hparams.get("quantization_config") or {}).get("weight_block_size", [128, 128])
+ consumed: set[str] = set()
+
+ for name in list(self.model_tensors.keys()):
+ stripped = self._strip_model_prefix(name)
+ if stripped.startswith("mtp.") or self._skip_layer_tensor(stripped):
+ consumed.add(name)
+
+ consumed.update(self._write_deepseek4_tid2eid_tensors())
+ consumed.update(self._write_deepseek4_expert_tensors())
+
+ for name in list(self.model_tensors.keys()):
+ if name in consumed:
+ continue
+ stripped = self._strip_model_prefix(name)
+ if not stripped.endswith(".scale"):
+ continue
+ if re.match(r"layers\.\d+\.ffn\.experts\.\d+\.w[123]\.scale$", stripped) is not None:
+ continue
+
+ weight_name = f"{stripped.removesuffix('.scale')}.weight"
+ model_weight_name = weight_name if weight_name in self.model_tensors else f"model.{weight_name}"
+ if model_weight_name not in self.model_tensors:
+ raise ValueError(f"Missing DeepSeek V4 FP8 weight tensor for scale {stripped}")
+
+ w = self.model_tensors[model_weight_name]
+ s = self.model_tensors[name]
+ self.model_tensors[model_weight_name] = (
+ lambda w=w, s=s, bs=block_size: self._dequant_fp8_weight(
+ LazyTorchTensor.to_eager(w()),
+ LazyTorchTensor.to_eager(s()),
+ bs,
+ )
+ )
+ consumed.add(name)
+
+ for name in consumed:
+ self.model_tensors.pop(name, None)
+
+ def prepare_tensors(self):
+ self._prepare_deepseek4_scaled_tensors()
+
+ if any(name.endswith(".scale") for name in self.model_tensors):
+ raise NotImplementedError("Unhandled DeepSeek V4 scale tensors remain after conversion preparation")
+
+ super().prepare_tensors()
+
+ if self._experts is not None:
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+ del name
+ del new_name
+ del bid
+
+ if not self._is_low_bit_ftype(self.ftype) or n_dims <= 1:
+ return False
+
+ # DeepSeek V4 routed experts are handled in _write_deepseek4_expert_tensors(),
+ # where each expert is converted directly from FP4 to the requested compact
+ # GGUF type. Keep the rest of the model in float form so attention,
+ # hyper-connections, indexers, compressors, shared experts and logits do not
+ # inherit the global low-bit file type.
+ return gguf.GGMLQuantizationType.F16
+
+ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+ mapped = self._map_tensor_name_deepseek4(name)
+ if mapped is not None:
+ return mapped
+ return super().map_tensor_name(name, try_suffixes)
+
+ def _map_tensor_name_deepseek4(self, name: str) -> str | None:
+ if name.startswith("model."):
+ name = name.removeprefix("model.")
+
+ top_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+ "embed.weight": (gguf.MODEL_TENSOR.TOKEN_EMBD, ".weight"),
+ "norm.weight": (gguf.MODEL_TENSOR.OUTPUT_NORM, ".weight"),
+ "head.weight": (gguf.MODEL_TENSOR.OUTPUT, ".weight"),
+ "hc_head_base": (gguf.MODEL_TENSOR.OUTPUT_HC_BASE, ".weight"),
+ "hc_head_fn": (gguf.MODEL_TENSOR.OUTPUT_HC_FN, ".weight"),
+ "hc_head_scale": (gguf.MODEL_TENSOR.OUTPUT_HC_SCALE, ".weight"),
+ }
+ if name in top_level:
+ tensor, suffix = top_level[name]
+ return self.format_tensor_name(tensor, suffix=suffix)
+
+ match = re.match(r"layers\.(\d+)\.(.+)", name)
+ if match is None:
+ return None
+
+ bid = int(match.group(1))
+ rest = match.group(2)
+
+ layer_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+ "hc_attn_base": (gguf.MODEL_TENSOR.HC_ATTN_BASE, ".weight"),
+ "hc_attn_fn": (gguf.MODEL_TENSOR.HC_ATTN_FN, ".weight"),
+ "hc_attn_scale": (gguf.MODEL_TENSOR.HC_ATTN_SCALE, ".weight"),
+ "hc_ffn_base": (gguf.MODEL_TENSOR.HC_FFN_BASE, ".weight"),
+ "hc_ffn_fn": (gguf.MODEL_TENSOR.HC_FFN_FN, ".weight"),
+ "hc_ffn_scale": (gguf.MODEL_TENSOR.HC_FFN_SCALE, ".weight"),
+ "attn.attn_sink": (gguf.MODEL_TENSOR.ATTN_SINKS, ".weight"),
+ "attn.wq_a.weight": (gguf.MODEL_TENSOR.ATTN_Q_A, ".weight"),
+ "attn.wq_b.weight": (gguf.MODEL_TENSOR.ATTN_Q_B, ".weight"),
+ "attn.q_norm.weight": (gguf.MODEL_TENSOR.ATTN_Q_A_NORM, ".weight"),
+ "attn.wkv.weight": (gguf.MODEL_TENSOR.ATTN_KV, ".weight"),
+ "attn.kv_norm.weight": (gguf.MODEL_TENSOR.ATTN_KV_A_NORM, ".weight"),
+ "attn.wo_a.weight": (gguf.MODEL_TENSOR.ATTN_OUT_A, ".weight"),
+ "attn.wo_b.weight": (gguf.MODEL_TENSOR.ATTN_OUT_B, ".weight"),
+ "attn.compressor.ape": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_APE, ".weight"),
+ "attn.compressor.wkv.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_KV, ".weight"),
+ "attn.compressor.wgate.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_GATE, ".weight"),
+ "attn.compressor.norm.weight": (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_NORM, ".weight"),
+ "attn.indexer.wq_b.weight": (gguf.MODEL_TENSOR.INDEXER_ATTN_Q_B, ".weight"),
+ "attn.indexer.weights_proj.weight": (gguf.MODEL_TENSOR.INDEXER_PROJ, ".weight"),
+ "attn.indexer.compressor.ape": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_APE, ".weight"),
+ "attn.indexer.compressor.wkv.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_KV, ".weight"),
+ "attn.indexer.compressor.wgate.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_GATE, ".weight"),
+ "attn.indexer.compressor.norm.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_NORM, ".weight"),
+ "attn_norm.weight": (gguf.MODEL_TENSOR.ATTN_NORM, ".weight"),
+ "ffn_norm.weight": (gguf.MODEL_TENSOR.FFN_NORM, ".weight"),
+ "ffn.shared_experts.w1.weight": (gguf.MODEL_TENSOR.FFN_GATE_SHEXP, ".weight"),
+ "ffn.shared_experts.w3.weight": (gguf.MODEL_TENSOR.FFN_UP_SHEXP, ".weight"),
+ "ffn.shared_experts.w2.weight": (gguf.MODEL_TENSOR.FFN_DOWN_SHEXP, ".weight"),
+ "ffn.gate.weight": (gguf.MODEL_TENSOR.FFN_GATE_INP, ".weight"),
+ "ffn.gate.bias": (gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, ".bias"),
+ "ffn.gate.tid2eid": (gguf.MODEL_TENSOR.FFN_GATE_TID2EID, ".weight"),
+ "ffn.experts.w1.weight": (gguf.MODEL_TENSOR.FFN_GATE_EXP, ".weight"),
+ "ffn.experts.w3.weight": (gguf.MODEL_TENSOR.FFN_UP_EXP, ".weight"),
+ "ffn.experts.w2.weight": (gguf.MODEL_TENSOR.FFN_DOWN_EXP, ".weight"),
+ }
+ if rest in layer_level:
+ tensor, suffix = layer_level[rest]
+ return self.format_tensor_name(tensor, bid, suffix=suffix)
+
+ return None
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ if name.startswith("model."):
+ name = name.removeprefix("model.")
+
+ # TODO: llama.cpp does not have Multi-Token Prediction for DeepSeek yet.
+ if name.startswith("mtp."):
+ return
+
+ # process the experts separately
+ match = re.match(r"layers\.(\d+)\.ffn\.experts\.(\d+)\.(w[123])\.weight", name)
+ if match is not None:
+ bid = int(match.group(1))
+ xid = int(match.group(2))
+ wid = match.group(3)
+ n_experts = self.hparams["n_routed_experts"]
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ for w_name in ["w1", "w3", "w2"]:
+ datas: list[Tensor] = []
+
+ for expert_id in range(n_experts):
+ ename = f"layers.{bid}.ffn.experts.{expert_id}.{w_name}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+ merged_name = f"layers.{bid}.ffn.experts.{w_name}.weight"
+ yield self.map_tensor_name(merged_name), data_torch
+ return
+
+ del xid, wid
+ return
+
+ yield self.map_tensor_name(name), data_torch
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 41566d41aef3..f71943ed33aa 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -567,6 +567,11 @@ extern "C" {
GGML_OP_RWKV_WKV7,
GGML_OP_SOLVE_TRI,
GGML_OP_GATED_DELTA_NET,
+ GGML_OP_DSV4_HC_SPLIT_SINKHORN,
+ GGML_OP_DSV4_HC_WEIGHTED_SUM,
+ GGML_OP_DSV4_HC_EXPAND,
+ GGML_OP_DSV4_FP8_KV_QUANTIZE,
+ GGML_OP_DSV4_ROPE_TAIL,
GGML_OP_UNARY,
@@ -2555,6 +2560,61 @@ extern "C" {
struct ggml_tensor * beta,
struct ggml_tensor * state);
+ // DeepSeek V4 hyperconnection helper.
+ // Splits [mix, tokens] into pre/post/comb regions and applies the
+ // Sinkhorn normalization used by the reference implementation.
+ GGML_API struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * mixes,
+ struct ggml_tensor * scale,
+ struct ggml_tensor * base,
+ int n_hc,
+ int sinkhorn_iters,
+ float eps);
+
+ // DeepSeek V4 hyperconnection weighted-sum helper.
+ // Computes sum_hc weights[hc, token] * x[embd, hc, token].
+ GGML_API struct ggml_tensor * ggml_dsv4_hc_weighted_sum(
+ struct ggml_context * ctx,
+ struct ggml_tensor * x,
+ struct ggml_tensor * weights);
+
+ // DeepSeek V4 hyperconnection expand helper.
+ // Computes post * block_out + comb^T @ residual for each token.
+ GGML_API struct ggml_tensor * ggml_dsv4_hc_expand(
+ struct ggml_context * ctx,
+ struct ggml_tensor * block_out,
+ struct ggml_tensor * residual,
+ struct ggml_tensor * post,
+ struct ggml_tensor * comb);
+
+ // DeepSeek V4 FP8 KV-cache simulation helper.
+ // Quantizes/dequantizes the non-RoPE prefix in E4M3FN blocks and leaves
+ // the RoPE tail unchanged, matching the reference inference path.
+ GGML_API struct ggml_tensor * ggml_dsv4_fp8_kv_quantize(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_rot);
+
+ // DeepSeek V4 partial RoPE helper.
+ // Leaves the non-RoPE prefix unchanged and applies RoPE to the tail,
+ // matching ggml_concat(prefix, ggml_rope_ext(tail)).
+ GGML_API struct ggml_tensor * ggml_dsv4_rope_tail(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pos,
+ struct ggml_tensor * freq_factors,
+ int n_dims,
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow,
+ bool inverse);
+
// custom operators
typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 4e36909f45e9..d51620c288a6 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -754,7 +754,16 @@ static bool ggml_is_view_op(enum ggml_op op) {
#endif
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 30
+// V4 multi-GPU note: V4 (DeepSeek-V4) graphs need a higher value (~80-128)
+// when split across multiple devices, due to dense per-layer inputs
+// (hyperconnection × 4 + indexer/compressor state + multiple KV caches).
+// Single-device runs never trip the cap. The constant gates not just
+// `sched_split.inputs[N]` (small) but also `nodes_size` and
+// `context_buffer_size` allocations that scale as `graph_size × N` —
+// bumping the default adds ~200 MB per scheduler instance for V4-sized
+// graphs, paid even by single-GPU users who don't need it. We bump it
+// anyway for the DSv4-Flash use-case (Strix Halo / unified memory).
+#define GGML_SCHED_MAX_SPLIT_INPUTS 128
#endif
#ifndef GGML_SCHED_MAX_COPIES
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index cd5c61a81879..70f8def3a742 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2047,6 +2047,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_gated_delta_net(params, tensor);
} break;
+ case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+ {
+ ggml_compute_forward_dsv4_hc_split_sinkhorn(params, tensor);
+ } break;
+ case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+ {
+ ggml_compute_forward_dsv4_hc_weighted_sum(params, tensor);
+ } break;
+ case GGML_OP_DSV4_HC_EXPAND:
+ {
+ ggml_compute_forward_dsv4_hc_expand(params, tensor);
+ } break;
+ case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+ {
+ ggml_compute_forward_dsv4_fp8_kv_quantize(params, tensor);
+ } break;
+ case GGML_OP_DSV4_ROPE_TAIL:
+ {
+ ggml_compute_forward_dsv4_rope_tail(params, tensor);
+ } break;
case GGML_OP_MAP_CUSTOM1:
{
ggml_compute_forward_map_custom1(params, tensor);
@@ -2227,6 +2247,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
case GGML_OP_COUNT_EQUAL:
case GGML_OP_SOLVE_TRI:
case GGML_OP_GATED_DELTA_NET:
+ case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+ case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+ case GGML_OP_DSV4_HC_EXPAND:
+ case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+ case GGML_OP_DSV4_ROPE_TAIL:
{
n_tasks = n_threads;
} break;
@@ -2847,6 +2872,7 @@ struct ggml_cplan ggml_graph_plan(
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_ROPE_BACK:
+ case GGML_OP_DSV4_ROPE_TAIL:
{
cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
} break;
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7485ba4fc861..f473cb724725 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5970,6 +5970,127 @@ void ggml_compute_forward_rope_back(
}
}
+// ggml_compute_forward_dsv4_rope_tail
+
+template
+static void ggml_compute_forward_dsv4_rope_tail_flt(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * src1 = dst->src[1];
+ const ggml_tensor * src2 = dst->src[2];
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+
+ const int n_dims = ((int32_t *) dst->op_params)[0];
+ const int mode = ((int32_t *) dst->op_params)[1];
+ const int n_ctx_orig = ((int32_t *) dst->op_params)[2];
+ const bool inverse = ((int32_t *) dst->op_params)[3] != 0;
+
+ float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+ memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&ext_factor, (int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&beta_fast, (int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_slow, (int32_t *) dst->op_params + 9, sizeof(float));
+
+ GGML_TENSOR_UNARY_OP_LOCALS
+
+ GGML_ASSERT(nb0 == nb00);
+ GGML_ASSERT(nb0 == sizeof(T));
+ GGML_ASSERT(n_dims <= ne0);
+ GGML_ASSERT(n_dims % 2 == 0);
+ GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX);
+
+ const int64_t n_nope = ne0 - n_dims;
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int nr = ggml_nrows(dst);
+ const int dr = (nr + nth - 1)/nth;
+ const int ir0 = dr*ith;
+ const int ir1 = MIN(ir0 + dr, nr);
+
+ const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+ float corr_dims[2];
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+ const float * freq_factors = NULL;
+ if (src2 != NULL) {
+ GGML_ASSERT(src2->type == GGML_TYPE_F32);
+ GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+ freq_factors = (const float *) src2->data;
+ }
+
+ const float sin_sign = inverse ? -1.0f : 1.0f;
+ const int32_t * pos = (const int32_t *) src1->data;
+
+ int ir = 0;
+ int64_t last_i2 = -1;
+
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
+ for (int64_t i2 = 0; i2 < ne2; i2++) {
+ for (int64_t i1 = 0; i1 < ne1; i1++) {
+ if (ir++ < ir0) continue;
+ if (ir > ir1) break;
+
+ float * cache = (float *) params->wdata + (n_dims + CACHE_LINE_SIZE_F32)*ith;
+ if (last_i2 != i2) {
+ const int64_t p = pos[i2];
+ ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, n_dims, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+ last_i2 = i2;
+ }
+
+ const T * src = (const T *)((const char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+ T * dst_data = (T *)(( char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+
+ for (int64_t i0 = 0; i0 < n_nope; ++i0) {
+ dst_data[i0] = src[i0];
+ }
+
+ const T * src_tail = src + n_nope;
+ T * dst_tail = dst_data + n_nope;
+
+ switch (mode) {
+ case GGML_ROPE_TYPE_NORMAL:
+ rotate_pairs(n_dims, 1, cache, src_tail, dst_tail, 1);
+ break;
+ case GGML_ROPE_TYPE_NEOX:
+ rotate_pairs(n_dims, n_dims/2, cache, src_tail, dst_tail);
+ break;
+ default:
+ GGML_ABORT("rope type not supported");
+ }
+ }
+ }
+ }
+}
+
+void ggml_compute_forward_dsv4_rope_tail(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ {
+ ggml_compute_forward_dsv4_rope_tail_flt(params, dst);
+ } break;
+ case GGML_TYPE_F32:
+ {
+ ggml_compute_forward_dsv4_rope_tail_flt(params, dst);
+ } break;
+ default:
+ {
+ GGML_ABORT("fatal error");
+ }
+ }
+}
+
// ggml_compute_forward_conv_transpose_1d
static void ggml_compute_forward_conv_transpose_1d_f16_f32(
@@ -10903,6 +11024,343 @@ void ggml_compute_forward_rwkv_wkv7(
}
}
+// ggml_compute_forward_dsv4_hc_split_sinkhorn
+
+void ggml_compute_forward_dsv4_hc_split_sinkhorn(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * mixes = dst->src[0];
+ const ggml_tensor * scale = dst->src[1];
+ const ggml_tensor * base = dst->src[2];
+
+ GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+ GGML_ASSERT(scale->type == GGML_TYPE_F32);
+ GGML_ASSERT(base->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(mixes->nb[0] == sizeof(float));
+ GGML_ASSERT(scale->nb[0] == sizeof(float));
+ GGML_ASSERT(base->nb[0] == sizeof(float));
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+ const int n_hc = ggml_get_op_params_i32(dst, 0);
+ const int sinkhorn_iters = ggml_get_op_params_i32(dst, 1);
+ const float eps = ggml_get_op_params_f32(dst, 2);
+ const int64_t mix_hc = mixes->ne[0];
+ const int64_t n_rows = ggml_nrows(mixes);
+
+ GGML_ASSERT(n_hc > 0 && n_hc <= 16);
+ GGML_ASSERT(sinkhorn_iters > 0);
+ GGML_ASSERT(mix_hc == (2 + n_hc) * n_hc);
+ GGML_ASSERT(ggml_nrows(dst) == n_rows);
+
+ const float * scale_data = (const float *) scale->data;
+ const float * base_data = (const float *) base->data;
+
+ const float pre_scale = scale_data[0];
+ const float post_scale = scale_data[1];
+ const float comb_scale = scale_data[2];
+
+ const int ith = params->ith;
+ const int nth = params->nth;
+
+ const int64_t dr = (n_rows + nth - 1) / nth;
+ const int64_t r0 = dr * ith;
+ const int64_t r1 = std::min(r0 + dr, n_rows);
+
+ for (int64_t r = r0; r < r1; ++r) {
+ const float * mix = (const float *) ((const char *) mixes->data + r*mixes->nb[1]);
+ float * out = (float *) ((char *) dst->data + r*dst->nb[1]);
+
+ for (int i = 0; i < n_hc; ++i) {
+ const float z = mix[i] * pre_scale + base_data[i];
+ out[i] = 1.0f / (1.0f + expf(-z)) + eps;
+ }
+
+ for (int i = 0; i < n_hc; ++i) {
+ const int off = n_hc + i;
+ const float z = mix[off] * post_scale + base_data[off];
+ out[off] = 2.0f / (1.0f + expf(-z));
+ }
+
+ float c[16*16];
+
+ // comb is laid out as a flat [n_hc*n_hc] block per token, written as
+ // c[src_hc + dst_hc*n_hc]; after the graph's reshape_3d this is ggml
+ // tensor comb[ne0=src_hc, ne1=dst_hc, ne2=token]. The Sinkhorn pass
+ // below softmaxes over src_hc, then alternates row/col normalization.
+ // ggml_dsv4_hc_expand intentionally reads ggml-dim0 as dst_hc, which
+ // transposes this matrix on read so it computes comb^T @ residual
+ // (the V4 hyperconnection contract). CPU/Metal/CUDA use the identical
+ // flat write + transposed read; do not "fix" one side in isolation.
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ float row_max = -INFINITY;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ const int idx = src_hc + dst_hc*n_hc;
+ const int off = 2*n_hc + idx;
+ const float v = mix[off] * comb_scale + base_data[off];
+ c[idx] = v;
+ row_max = std::max(row_max, v);
+ }
+
+ float row_sum = 0.0f;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ const int idx = src_hc + dst_hc*n_hc;
+ const float v = expf(c[idx] - row_max);
+ c[idx] = v;
+ row_sum += v;
+ }
+
+ const float inv_sum = 1.0f / row_sum;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ const int idx = src_hc + dst_hc*n_hc;
+ c[idx] = c[idx] * inv_sum + eps;
+ }
+ }
+
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ float sum = 0.0f;
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ sum += c[src_hc + dst_hc*n_hc];
+ }
+
+ const float inv_denom = 1.0f / (sum + eps);
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ c[src_hc + dst_hc*n_hc] *= inv_denom;
+ }
+ }
+
+ for (int iter = 1; iter < sinkhorn_iters; ++iter) {
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ float sum = 0.0f;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ sum += c[src_hc + dst_hc*n_hc];
+ }
+
+ const float inv_denom = 1.0f / (sum + eps);
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ c[src_hc + dst_hc*n_hc] *= inv_denom;
+ }
+ }
+
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ float sum = 0.0f;
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ sum += c[src_hc + dst_hc*n_hc];
+ }
+
+ const float inv_denom = 1.0f / (sum + eps);
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ c[src_hc + dst_hc*n_hc] *= inv_denom;
+ }
+ }
+ }
+
+ for (int i = 0; i < n_hc*n_hc; ++i) {
+ out[2*n_hc + i] = c[i];
+ }
+ }
+}
+
+// ggml_compute_forward_dsv4_hc_weighted_sum
+
+void ggml_compute_forward_dsv4_hc_weighted_sum(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * x = dst->src[0];
+ const ggml_tensor * weights = dst->src[1];
+
+ GGML_ASSERT(x->type == GGML_TYPE_F32);
+ GGML_ASSERT(weights->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(x->ne[0] == dst->ne[0]);
+ GGML_ASSERT(x->ne[1] == weights->ne[0]);
+ GGML_ASSERT(x->ne[2] == dst->ne[1]);
+ GGML_ASSERT(weights->ne[1] == dst->ne[1]);
+ GGML_ASSERT(x->ne[3] == 1);
+ GGML_ASSERT(weights->ne[2] == 1);
+ GGML_ASSERT(weights->ne[3] == 1);
+ GGML_ASSERT(dst->ne[2] == 1);
+ GGML_ASSERT(dst->ne[3] == 1);
+
+ const int64_t n_embd = dst->ne[0];
+ const int64_t n_hc = x->ne[1];
+ const int64_t n_tokens = dst->ne[1];
+ const int64_t n_elem = n_embd * n_tokens;
+
+ const int64_t i0 = (n_elem * params->ith) / params->nth;
+ const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth;
+
+ const char * x_data = (const char *) x->data;
+ const char * w_data = (const char *) weights->data;
+ char * y_data = ( char *) dst->data;
+
+ for (int64_t i = i0; i < i1; ++i) {
+ const int64_t d = i % n_embd;
+ const int64_t t = i / n_embd;
+
+ float acc = 0.0f;
+ for (int64_t h = 0; h < n_hc; ++h) {
+ const float xv = *(const float *) (x_data + d*x->nb[0] + h*x->nb[1] + t*x->nb[2]);
+ const float wv = *(const float *) (w_data + h*weights->nb[0] + t*weights->nb[1]);
+ acc += xv * wv;
+ }
+
+ *(float *) (y_data + d*dst->nb[0] + t*dst->nb[1]) = acc;
+ }
+}
+
+// ggml_compute_forward_dsv4_hc_expand
+
+void ggml_compute_forward_dsv4_hc_expand(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * block_out = dst->src[0];
+ const ggml_tensor * residual = dst->src[1];
+ const ggml_tensor * post = dst->src[2];
+ const ggml_tensor * comb = dst->src[3];
+
+ GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+ GGML_ASSERT(residual->type == GGML_TYPE_F32);
+ GGML_ASSERT(post->type == GGML_TYPE_F32);
+ GGML_ASSERT(comb->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(block_out->ne[0] == dst->ne[0]);
+ GGML_ASSERT(block_out->ne[1] == dst->ne[2]);
+ GGML_ASSERT(residual->ne[0] == dst->ne[0]);
+ GGML_ASSERT(residual->ne[1] == dst->ne[1]);
+ GGML_ASSERT(residual->ne[2] == dst->ne[2]);
+ GGML_ASSERT(post->ne[0] == dst->ne[1]);
+ GGML_ASSERT(post->ne[1] == dst->ne[2]);
+ GGML_ASSERT(comb->ne[0] == dst->ne[1]);
+ GGML_ASSERT(comb->ne[1] == dst->ne[1]);
+ GGML_ASSERT(comb->ne[2] == dst->ne[2]);
+ GGML_ASSERT(block_out->ne[3] == 1);
+ GGML_ASSERT(residual->ne[3] == 1);
+ GGML_ASSERT(post->ne[2] == 1);
+ GGML_ASSERT(post->ne[3] == 1);
+ GGML_ASSERT(comb->ne[3] == 1);
+ GGML_ASSERT(dst->ne[3] == 1);
+
+ const int64_t n_embd = dst->ne[0];
+ const int64_t n_hc = dst->ne[1];
+ const int64_t n_tokens = dst->ne[2];
+ const int64_t n_elem = n_embd * n_hc * n_tokens;
+
+ const int64_t i0 = (n_elem * params->ith) / params->nth;
+ const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth;
+
+ const char * block_data = (const char *) block_out->data;
+ const char * res_data = (const char *) residual->data;
+ const char * post_data = (const char *) post->data;
+ const char * comb_data = (const char *) comb->data;
+ char * dst_data = ( char *) dst->data;
+
+ for (int64_t i = i0; i < i1; ++i) {
+ const int64_t d = i % n_embd;
+ const int64_t tmp = i / n_embd;
+ const int64_t dst_hc = tmp % n_hc;
+ const int64_t t = tmp / n_hc;
+
+ const float block_v = *(const float *) (block_data + d*block_out->nb[0] + t*block_out->nb[1]);
+ const float post_v = *(const float *) (post_data + dst_hc*post->nb[0] + t*post->nb[1]);
+
+ float acc = block_v * post_v;
+ // comb arrives as comb[ne0=src_hc, ne1=dst_hc, ne2=t] from
+ // dsv4_hc_split_sinkhorn (flat write src_hc + dst_hc*n_hc). Reading
+ // ne0 as dst_hc and ne1 as src_hc here transposes it, giving
+ // (comb^T @ residual)[d, dst_hc] = sum_src_hc comb[src_hc,dst_hc,t]
+ // * residual[d, src_hc, t]. This transpose is deliberate and matches
+ // the Metal/CUDA expand kernels (validated 19/19 vs this CPU oracle).
+ for (int64_t src_hc = 0; src_hc < n_hc; ++src_hc) {
+ const float comb_v = *(const float *) (comb_data + dst_hc*comb->nb[0] + src_hc*comb->nb[1] + t*comb->nb[2]);
+ const float res_v = *(const float *) (res_data + d*residual->nb[0] + src_hc*residual->nb[1] + t*residual->nb[2]);
+ acc += comb_v * res_v;
+ }
+
+ *(float *) (dst_data + d*dst->nb[0] + dst_hc*dst->nb[1] + t*dst->nb[2]) = acc;
+ }
+}
+
+static float ggml_dsv4_e4m3fn_dequant(float x) {
+ const float sign = x < 0.0f ? -1.0f : 1.0f;
+ const float ax = std::min(std::fabs(x), 448.0f);
+
+ int best = 0;
+ float best_diff = ax;
+
+ for (int i = 1; i < 127; ++i) {
+ const int exp = (i >> 3) & 0x0f;
+ const int mant = i & 0x07;
+ const float val = exp == 0
+ ? std::ldexp(float(mant), -9)
+ : std::ldexp(1.0f + float(mant) / 8.0f, exp - 7);
+ const float diff = std::fabs(ax - val);
+ if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) {
+ best = i;
+ best_diff = diff;
+ }
+ }
+
+ const int exp = (best >> 3) & 0x0f;
+ const int mant = best & 0x07;
+ const float val = exp == 0
+ ? std::ldexp(float(mant), -9)
+ : std::ldexp(1.0f + float(mant) / 8.0f, exp - 7);
+
+ return sign * val;
+}
+
+void ggml_compute_forward_dsv4_fp8_kv_quantize(
+ const ggml_compute_params * params,
+ ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+ const int64_t n_rot = ggml_get_op_params_i32(dst, 0);
+ const int64_t head_dim = src0->ne[0];
+ const int64_t n_nope = head_dim - n_rot;
+
+ GGML_ASSERT(n_rot >= 0);
+ GGML_ASSERT(n_nope > 0);
+ GGML_ASSERT(n_nope % 64 == 0);
+
+ const int64_t n_rows = src0->ne[1] * src0->ne[2] * src0->ne[3];
+ const int64_t row_start = (n_rows * params->ith) / params->nth;
+ const int64_t row_end = (n_rows * (params->ith + 1)) / params->nth;
+
+ for (int64_t row = row_start; row < row_end; ++row) {
+ const int64_t i1 = row % src0->ne[1];
+ const int64_t i2 = (row / src0->ne[1]) % src0->ne[2];
+ const int64_t i3 = row / (src0->ne[1] * src0->ne[2]);
+
+ const char * src_base = (const char *) src0->data + i1*src0->nb[1] + i2*src0->nb[2] + i3*src0->nb[3];
+ char * dst_base = ( char *) dst->data + i1*dst->nb[1] + i2*dst->nb[2] + i3*dst->nb[3];
+
+ for (int64_t off = 0; off < n_nope; off += 64) {
+ float amax = 0.0f;
+ for (int64_t i = 0; i < 64; ++i) {
+ const float v = *(const float *) (src_base + (off + i)*src0->nb[0]);
+ amax = std::max(amax, std::fabs(v));
+ }
+
+ amax = std::max(amax, 1.0e-4f);
+ const float scale = std::ldexp(1.0f, int(std::ceil(std::log2(amax / 448.0f))));
+ for (int64_t i = 0; i < 64; ++i) {
+ const float v = *(const float *) (src_base + (off + i)*src0->nb[0]);
+ *(float *) (dst_base + (off + i)*dst->nb[0]) =
+ ggml_dsv4_e4m3fn_dequant(std::clamp(v / scale, -448.0f, 448.0f)) * scale;
+ }
+ }
+
+ for (int64_t i = n_nope; i < head_dim; ++i) {
+ *(float *) (dst_base + i*dst->nb[0]) = *(const float *) (src_base + i*src0->nb[0]);
+ }
+ }
+}
+
// ggml_compute_forward_map_custom1
void ggml_compute_forward_map_custom1(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 7398e5618948..4da4db62aa4e 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -104,6 +104,11 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s
void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_split_sinkhorn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_weighted_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_expand(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_fp8_kv_quantize(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_rope_tail(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cu b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cu
new file mode 100644
index 000000000000..d32049c97da0
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cu
@@ -0,0 +1,164 @@
+#include "dsv4-fp8-kv-quantize.cuh"
+
+#if !defined(__HIP_PLATFORM_AMD__) && __CUDA_ARCH__ >= 890
+#include
+#endif
+
+#include
+
+// E4M3FN code value: 0..127.
+// Format: 1 sign + 4 exponent + 3 mantissa, bias 7, no inf/nan reserved.
+// (i >> 3) & 0xf = exponent, i & 7 = mantissa. Code 0 is +0.
+// Mirrors the CPU helper at ggml-cpu/ops.cpp:11245-11247 and the Metal
+// helper dsv4_e4m3fn_value at ggml-metal.metal:2302-2308.
+static __device__ __forceinline__ float dsv4_e4m3fn_value(int i) {
+ const int e = (i >> 3) & 0x0f;
+ const int m = i & 0x07;
+ return e == 0
+ ? float(m) * 0.001953125f // 2^-9 * m (subnormal)
+ : (1.0f + float(m) * 0.125f) * exp2f(float(e - 7)); // normal
+}
+
+// Round |x| to the nearest E4M3FN positive code value, breaking ties
+// toward the EVEN code (matches CPU reference ops.cpp:11242-11253 exactly).
+// Returns the dequantized F32, sign-preserved.
+static __device__ __forceinline__ float dsv4_e4m3fn_dequant_sw(float x) {
+ const float sign = x < 0.0f ? -1.0f : 1.0f;
+ const float ax = fminf(fabsf(x), 448.0f);
+
+ int best = 0;
+ float best_diff = ax;
+ #pragma unroll
+ for (int i = 1; i < 127; ++i) {
+ const float val = dsv4_e4m3fn_value(i);
+ const float diff = fabsf(ax - val);
+ if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) {
+ best = i;
+ best_diff = diff;
+ }
+ }
+ return sign * dsv4_e4m3fn_value(best);
+}
+
+// Dual-path E4M3FN quantize+dequantize round-trip with saturation.
+//
+// Native path uses NVIDIA's documented FP8 class API. The constructor
+// __nv_fp8_e4m3(float) applies round-to-nearest-even and saturates to
+// the finite E4M3 range (+/-448). The explicit float() conversion expands
+// the FP8 storage back to F32. This is the supported public API per
+// NVIDIA's cuda_fp8.h headers (CUDA toolkit >= 11.8).
+//
+// (We intentionally avoid the lower-level __nv_cvt_fp8_to_halfraw +
+// __half2float chain: the class wrapper is clearer and avoids a half
+// hop on F32-only data. There is no __nv_cvt_fp8_to_float intrinsic.)
+static __device__ __forceinline__ float dsv4_e4m3fn_roundtrip(float x) {
+#if !defined(__HIP_PLATFORM_AMD__) && __CUDA_ARCH__ >= 890
+ const __nv_fp8_e4m3 q(x);
+ return float(q);
+#else
+ // Software emulation: matches CPU reference bit-for-bit.
+ return dsv4_e4m3fn_dequant_sw(x);
+#endif
+}
+
+// Warp-level (32 threads) max-reduction via __shfl_xor_sync.
+static __device__ __forceinline__ float warp_reduce_max(float v) {
+ #pragma unroll
+ for (int offset = 16; offset > 0; offset >>= 1) {
+ v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, offset, 32));
+ }
+ return v;
+}
+
+// One block per row. blockDim.x == 64 (two warps).
+static __global__ void dsv4_fp8_kv_quantize_f32(
+ const char * __restrict__ src,
+ char * __restrict__ dst,
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+ const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
+ const int64_t nb0, const int64_t nb1, const int64_t nb2, const int64_t nb3,
+ const int n_rot) {
+
+ const int64_t n_rows = ne01 * ne02 * ne03;
+ const int64_t row = blockIdx.x;
+ if (row >= n_rows) return;
+
+ const int tid = threadIdx.x; // 0..63
+ const int warp_id = tid >> 5; // 0 or 1
+ const int lane = tid & 31;
+
+ const int64_t i1 = row % ne01;
+ const int64_t i2 = (row / ne01) % ne02;
+ const int64_t i3 = row / (ne01 * ne02);
+
+ const char * src_base = src + i1*nb01 + i2*nb02 + i3*nb03;
+ char * dst_base = dst + i1*nb1 + i2*nb2 + i3*nb3;
+
+ const int64_t n_nope = ne00 - (int64_t) n_rot;
+
+ // Shared-mem slot for the two warps' partial max.
+ __shared__ float warp_max[2];
+
+ // Prefix loop: 64-element blocks.
+ for (int64_t off = 0; off < n_nope; off += 64) {
+ const float v = *(const float *)(src_base + (off + tid) * nb00);
+
+ // Two-stage block-max reduction across 64 threads.
+ // Stage 1: each warp reduces its 32 lanes via shfl_xor; lane 0 stores
+ // the warp's max to shared memory.
+ // Stage 2: a single thread (warp 0, lane 0) combines the two warp maxes
+ // and writes the final block max back to warp_max[0].
+ float m = warp_reduce_max(fabsf(v));
+ if (lane == 0) warp_max[warp_id] = m;
+ __syncthreads();
+ if (warp_id == 0 && lane == 0) {
+ warp_max[0] = fmaxf(warp_max[0], warp_max[1]);
+ }
+ __syncthreads();
+
+ const float amax = fmaxf(warp_max[0], 1.0e-4f);
+ const float scale = exp2f(ceilf(log2f(amax / 448.0f)));
+
+ const float q = dsv4_e4m3fn_roundtrip(fminf(fmaxf(v / scale, -448.0f), 448.0f)) * scale;
+ *(float *)(dst_base + (off + tid) * nb0) = q;
+
+ __syncthreads(); // protect warp_max for the next block
+ }
+
+ // Tail loop: copy n_rot elements per row through unchanged.
+ // 64 threads stride through the tail.
+ for (int64_t i = n_nope + tid; i < ne00; i += 64) {
+ *(float *)(dst_base + i * nb0) = *(const float *)(src_base + i * nb00);
+ }
+}
+
+void ggml_cuda_op_dsv4_fp8_kv_quantize(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src = dst->src[0];
+
+ GGML_ASSERT(src->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_are_same_shape(src, dst));
+
+ const int n_rot = ggml_get_op_params_i32(dst, 0);
+ const int64_t head_dim = src->ne[0];
+ const int64_t n_nope = head_dim - (int64_t) n_rot;
+
+ GGML_ASSERT(n_rot >= 0);
+ GGML_ASSERT(n_nope > 0);
+ GGML_ASSERT(n_nope % 64 == 0);
+
+ const int64_t n_rows = src->ne[1] * src->ne[2] * src->ne[3];
+
+ const dim3 grid((unsigned) n_rows, 1, 1);
+ const dim3 block(64, 1, 1);
+
+ cudaStream_t stream = ctx.stream();
+ dsv4_fp8_kv_quantize_f32<<>>(
+ (const char *) src->data,
+ ( char *) dst->data,
+ src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+ (int64_t) src->nb[0], (int64_t) src->nb[1], (int64_t) src->nb[2], (int64_t) src->nb[3],
+ (int64_t) dst->nb[0], (int64_t) dst->nb[1], (int64_t) dst->nb[2], (int64_t) dst->nb[3],
+ n_rot);
+ CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cuh b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cuh
new file mode 100644
index 000000000000..8e0fd958674d
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cuh
@@ -0,0 +1,36 @@
+#pragma once
+
+// V4 FP8 KV-cache simulation: quantizes/dequantizes the non-RoPE prefix
+// of each row in 64-element blocks through E4M3FN representation with
+// per-block scaling; leaves the RoPE tail unchanged.
+//
+// Block-scaled algorithm (must match CPU reference for the
+// test-backend-ops NMSE check):
+// for each row (n_rows = ne01 * ne02 * ne03):
+// for off in [0, n_nope) step 64:
+// amax = max(|src[off..off+64)|, 1e-4)
+// scale = 2^ceil(log2(amax / 448))
+// dst[off+i] = dequant_e4m3fn(clamp(src[off+i]/scale, -448, 448)) * scale
+// copy src[n_nope..ne00) to dst unchanged // RoPE tail
+//
+// References:
+// CPU reference: ggml/src/ggml-cpu/ops.cpp:11235-11313
+// Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2302-2376
+// Metal dispatch: ggml/src/ggml-metal/ggml-metal-ops.cpp:1550-1594
+// Public API: ggml/include/ggml.h:2591 (ggml_dsv4_fp8_kv_quantize)
+//
+// Dual-path implementation:
+// - __CUDA_ARCH__ >= 890 (Ada/Hopper/Blackwell): native FP8 via the
+// __nv_fp8_e4m3 class wrapper from (round-to-nearest-even,
+// saturate-to-finite to +/-448).
+// - __CUDA_ARCH__ < 890 (Volta/Turing/Ampere): software emulation by
+// nearest-even E4M3FN code search, mirroring the CPU reference.
+//
+// Both paths produce numerically equivalent output (subject to FP8's
+// inherent lossiness). The four test_dsv4_fp8_kv_quantize cases from
+// Stream A (tests/test-backend-ops.cpp:8868-8871) validate with
+// max_nmse_err = 1e-3.
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_fp8_kv_quantize(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-hc-expand.cu b/ggml/src/ggml-cuda/dsv4-hc-expand.cu
new file mode 100644
index 000000000000..8c4aab67f34d
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-expand.cu
@@ -0,0 +1,92 @@
+#include "dsv4-hc-expand.cuh"
+
+// out[i_embd, i_hc, i_tok] = post[i_hc, i_tok] * block_out[i_embd, i_tok]
+// + sum_{hc'} comb[i_hc, hc', i_tok] * residual[i_embd, hc', i_tok]
+//
+// block_out is 2D (no hc axis); the post*block_out term is broadcast across hc.
+// See ggml/src/ggml-cpu/ops.cpp:11218-11231 for the CPU reference loop body.
+static __global__ void dsv4_hc_expand_f32(
+ const float * __restrict__ block_out,
+ const float * __restrict__ residual,
+ const float * __restrict__ post,
+ const float * __restrict__ comb,
+ float * __restrict__ dst,
+ const int n_embd, const int n_hc, const int n_tokens,
+ // block_out strides (2D -- no hc axis)
+ const int nb_b0, const int nb_b1,
+ // residual strides (3D)
+ const int nb_r0, const int nb_r1, const int nb_r2,
+ // post strides (2D)
+ const int nb_p0, const int nb_p1,
+ // comb strides (3D)
+ const int nb_c0, const int nb_c1, const int nb_c2,
+ // dst strides (3D)
+ const int nb0, const int nb1, const int nb2) {
+ const int64_t gid = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
+ const int64_t total = (int64_t)n_embd * n_hc * n_tokens;
+ if (gid >= total) {
+ return;
+ }
+
+ const int i_embd = gid % n_embd;
+ const int rest = gid / n_embd;
+ const int i_hc = rest % n_hc;
+ const int i_tok = rest / n_hc;
+
+ // post * block_out (block_out is 2D: indexed by (i_embd, i_tok) only)
+ const float p = *(const float *)((const char *)post
+ + i_hc * nb_p0 + i_tok * nb_p1);
+ const float b = *(const float *)((const char *)block_out
+ + i_embd * nb_b0 + i_tok * nb_b1);
+ float acc = p * b;
+
+ // comb @ residual: sum over hc'
+ for (int hc_p = 0; hc_p < n_hc; ++hc_p) {
+ const float c = *(const float *)((const char *)comb
+ + i_hc * nb_c0 + hc_p * nb_c1 + i_tok * nb_c2);
+ const float r = *(const float *)((const char *)residual
+ + i_embd * nb_r0 + hc_p * nb_r1 + i_tok * nb_r2);
+ acc += c * r;
+ }
+
+ float * d = (float *)((char *)dst
+ + i_embd * nb0 + i_hc * nb1 + i_tok * nb2);
+ *d = acc;
+}
+
+void ggml_cuda_op_dsv4_hc_expand(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * block_out = dst->src[0];
+ const ggml_tensor * residual = dst->src[1];
+ const ggml_tensor * post = dst->src[2];
+ const ggml_tensor * comb = dst->src[3];
+
+ GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+ GGML_ASSERT(residual->type == GGML_TYPE_F32);
+ GGML_ASSERT(post->type == GGML_TYPE_F32);
+ GGML_ASSERT(comb->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ const int n_embd = (int) dst->ne[0];
+ const int n_hc = (int) dst->ne[1];
+ const int n_tokens = (int) dst->ne[2];
+
+ const int64_t total = (int64_t)n_embd * n_hc * n_tokens;
+ constexpr int blk = 256;
+ const dim3 grid((total + blk - 1) / blk);
+ const dim3 block(blk);
+
+ cudaStream_t stream = ctx.stream();
+ dsv4_hc_expand_f32<<>>(
+ (const float *) block_out->data,
+ (const float *) residual->data,
+ (const float *) post->data,
+ (const float *) comb->data,
+ (float *) dst->data,
+ n_embd, n_hc, n_tokens,
+ (int) block_out->nb[0], (int) block_out->nb[1],
+ (int) residual->nb[0], (int) residual->nb[1], (int) residual->nb[2],
+ (int) post->nb[0], (int) post->nb[1],
+ (int) comb->nb[0], (int) comb->nb[1], (int) comb->nb[2],
+ (int) dst->nb[0], (int) dst->nb[1], (int) dst->nb[2]);
+ CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-hc-expand.cuh b/ggml/src/ggml-cuda/dsv4-hc-expand.cuh
new file mode 100644
index 000000000000..28f1a0ee8c8c
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-expand.cuh
@@ -0,0 +1,26 @@
+#pragma once
+
+// V4 hyperconnection expand: per-token mix of block_out and residual.
+//
+// out[i, hc, tok] = post[hc, tok] * block_out[i, tok]
+// + sum_{hc'} comb[hc, hc', tok] * residual[i, hc', tok]
+//
+// Shapes:
+// block_out: 2D {n_embd, n_tokens} -- no hc axis
+// residual: 3D {n_embd, n_hc, n_tokens}
+// post: 2D {n_hc, n_tokens}
+// comb: 3D {n_hc, n_hc, n_tokens}
+// dst: 3D {n_embd, n_hc, n_tokens}
+//
+// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2247-2276
+// CPU reference: ggml/src/ggml-cpu/ops.cpp:11200+
+// Public API: ggml/include/ggml.h:2581 (ggml_dsv4_hc_expand)
+// Shape constructor: ggml/src/ggml.c:6363-6366
+//
+// Embarrassingly parallel: one thread per output element (i_embd, i_hc, i_tok).
+// Each thread does an n_hc-wide accumulation for the comb*residual term plus
+// one fused multiply-add for the post*block_out term.
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_hc_expand(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cu b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cu
new file mode 100644
index 000000000000..39d7833c9b0b
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cu
@@ -0,0 +1,198 @@
+#include "dsv4-hc-split-sinkhorn.cuh"
+
+// Maximum n_hc supported (matches CPU reference assert at ops.cpp:11014 and
+// the dst comb matrix scratch buffer size below).
+#define DSV4_HC_SINKHORN_MAX_N_HC 16
+
+// One block per row. Inside the block:
+// - threads cooperate (parallel for) on the pre/post slices and the final
+// copy of the comb matrix back to dst.
+// - tid == 0 runs the n_hc x n_hc Sinkhorn iterations serially. n_hc <= 16
+// so this is at most a few thousand FLOPs per row.
+//
+// The comb matrix lives in shared memory (sized for the worst case 16x16
+// = 256 floats = 1 KiB per block, well within any device's shared-mem
+// budget).
+static __global__ void dsv4_hc_split_sinkhorn_f32(
+ const float * __restrict__ mixes,
+ const float * __restrict__ scale,
+ const float * __restrict__ base,
+ float * __restrict__ dst,
+ const int n_hc,
+ const int sinkhorn_iters,
+ const int n_rows,
+ const int mix_hc,
+ const int nb01, // input row stride in bytes
+ const int nb1, // output row stride in bytes
+ const float eps) {
+ const int row = blockIdx.x;
+ if (row >= n_rows) {
+ return;
+ }
+
+ const int tid = threadIdx.x;
+ const int blksz = blockDim.x;
+
+ const float pre_scale = scale[0];
+ const float post_scale = scale[1];
+ const float comb_scale = scale[2];
+
+ const float * row_in = (const float *) ((const char *) mixes + row * nb01);
+ float * row_out = (float *) ((char *) dst + row * nb1);
+
+ // ---------------- Section 1: pre slice ----------------
+ // out[i] = sigmoid(mix[i] * pre_scale + base[i]) + eps
+ for (int i = tid; i < n_hc; i += blksz) {
+ const float z = row_in[i] * pre_scale + base[i];
+ row_out[i] = 1.0f / (1.0f + expf(-z)) + eps;
+ }
+
+ // ---------------- Section 2: post slice ----------------
+ // out[n_hc + i] = 2 * sigmoid(mix[n_hc + i] * post_scale + base[n_hc + i])
+ for (int i = tid; i < n_hc; i += blksz) {
+ const int off = n_hc + i;
+ const float z = row_in[off] * post_scale + base[off];
+ row_out[off] = 2.0f / (1.0f + expf(-z));
+ }
+
+ // ---------------- Section 3: comb matrix Sinkhorn ----------------
+ //
+ // c[src_hc + dst_hc * n_hc] layout (matches CPU reference at
+ // ggml-cpu/ops.cpp:11055).
+ extern __shared__ float shmem[];
+ float * c = shmem; // n_hc * n_hc floats
+
+ // Load the comb logits = mix * comb_scale + base (parallel over the block).
+ for (int i = tid; i < n_hc * n_hc; i += blksz) {
+ const int off = 2 * n_hc + i;
+ c[i] = row_in[off] * comb_scale + base[off];
+ }
+ __syncthreads();
+
+ // Sinkhorn iterations run on thread 0; n_hc <= 16 keeps the inner loops
+ // trivially cheap (~ 1k FLOPs per row total).
+ if (tid == 0) {
+ // First pass: per-dst_hc softmax (max-subtract for numerical stability,
+ // exp, normalize) + eps stabilizer.
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ float row_max = -INFINITY;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ row_max = fmaxf(row_max, c[src_hc + dst_hc * n_hc]);
+ }
+
+ float row_sum = 0.0f;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ const int idx = src_hc + dst_hc * n_hc;
+ const float v = expf(c[idx] - row_max);
+ c[idx] = v;
+ row_sum += v;
+ }
+
+ const float inv_sum = 1.0f / row_sum;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ const int idx = src_hc + dst_hc * n_hc;
+ c[idx] = c[idx] * inv_sum + eps;
+ }
+ }
+
+ // First column-normalize: per src_hc, divide by (column sum + eps).
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ float sum = 0.0f;
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ sum += c[src_hc + dst_hc * n_hc];
+ }
+ const float inv_denom = 1.0f / (sum + eps);
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ c[src_hc + dst_hc * n_hc] *= inv_denom;
+ }
+ }
+
+ // Remaining sinkhorn_iters - 1 alternations: row-normalize then column-normalize.
+ for (int it = 1; it < sinkhorn_iters; ++it) {
+ // Row-normalize: per dst_hc, divide by (row sum + eps).
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ float sum = 0.0f;
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ sum += c[src_hc + dst_hc * n_hc];
+ }
+ const float inv_denom = 1.0f / (sum + eps);
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ c[src_hc + dst_hc * n_hc] *= inv_denom;
+ }
+ }
+ // Column-normalize: per src_hc, divide by (column sum + eps).
+ for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+ float sum = 0.0f;
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ sum += c[src_hc + dst_hc * n_hc];
+ }
+ const float inv_denom = 1.0f / (sum + eps);
+ for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+ c[src_hc + dst_hc * n_hc] *= inv_denom;
+ }
+ }
+ }
+ }
+ __syncthreads();
+
+ // Copy the comb matrix back to dst (parallel over the block).
+ for (int i = tid; i < n_hc * n_hc; i += blksz) {
+ row_out[2 * n_hc + i] = c[i];
+ }
+
+ // Suppress unused-warning for mix_hc; it's covered by the host-side asserts.
+ (void) mix_hc;
+}
+
+void ggml_cuda_op_dsv4_hc_split_sinkhorn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * mixes = dst->src[0];
+ const ggml_tensor * scale = dst->src[1];
+ const ggml_tensor * base = dst->src[2];
+
+ GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+ GGML_ASSERT(scale->type == GGML_TYPE_F32);
+ GGML_ASSERT(base->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(mixes->nb[0] == sizeof(float));
+ GGML_ASSERT(scale->nb[0] == sizeof(float));
+ GGML_ASSERT(base->nb[0] == sizeof(float));
+ GGML_ASSERT(dst->nb[0] == sizeof(float));
+
+ const int n_hc = ggml_get_op_params_i32(dst, 0);
+ const int sinkhorn_iters = ggml_get_op_params_i32(dst, 1);
+ const float eps = ggml_get_op_params_f32(dst, 2);
+
+ GGML_ASSERT(n_hc > 0 && n_hc <= DSV4_HC_SINKHORN_MAX_N_HC);
+ GGML_ASSERT(sinkhorn_iters > 0);
+
+ const int n_rows = (int) ggml_nrows(mixes);
+ const int mix_hc = (int) mixes->ne[0];
+ const int nb01 = (int) mixes->nb[1];
+ const int nb1 = (int) dst->nb[1];
+
+ GGML_ASSERT(mix_hc == (2 + n_hc) * n_hc);
+ GGML_ASSERT((int) ggml_nrows(dst) == n_rows);
+
+ // Block size MUST be a warp multiple (>= 32) so that the in-block
+ // __syncthreads() barriers are well-formed and any future warp-wide
+ // shuffle has a complete mask. With mix_hc in {24, 80} the natural
+ // size is rounded up to 32 or 96.
+ constexpr int CUDA_WARP_SIZE = 32;
+ constexpr int CUDA_MAX_BLOCK = 256;
+ const int rounded = ((mix_hc + CUDA_WARP_SIZE - 1) / CUDA_WARP_SIZE) * CUDA_WARP_SIZE;
+ const int threads_per_block = std::min(CUDA_MAX_BLOCK, std::max(CUDA_WARP_SIZE, rounded));
+
+ const dim3 grid(n_rows);
+ const dim3 block(threads_per_block);
+ const size_t shared = (size_t) n_hc * (size_t) n_hc * sizeof(float);
+
+ cudaStream_t stream = ctx.stream();
+ dsv4_hc_split_sinkhorn_f32<<>>(
+ (const float *) mixes->data,
+ (const float *) scale->data,
+ (const float *) base->data,
+ (float *) dst->data,
+ n_hc, sinkhorn_iters, n_rows, mix_hc,
+ nb01, nb1, eps);
+ CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cuh b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cuh
new file mode 100644
index 000000000000..d77455298468
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cuh
@@ -0,0 +1,42 @@
+#pragma once
+
+// V4 hyperconnection splitter with Sinkhorn normalization.
+//
+// Splits the mix vector [mix_hc, n_rows] into three sections:
+// - out[0:n_hc] = sigmoid(mix[i] * scale[0] + base[i]) + eps ("pre")
+// - out[n_hc:2*n_hc] = 2 * sigmoid(mix[off] * scale[1] + base[off]) ("post")
+// - out[2*n_hc:] = Sinkhorn-normalized n_hc x n_hc comb matrix ("comb")
+//
+// The comb section starts as logits (mix * scale[2] + base), then a
+// per-dst_hc row softmax (max-subtract + exp + normalize) with `eps` added,
+// then alternating column / row normalizations for sinkhorn_iters - 1 more
+// iterations. The result is doubly-stochastic up to `eps`-stabilization.
+//
+// Expected shape:
+// mixes : [mix_hc, n_rows] float32, contiguous along ne[0]
+// scale : [3] float32 (pre, post, comb scales)
+// base : [mix_hc] float32, matches the mix layout
+// dst : [mix_hc, n_rows] float32, same shape as mixes
+// where mix_hc == (2 + n_hc) * n_hc and n_hc in [1, 16].
+//
+// Op params (i32, i32, f32): n_hc, sinkhorn_iters, eps.
+//
+// CUDA kernel design:
+// - One CUDA block per output row.
+// - Block size rounded up to a warp multiple (>= 32) so __syncthreads()
+// and any future block-wide reductions are well-formed even when the
+// natural row width (mix_hc = 24 or 80 for n_hc = 4 or 8) is not a
+// warp multiple. Excess threads do no memory work; loops guard `i < n`.
+// - Sections 1, 2, and the final copy parallelize across the block.
+// - Section 3 (Sinkhorn iterations on the n_hc x n_hc comb matrix) is
+// serialized on `tid == 0`; n_hc <= 16 makes this trivially cheap
+// (O(n_hc^2 * sinkhorn_iters) per row) and avoids the complexity of
+// warp-cooperative reductions over a 4-or-8-wide inner dimension.
+//
+// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2076-2245
+// CPU reference: ggml/src/ggml-cpu/ops.cpp:11037-11117
+// Public API: ggml/include/ggml.h (ggml_dsv4_hc_split_sinkhorn)
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_hc_split_sinkhorn(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cu b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cu
new file mode 100644
index 000000000000..75f38d2aa7c3
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cu
@@ -0,0 +1,104 @@
+#include "dsv4-hc-weighted-sum.cuh"
+
+// CUDA port of kernel_dsv4_hc_weighted_sum (ggml-metal.metal:2278-2327).
+//
+// Layout (FP32 throughout):
+// x : {n_embd, n_hc, n_tokens}
+// weights : {n_hc, n_tokens}
+// dst : {n_embd, n_tokens}
+// Output[d, t] = sum_{h=0..n_hc-1} x[d, h, t] * weights[h, t].
+//
+// One thread per output element. Total threads = n_embd * n_tokens.
+// Strides are passed in BYTES (matching ggml's nb[] convention); element
+// access is via `(const char *) base + d*nb0 + h*nb1 + t*nb2` reinterpret
+// as `const float *`, identical to the Metal kernel and CPU reference.
+
+static __global__ void dsv4_hc_weighted_sum_f32(
+ const char * __restrict__ x,
+ const char * __restrict__ weights,
+ char * __restrict__ dst,
+ const int n_embd,
+ const int n_hc,
+ const int n_tokens,
+ const int64_t nb_x0,
+ const int64_t nb_x1,
+ const int64_t nb_x2,
+ const int64_t nb_w0,
+ const int64_t nb_w1,
+ const int64_t nb0,
+ const int64_t nb1) {
+ const int64_t gid = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
+ const int64_t total = (int64_t) n_embd * n_tokens;
+ if (gid >= total) {
+ return;
+ }
+
+ const int64_t d = gid % n_embd;
+ const int64_t t = gid / n_embd;
+
+ float acc = 0.0f;
+ for (int h = 0; h < n_hc; ++h) {
+ const float xv = *((const float *) (x + d*nb_x0 + h*nb_x1 + t*nb_x2));
+ const float wv = *((const float *) (weights + h*nb_w0 + t*nb_w1));
+ acc += xv * wv;
+ }
+
+ *((float *) (dst + d*nb0 + t*nb1)) = acc;
+}
+
+void ggml_cuda_op_dsv4_hc_weighted_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * x = dst->src[0];
+ const ggml_tensor * weights = dst->src[1];
+
+ GGML_ASSERT(x->type == GGML_TYPE_F32);
+ GGML_ASSERT(weights->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+ // Shape contract: see ggml.c:6335-6339 and the CPU reference asserts at
+ // ggml-cpu/ops.cpp:11129-11140.
+ GGML_ASSERT(x->ne[0] == dst->ne[0]);
+ GGML_ASSERT(x->ne[1] == weights->ne[0]);
+ GGML_ASSERT(x->ne[2] == dst->ne[1]);
+ GGML_ASSERT(weights->ne[1] == dst->ne[1]);
+ GGML_ASSERT(x->ne[3] == 1);
+ GGML_ASSERT(weights->ne[2] == 1);
+ GGML_ASSERT(weights->ne[3] == 1);
+ GGML_ASSERT(dst->ne[2] == 1);
+ GGML_ASSERT(dst->ne[3] == 1);
+
+ const int n_embd = (int) dst->ne[0];
+ const int n_hc = (int) x->ne[1];
+ const int n_tokens = (int) dst->ne[1];
+
+ const int64_t nb_x0 = (int64_t) x->nb[0];
+ const int64_t nb_x1 = (int64_t) x->nb[1];
+ const int64_t nb_x2 = (int64_t) x->nb[2];
+ const int64_t nb_w0 = (int64_t) weights->nb[0];
+ const int64_t nb_w1 = (int64_t) weights->nb[1];
+ const int64_t nb0 = (int64_t) dst->nb[0];
+ const int64_t nb1 = (int64_t) dst->nb[1];
+
+ const int64_t total = (int64_t) n_embd * n_tokens;
+ if (total == 0) {
+ return;
+ }
+
+ constexpr int CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE = 256;
+ const dim3 block_dims(CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE, 1, 1);
+ const dim3 grid_dims((unsigned) ((total + CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE - 1) /
+ CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE),
+ 1, 1);
+
+ cudaStream_t stream = ctx.stream();
+
+ dsv4_hc_weighted_sum_f32<<>>(
+ (const char *) x->data,
+ (const char *) weights->data,
+ (char *) dst->data,
+ n_embd, n_hc, n_tokens,
+ nb_x0, nb_x1, nb_x2,
+ nb_w0, nb_w1,
+ nb0, nb1);
+
+ CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cuh b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cuh
new file mode 100644
index 000000000000..e6ee9c19a267
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cuh
@@ -0,0 +1,25 @@
+#pragma once
+
+// V4 hyper-connection weighted-sum: collapses the hc dimension.
+//
+// out[embd, token] = sum over hc of weights[hc, token] * x[embd, hc, token]
+//
+// Inputs (all GGML_TYPE_F32):
+// dst->src[0] = x shape {n_embd, n_hc, n_tokens, 1}
+// dst->src[1] = weights shape {n_hc, n_tokens, 1, 1}
+// Output (GGML_TYPE_F32):
+// dst shape {n_embd, n_tokens, 1, 1}
+//
+// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2278-2327
+// Reference Metal dispatch: ggml/src/ggml-metal/ggml-metal-ops.cpp:1440-1486
+// CPU reference: ggml/src/ggml-cpu/ops.cpp:11121 (ggml_compute_forward_dsv4_hc_weighted_sum)
+// Public API: ggml/include/ggml.h:2574 (ggml_dsv4_hc_weighted_sum)
+//
+// Implementation: embarrassingly parallel; one thread per output element
+// (n_embd * n_tokens total), each thread loops over n_hc to accumulate.
+// Strides are kept in bytes (matching the Metal kernel + the ggml tensor
+// nb[] convention) and applied via (const char *) base + offset casts.
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_hc_weighted_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-rope-tail.cu b/ggml/src/ggml-cuda/dsv4-rope-tail.cu
new file mode 100644
index 000000000000..abbf9b2181be
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-rope-tail.cu
@@ -0,0 +1,219 @@
+#include "dsv4-rope-tail.cuh"
+
+#include "ggml.h" // ggml_rope_yarn_corr_dims, ggml_get_op_params_i32
+
+#include // std::min / std::max in dispatch
+#include // memcpy
+
+// YaRN helper. Direct port of ggml/src/ggml-cuda/rope.cu:22-41
+// (template rope_yarn). Duplicated here to keep this
+// translation unit self-contained — rope.cuh does not currently expose
+// the function as a reusable device helper. The math is identical.
+
+struct dsv4_rope_corr_dims {
+ float v[2];
+};
+
+static __device__ __forceinline__ float dsv4_rope_yarn_ramp(
+ const float low, const float high, const int i0) {
+ const float y = (i0 / 2 - low) / max(0.001f, high - low);
+ return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// forward=true: standard rotation; forward=false: inverse (sin flipped).
+template
+static __device__ __forceinline__ void dsv4_rope_yarn(
+ const float theta_extrap, const float freq_scale,
+ const dsv4_rope_corr_dims corr_dims, const int i0,
+ const float ext_factor, float mscale,
+ float & cos_theta, float & sin_theta) {
+ float theta_interp = freq_scale * theta_extrap;
+ float theta = theta_interp;
+ if (ext_factor != 0.0f) {
+ const float ramp_mix = dsv4_rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+ theta = theta_interp * (1.0f - ramp_mix) + theta_extrap * ramp_mix;
+ mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+ }
+ cos_theta = cosf(theta) * mscale;
+ sin_theta = sinf(theta) * mscale;
+ if (!forward) {
+ sin_theta = -sin_theta;
+ }
+}
+
+// Main kernel. Launch shape matches Metal:
+// grid = (ne01, ne02, ne03)
+// block = (min(256, ne00), 1, 1)
+// Each thread walks the ne00 dim with stride ntg (== blockDim.x).
+// Translation of kernel_dsv4_rope_tail_f32 at
+// ggml/src/ggml-metal/ggml-metal.metal:4906-4997.
+static __global__ void dsv4_rope_tail_f32_kernel(
+ const float * __restrict__ src0,
+ const int * __restrict__ pos,
+ const float * __restrict__ freq_factors,
+ float * __restrict__ dst,
+ const int ne00,
+ const int nb00, const int nb01, const int nb02, const int nb03,
+ const int nb0, const int nb1, const int nb2, const int nb3,
+ const int n_dims,
+ const float freq_base, const float freq_scale,
+ const float ext_factor, const float attn_factor,
+ const dsv4_rope_corr_dims corr_dims,
+ const bool is_neox, const bool inverse) {
+ const int i1 = blockIdx.x;
+ const int i2 = blockIdx.y;
+ const int i3 = blockIdx.z;
+ const int tid = threadIdx.x;
+ const int ntg = blockDim.x;
+
+ const int n_nope = ne00 - n_dims;
+ if (n_nope < 0) {
+ return;
+ }
+
+ const float theta_base_pos = (float) pos[i2];
+ const float inv_ndims = -1.0f / (float) n_dims;
+
+ const char * src_base = (const char *) src0 + i3 * nb03 + i2 * nb02 + i1 * nb01;
+ char * dst_base = (char *) dst + i3 * nb3 + i2 * nb2 + i1 * nb1;
+
+ for (int i0 = tid; i0 < ne00; i0 += ntg) {
+ // Pass-through prefix: non-RoPE portion of the row.
+ if (i0 < n_nope) {
+ *((float *) (dst_base + i0 * nb0)) = *((const float *) (src_base + i0 * nb00));
+ continue;
+ }
+
+ const int r = i0 - n_nope;
+
+ if (is_neox) {
+ const int n_half = n_dims / 2;
+ if (r >= n_half) {
+ continue;
+ }
+
+ const int ic = r;
+ const int rel_i0 = 2 * ic;
+ const float theta = theta_base_pos * powf(freq_base, inv_ndims * (float) rel_i0);
+ const float freq_factor = freq_factors ? freq_factors[ic] : 1.0f;
+
+ float cos_theta;
+ float sin_theta;
+ // Use forward=true; inverse handled as a sign flip below to match
+ // Metal's "if (args.inverse) sin_theta = -sin_theta" pattern.
+ dsv4_rope_yarn(theta / freq_factor, freq_scale, corr_dims,
+ rel_i0, ext_factor, attn_factor,
+ cos_theta, sin_theta);
+ if (inverse) {
+ sin_theta = -sin_theta;
+ }
+
+ const int j0 = n_nope + ic;
+ const int j1 = n_nope + ic + n_half;
+ const float x0 = *((const float *) (src_base + j0 * nb00));
+ const float x1 = *((const float *) (src_base + j1 * nb00));
+ *((float *) (dst_base + j0 * nb0)) = x0 * cos_theta - x1 * sin_theta;
+ *((float *) (dst_base + j1 * nb0)) = x0 * sin_theta + x1 * cos_theta;
+ } else {
+ // NORMAL mode: rotate adjacent pair (j0, j0+1).
+ if ((r & 1) != 0) {
+ continue;
+ }
+
+ const int ic = r / 2;
+ const float theta = theta_base_pos * powf(freq_base, inv_ndims * (float) r);
+ const float freq_factor = freq_factors ? freq_factors[ic] : 1.0f;
+
+ float cos_theta;
+ float sin_theta;
+ dsv4_rope_yarn(theta / freq_factor, freq_scale, corr_dims,
+ r, ext_factor, attn_factor,
+ cos_theta, sin_theta);
+ if (inverse) {
+ sin_theta = -sin_theta;
+ }
+
+ const int j0 = n_nope + r;
+ const int j1 = j0 + 1;
+ const float x0 = *((const float *) (src_base + j0 * nb00));
+ const float x1 = *((const float *) (src_base + j1 * nb00));
+ *((float *) (dst_base + j0 * nb0)) = x0 * cos_theta - x1 * sin_theta;
+ *((float *) (dst_base + j1 * nb0)) = x0 * sin_theta + x1 * cos_theta;
+ }
+ }
+}
+
+void ggml_cuda_op_dsv4_rope_tail(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+ const ggml_tensor * src0 = dst->src[0];
+ const ggml_tensor * pos = dst->src[1];
+ const ggml_tensor * ff = dst->src[2]; // optional; may be NULL
+
+ GGML_ASSERT(src0->type == GGML_TYPE_F32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(pos->type == GGML_TYPE_I32);
+
+ // op_params layout — matches Metal dispatch at
+ // ggml/src/ggml-metal/ggml-metal-ops.cpp:1606-1623 verbatim:
+ // [0] = n_dims (i32)
+ // [1] = mode (i32)
+ // [2] = n_ctx_orig (i32)
+ // [3] = inverse (i32, treated as bool)
+ // [4] = freq_base (f32)
+ // [5] = freq_scale (f32)
+ // [6] = ext_factor (f32)
+ // [7] = attn_factor (f32)
+ // [8] = beta_fast (f32)
+ // [9] = beta_slow (f32)
+ const int32_t n_dims = ggml_get_op_params_i32(dst, 0);
+ const int32_t mode = ggml_get_op_params_i32(dst, 1);
+ const int32_t n_ctx_orig = ggml_get_op_params_i32(dst, 2);
+ const int32_t inverse_i = ggml_get_op_params_i32(dst, 3);
+ const bool inverse = inverse_i != 0;
+
+ float freq_base;
+ float freq_scale;
+ float ext_factor;
+ float attn_factor;
+ float beta_fast;
+ float beta_slow;
+ memcpy(&freq_base, (const int32_t *) dst->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (const int32_t *) dst->op_params + 5, sizeof(float));
+ memcpy(&ext_factor, (const int32_t *) dst->op_params + 6, sizeof(float));
+ memcpy(&attn_factor, (const int32_t *) dst->op_params + 7, sizeof(float));
+ memcpy(&beta_fast, (const int32_t *) dst->op_params + 8, sizeof(float));
+ memcpy(&beta_slow, (const int32_t *) dst->op_params + 9, sizeof(float));
+
+ const bool is_neox = (mode == GGML_ROPE_TYPE_NEOX);
+
+ // Precompute YaRN corr_dims host-side (matches Metal call at
+ // ggml/src/ggml-metal/ggml-metal.metal:4927).
+ dsv4_rope_corr_dims corr_dims;
+ ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
+
+ const int ne00 = (int) src0->ne[0];
+ const int ne01 = (int) src0->ne[1];
+ const int ne02 = (int) src0->ne[2];
+ const int ne03 = (int) src0->ne[3];
+
+ GGML_ASSERT(ne01 > 0 && ne02 > 0 && ne03 > 0);
+
+ const int nth = std::min(256, std::max(1, ne00));
+ const dim3 grid(ne01, ne02, ne03);
+ const dim3 block(nth, 1, 1);
+
+ cudaStream_t stream = ctx.stream();
+ dsv4_rope_tail_f32_kernel<<>>(
+ (const float *) src0->data,
+ (const int *) pos->data,
+ ff ? (const float *) ff->data : nullptr,
+ (float *) dst->data,
+ ne00,
+ (int) src0->nb[0], (int) src0->nb[1], (int) src0->nb[2], (int) src0->nb[3],
+ (int) dst->nb[0], (int) dst->nb[1], (int) dst->nb[2], (int) dst->nb[3],
+ n_dims,
+ freq_base, freq_scale, ext_factor, attn_factor,
+ corr_dims,
+ is_neox, inverse);
+
+ CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-rope-tail.cuh b/ggml/src/ggml-cuda/dsv4-rope-tail.cuh
new file mode 100644
index 000000000000..24c1e5dac4fd
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-rope-tail.cuh
@@ -0,0 +1,26 @@
+#pragma once
+
+// V4 partial-RoPE: applies RoPE rotation to the last n_dims elements of each
+// row, leaving the non-RoPE prefix (i.e. the first ne00 - n_dims elements)
+// unchanged. The rotation math is the same as ggml_rope_ext (with YaRN
+// extrapolation when ext_factor != 0), restricted to the tail.
+//
+// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:4906-4997
+// CPU reference: ggml/src/ggml-cpu/ops.cpp:5961
+// Public API: ggml/include/ggml.h:2599 (ggml_dsv4_rope_tail)
+//
+// The dispatch function extracts op_params (i32 slots 0..3:
+// n_dims, mode, n_ctx_orig, inverse; f32 slots 4..9: freq_base, freq_scale,
+// ext_factor, attn_factor, beta_fast, beta_slow) from the destination
+// tensor, precomputes YaRN corr_dims host-side, and launches the kernel
+// with grid = (ne01, ne02, ne03), block.x = min(256, ne00), matching the
+// Metal dispatch at ggml/src/ggml-metal/ggml-metal-ops.cpp:1670.
+//
+// Supports the two RoPE modes the public V4 API allows
+// (ggml/src/ggml.c:6426 ASSERT mode == NORMAL || mode == NEOX). All
+// other modes are rejected via ggml_backend_cuda_device_supports_op so
+// the framework falls back to CPU rather than producing wrong output.
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_rope_tail(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index beeb52389464..a0feb032e215 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -391,6 +391,18 @@ static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
}
} else
+#else // FP16_AVAILABLE
+ // Software-FP16 fallback: compute in float, pack via __floats2half2_rn (RTNE).
+ // Required because the half-precision V-dequant template is instantiated
+ // for the full CC matrix (50/61/70/...), but FP16_AVAILABLE is undefined for CC < 600.
+ if constexpr (std::is_same_v) {
+ const float d = __half2float(x[ib].d);
+
+#pragma unroll
+ for (int l0 = 0; l0 < ne; l0 += 2) {
+ ((half2 *) dst)[l0/2] = __floats2half2_rn(d * q8[l0 + 0], d * q8[l0 + 1]);
+ }
+ } else
#endif // FP16_AVAILABLE
if constexpr (std::is_same_v) {
const float d = x[ib].d;
@@ -431,6 +443,16 @@ static __device__ __forceinline__ void dequantize_V_q4_1(const void * __restrict
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
}
} else
+#else // FP16_AVAILABLE
+ // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale.
+ if constexpr (std::is_same_v) {
+ const float2 dm = __half22float2(x[ib].dm);
+
+#pragma unroll
+ for (int l0 = 0; l0 < ne; l0 += 2) {
+ ((half2 *) dst)[l0/2] = __floats2half2_rn(dm.x * q8[l0 + 0] + dm.y, dm.x * q8[l0 + 1] + dm.y);
+ }
+ } else
#endif // FP16_AVAILABLE
if constexpr (std::is_same_v) {
const float2 dm = __half22float2(x[ib].dm);
@@ -481,6 +503,16 @@ static __device__ __forceinline__ void dequantize_V_q5_0(const void * __restrict
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
}
} else
+#else // FP16_AVAILABLE
+ // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale.
+ if constexpr (std::is_same_v) {
+ const float d = __half2float(x[ib].d);
+
+#pragma unroll
+ for (int l0 = 0; l0 < ne; l0 += 2) {
+ ((half2 *) dst)[l0/2] = __floats2half2_rn(d * q8[l0 + 0], d * q8[l0 + 1]);
+ }
+ } else
#endif // FP16_AVAILABLE
if constexpr (std::is_same_v) {
const float d = x[ib].d;
@@ -531,6 +563,16 @@ static __device__ __forceinline__ void dequantize_V_q5_1(const void * __restrict
((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
}
} else
+#else // FP16_AVAILABLE
+ // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale.
+ if constexpr (std::is_same_v) {
+ const float2 dm = __half22float2(x[ib].dm);
+
+#pragma unroll
+ for (int l0 = 0; l0 < ne; l0 += 2) {
+ ((half2 *) dst)[l0/2] = __floats2half2_rn(dm.x * q8[l0 + 0] + dm.y, dm.x * q8[l0 + 1] + dm.y);
+ }
+ } else
#endif // FP16_AVAILABLE
if constexpr (std::is_same_v) {
const float2 dm = __half22float2(x[ib].dm);
@@ -564,6 +606,16 @@ static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict
((half2 *) dst)[l0/2] = d * make_half2(qs[l0 + 0], qs[l0 + 1]);
}
} else
+#else // FP16_AVAILABLE
+ // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale.
+ if constexpr (std::is_same::value) {
+ const float d = __half2float(x[ib].d);
+
+#pragma unroll
+ for (int l0 = 0; l0 < ne; l0 += 2) {
+ ((half2 *) dst)[l0/2] = __floats2half2_rn(d * qs[l0 + 0], d * qs[l0 + 1]);
+ }
+ } else
#endif // FP16_AVAILABLE
if constexpr (std::is_same::value) {
const float d = x[ib].d;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e25be3592fd4..e0fbceec7a0e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -23,6 +23,11 @@
#include "ggml-cuda/cumsum.cuh"
#include "ggml-cuda/diagmask.cuh"
#include "ggml-cuda/diag.cuh"
+#include "ggml-cuda/dsv4-fp8-kv-quantize.cuh"
+#include "ggml-cuda/dsv4-hc-expand.cuh"
+#include "ggml-cuda/dsv4-hc-split-sinkhorn.cuh"
+#include "ggml-cuda/dsv4-hc-weighted-sum.cuh"
+#include "ggml-cuda/dsv4-rope-tail.cuh"
#include "ggml-cuda/fattn.cuh"
#include "ggml-cuda/getrows.cuh"
#include "ggml-cuda/im2col.cuh"
@@ -2777,7 +2782,52 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
nb1, nb2, nb3, stream);
}
+// ---------- DSV4 debug logging (env-gated, set GGML_DSV4_DEBUG=1 to enable) ----------
+static bool dsv4_debug_enabled() {
+ static const bool enabled = (getenv("GGML_DSV4_DEBUG") != nullptr);
+ return enabled;
+}
+
+static const char * dsv4_op_short(enum ggml_op op) {
+ switch (op) {
+ case GGML_OP_DSV4_ROPE_TAIL: return "DSV4_ROPE_TAIL";
+ case GGML_OP_DSV4_HC_SPLIT_SINKHORN: return "DSV4_HC_SPLIT_SINKHORN";
+ case GGML_OP_DSV4_HC_WEIGHTED_SUM: return "DSV4_HC_WEIGHTED_SUM";
+ case GGML_OP_DSV4_HC_EXPAND: return "DSV4_HC_EXPAND";
+ case GGML_OP_DSV4_FP8_KV_QUANTIZE: return "DSV4_FP8_KV_QUANTIZE";
+ default: return nullptr;
+ }
+}
+
+static bool dsv4_op_is_v4(enum ggml_op op) {
+ return dsv4_op_short(op) != nullptr;
+}
+
+static void dsv4_log_op_entry(int device, const struct ggml_tensor * dst) {
+ if (!dsv4_debug_enabled() || !dsv4_op_is_v4(dst->op)) return;
+ fprintf(stderr, "[DSV4_DEBUG] dev=%d op=%s dst=%s(%s) shape=[%lld,%lld,%lld,%lld]\n",
+ device, dsv4_op_short(dst->op),
+ dst->name, ggml_type_name(dst->type),
+ (long long) dst->ne[0], (long long) dst->ne[1],
+ (long long) dst->ne[2], (long long) dst->ne[3]);
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (!dst->src[i]) continue;
+ const char * buft_name = "(null-buf)";
+ int is_split = 0;
+ if (dst->src[i]->buffer) {
+ buft_name = ggml_backend_buft_name(dst->src[i]->buffer->buft);
+ is_split = ggml_backend_buft_is_cuda_split(dst->src[i]->buffer->buft) ? 1 : 0;
+ }
+ fprintf(stderr, "[DSV4_DEBUG] src[%d]=%s(%s) buft=%s split=%d data=%p extra=%p\n",
+ i, dst->src[i]->name, ggml_type_name(dst->src[i]->type),
+ buft_name, is_split, dst->src[i]->data, (void *) dst->src[i]->extra);
+ }
+ fflush(stderr);
+}
+// ---------- end DSV4 debug ----------
+
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
+ dsv4_log_op_entry(ctx.device, dst);
switch (dst->op) {
case GGML_OP_ARGMAX:
ggml_cuda_argmax(ctx, dst);
@@ -3020,6 +3070,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ROPE_BACK:
ggml_cuda_op_rope_back(ctx, dst);
break;
+ case GGML_OP_DSV4_ROPE_TAIL:
+ ggml_cuda_op_dsv4_rope_tail(ctx, dst);
+ break;
case GGML_OP_ROLL:
ggml_cuda_op_roll(ctx, dst);
break;
@@ -3089,6 +3142,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_RWKV_WKV7:
ggml_cuda_op_rwkv_wkv7(ctx, dst);
break;
+ case GGML_OP_DSV4_HC_EXPAND:
+ ggml_cuda_op_dsv4_hc_expand(ctx, dst);
+ break;
+ case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+ ggml_cuda_op_dsv4_fp8_kv_quantize(ctx, dst);
+ break;
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
ggml_cuda_cross_entropy_loss_back(ctx, dst);
break;
@@ -3104,6 +3163,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_FILL:
ggml_cuda_op_fill(ctx, dst);
break;
+ case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+ ggml_cuda_op_dsv4_hc_split_sinkhorn(ctx, dst);
+ break;
+ case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+ ggml_cuda_op_dsv4_hc_weighted_sum(ctx, dst);
+ break;
default:
return false;
}
@@ -3208,7 +3273,30 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
#ifdef GGML_CUDA_NO_PEER_COPY
return false;
#else
+ if (dsv4_debug_enabled()) {
+ fprintf(stderr, "[DSV4_DEBUG] peer-copy: src_dev=%d dst_dev=%d bytes=%zu "
+ "src=%s(%s,op=%s,buft=%s) dst=%s(%s,op=%s,buft=%s) src_ptr=%p dst_ptr=%p\n",
+ cuda_ctx_src->device, cuda_ctx_dst->device, ggml_nbytes(dst),
+ src->name, ggml_type_name(src->type), ggml_op_name(src->op),
+ src->buffer ? ggml_backend_buft_name(src->buffer->buft) : "?",
+ dst->name, ggml_type_name(dst->type), ggml_op_name(dst->op),
+ dst->buffer ? ggml_backend_buft_name(dst->buffer->buft) : "?",
+ src->data, dst->data);
+ fflush(stderr);
+ // Force any deferred CUDA error to surface BEFORE the next op, so the log line
+ // immediately above truly identifies the failing copy (codex review nit #1).
+ cudaError_t pre_err = cudaGetLastError();
+ if (pre_err != cudaSuccess) {
+ fprintf(stderr, "[DSV4_DEBUG] pre-copy stale error: %s\n", cudaGetErrorString(pre_err));
+ fflush(stderr);
+ }
+ }
CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
+ if (dsv4_debug_enabled()) {
+ // Synchronous wait to force the async error (if any) to surface at the offending copy,
+ // not at some later API call. Heavy perturbation — only with GGML_DSV4_DEBUG=1.
+ CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
+ }
#endif // GGML_CUDA_NO_PEER_COPY
}
@@ -5034,8 +5122,17 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
- // split buffers can only be used with GGML_OP_MUL_MAT
- if (op->op != GGML_OP_MUL_MAT) {
+ // split buffers can only be used with GGML_OP_MUL_MAT and DeepSeek V4 custom ops.
+ // Without the DSV4 exception, multi-GPU scheduler rejects the V4 ops once their
+ // weight tensors land in cuda_split buffers and falls back to CPU — which then
+ // corrupts data via host<->device transfer mismatches and crashes during decode.
+ // Reported and root-caused by @DenisVASI9 on an 8x A100 40GB rig.
+ if (op->op != GGML_OP_MUL_MAT &&
+ op->op != GGML_OP_DSV4_HC_SPLIT_SINKHORN &&
+ op->op != GGML_OP_DSV4_HC_WEIGHTED_SUM &&
+ op->op != GGML_OP_DSV4_HC_EXPAND &&
+ op->op != GGML_OP_DSV4_FP8_KV_QUANTIZE &&
+ op->op != GGML_OP_DSV4_ROPE_TAIL) {
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
return false;
@@ -5053,6 +5150,30 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
}
}
+ // Some ops write through a pre-allocated destination buffer (e.g. SET_ROWS
+ // into a KV cache). For those, the dst lives on a specific device — dispatching
+ // the op on a different device causes the CUDA kernel to write through a
+ // foreign-device pointer (dst->data), surfacing as cudaErrorIllegalAddress.
+ //
+ // SET_ROWS returns a view tensor (ggml_view_tensor(ctx, a)) so op->buffer is
+ // nullptr. We must walk the view chain to find the real buffer.
+ // Diagnosed via CUDA_LAUNCH_BLOCKING=1 + GGML_DSV4_DEBUG=1 on @DenisVASI9's
+ // 8x A100 rig: V4's dsv4_store_cache_rows emits SET_ROWS at layer-7 K-cache
+ // (on CUDA1) while sched dispatched on CUDA0 → illegal access.
+ {
+ const ggml_tensor * t = op;
+ while (t->view_src) {
+ t = t->view_src;
+ }
+ if (t->buffer && ggml_backend_buft_is_cuda(t->buffer->buft)) {
+ ggml_backend_cuda_buffer_type_context * buft_ctx =
+ (ggml_backend_cuda_buffer_type_context *) t->buffer->buft->context;
+ if (buft_ctx->device != dev_ctx->device) {
+ return false;
+ }
+ }
+ }
+
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
@@ -5358,6 +5479,23 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_ROPE_BACK: {
return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
}
+ case GGML_OP_DSV4_ROPE_TAIL: {
+ // Only F32 in/out is supported on this kernel (matches Metal kargs).
+ if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) {
+ return false;
+ }
+ // Kernel implements mode == NORMAL (0) and mode == NEOX (2);
+ // any other mode is rejected so the framework falls back to CPU
+ // rather than producing wrong output. ggml/src/ggml.c:6426 ASSERTs
+ // this constraint at op-construction time, but we re-check here
+ // for defense-in-depth.
+ const int32_t mode = ggml_get_op_params_i32(op, 1);
+ if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
+ return false;
+ }
+ // Same contiguity requirement as GGML_OP_ROPE.
+ return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
+ }
case GGML_OP_IM2COL:
case GGML_OP_IM2COL_3D:
case GGML_OP_CONV_2D:
@@ -5393,6 +5531,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_GATED_LINEAR_ATTN:
case GGML_OP_RWKV_WKV7:
return true;
+ case GGML_OP_DSV4_HC_EXPAND:
+ return op->type == GGML_TYPE_F32
+ && op->src[0]->type == GGML_TYPE_F32
+ && op->src[1]->type == GGML_TYPE_F32
+ && op->src[2]->type == GGML_TYPE_F32
+ && op->src[3]->type == GGML_TYPE_F32;
case GGML_OP_GATED_DELTA_NET:
//TODO: enable once MUSA compiler is solved https://github.com/ggml-org/llama.cpp/pull/19504#issuecomment-4018634327
#ifdef GGML_USE_MUSA
@@ -5400,6 +5544,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
#else
return true;
#endif // GGML_USE_MUSA
+ case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+ return op->type == GGML_TYPE_F32
+ && op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_FLASH_ATTN_EXT:
return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -5412,6 +5559,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
case GGML_OP_DIAG:
case GGML_OP_SOLVE_TRI:
return true;
+ case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+ return op->src[0]->type == GGML_TYPE_F32 &&
+ op->src[1]->type == GGML_TYPE_F32 &&
+ op->src[2]->type == GGML_TYPE_F32 &&
+ op->type == GGML_TYPE_F32;
+ case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+ return op->type == GGML_TYPE_F32
+ && op->src[0]->type == GGML_TYPE_F32
+ && op->src[1]->type == GGML_TYPE_F32;
default:
return false;
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
index e288a27f992a..d6e5bf98e04f 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -459,6 +459,54 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max(ggml_me
return res;
}
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_split_sinkhorn(ggml_metal_library_t lib, const ggml_tensor * op) {
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ const char * name = "kernel_dsv4_hc_split_sinkhorn";
+
+ ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+ if (!res.pipeline) {
+ res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+ }
+
+ return res;
+}
+
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_weighted_sum(ggml_metal_library_t lib, const ggml_tensor * op) {
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ const char * name = "kernel_dsv4_hc_weighted_sum";
+
+ ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+ if (!res.pipeline) {
+ res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+ }
+
+ return res;
+}
+
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_expand(ggml_metal_library_t lib, const ggml_tensor * op) {
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[3]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ const char * name = "kernel_dsv4_hc_expand";
+
+ ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+ if (!res.pipeline) {
+ res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+ }
+
+ return res;
+}
+
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv(ggml_metal_library_t lib, const ggml_tensor * op) {
GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
@@ -1429,6 +1477,36 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
return res;
}
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_fp8_kv_quantize(ggml_metal_library_t lib, const ggml_tensor * op) {
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ const char * name = "kernel_dsv4_fp8_kv_quantize_f32";
+
+ ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+ if (!res.pipeline) {
+ res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+ }
+
+ res.smem = 64*sizeof(float);
+
+ return res;
+}
+
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_rope_tail(ggml_metal_library_t lib, const ggml_tensor * op) {
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ const char * name = "kernel_dsv4_rope_tail_f32";
+
+ ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+ if (!res.pipeline) {
+ res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+ }
+
+ return res;
+}
+
ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_vec(
ggml_metal_library_t lib,
const ggml_tensor * op,
diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h
index 1f212a92f985..1e3a8485f922 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -123,6 +123,11 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_bl
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_add (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tri (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_split_sinkhorn(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_weighted_sum(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_expand (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_fp8_kv_quantize(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_rope_tail (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv (ggml_metal_library_t lib, const struct ggml_tensor * op);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan (ggml_metal_library_t lib, const struct ggml_tensor * op);
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 780dfe81bb3c..f8b2e65c2a46 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -1187,6 +1187,53 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
case GGML_OP_SSM_CONV:
case GGML_OP_SSM_SCAN:
return has_simdgroup_reduction;
+ case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+ return ggml_is_contiguous_rows(op->src[0]) &&
+ ggml_is_contiguous(op->src[1]) &&
+ ggml_is_contiguous(op->src[2]) &&
+ op->src[0]->type == GGML_TYPE_F32 &&
+ op->src[1]->type == GGML_TYPE_F32 &&
+ op->src[2]->type == GGML_TYPE_F32 &&
+ op->type == GGML_TYPE_F32;
+ case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+ return op->src[0]->type == GGML_TYPE_F32 &&
+ op->src[1]->type == GGML_TYPE_F32 &&
+ op->type == GGML_TYPE_F32 &&
+ op->src[0]->ne[0] == op->ne[0] &&
+ op->src[0]->ne[1] == op->src[1]->ne[0] &&
+ op->src[0]->ne[2] == op->ne[1] &&
+ op->src[1]->ne[1] == op->ne[1];
+ case GGML_OP_DSV4_HC_EXPAND:
+ return op->src[0]->type == GGML_TYPE_F32 &&
+ op->src[1]->type == GGML_TYPE_F32 &&
+ op->src[2]->type == GGML_TYPE_F32 &&
+ op->src[3]->type == GGML_TYPE_F32 &&
+ op->type == GGML_TYPE_F32 &&
+ op->src[0]->ne[0] == op->ne[0] &&
+ op->src[0]->ne[1] == op->ne[2] &&
+ op->src[1]->ne[0] == op->ne[0] &&
+ op->src[1]->ne[1] == op->ne[1] &&
+ op->src[1]->ne[2] == op->ne[2] &&
+ op->src[2]->ne[0] == op->ne[1] &&
+ op->src[2]->ne[1] == op->ne[2] &&
+ op->src[3]->ne[0] == op->ne[1] &&
+ op->src[3]->ne[1] == op->ne[1] &&
+ op->src[3]->ne[2] == op->ne[2];
+ case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+ return op->src[0]->type == GGML_TYPE_F32 &&
+ op->type == GGML_TYPE_F32 &&
+ op->src[0]->ne[0] > ggml_get_op_params_i32(op, 0) &&
+ (op->src[0]->ne[0] - ggml_get_op_params_i32(op, 0)) % 64 == 0;
+ case GGML_OP_DSV4_ROPE_TAIL:
+ return op->src[0]->type == GGML_TYPE_F32 &&
+ op->src[1]->type == GGML_TYPE_I32 &&
+ op->type == GGML_TYPE_F32 &&
+ op->src[0]->ne[2] == op->src[1]->ne[0] &&
+ ggml_get_op_params_i32(op, 0) > 0 &&
+ ggml_get_op_params_i32(op, 0) <= op->src[0]->ne[0] &&
+ ggml_get_op_params_i32(op, 0) % 2 == 0 &&
+ (ggml_get_op_params_i32(op, 1) == GGML_ROPE_TYPE_NORMAL ||
+ ggml_get_op_params_i32(op, 1) == GGML_ROPE_TYPE_NEOX);
case GGML_OP_RWKV_WKV6:
case GGML_OP_RWKV_WKV7:
return true;
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
index ff74cafb5b79..48301eff4709 100644
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -798,6 +798,90 @@ typedef struct {
int32_t n_head_log2;
} ggml_metal_kargs_soft_max;
+typedef struct {
+ int32_t n_hc;
+ int32_t sinkhorn_iters;
+ int64_t n_rows;
+ int64_t mix_hc;
+ uint64_t nb01;
+ uint64_t nb1;
+ float eps;
+} ggml_metal_kargs_dsv4_hc_split_sinkhorn;
+
+typedef struct {
+ int64_t n_embd;
+ int64_t n_hc;
+ int64_t n_tokens;
+ uint64_t nb_x0;
+ uint64_t nb_x1;
+ uint64_t nb_x2;
+ uint64_t nb_w0;
+ uint64_t nb_w1;
+ uint64_t nb0;
+ uint64_t nb1;
+} ggml_metal_kargs_dsv4_hc_weighted_sum;
+
+typedef struct {
+ int64_t n_embd;
+ int64_t n_hc;
+ int64_t n_tokens;
+ uint64_t nb_block0;
+ uint64_t nb_block1;
+ uint64_t nb_res0;
+ uint64_t nb_res1;
+ uint64_t nb_res2;
+ uint64_t nb_post0;
+ uint64_t nb_post1;
+ uint64_t nb_comb0;
+ uint64_t nb_comb1;
+ uint64_t nb_comb2;
+ uint64_t nb0;
+ uint64_t nb1;
+ uint64_t nb2;
+} ggml_metal_kargs_dsv4_hc_expand;
+
+typedef struct {
+ int64_t ne00;
+ int64_t ne01;
+ int64_t ne02;
+ int64_t ne03;
+ uint64_t nb00;
+ uint64_t nb01;
+ uint64_t nb02;
+ uint64_t nb03;
+ uint64_t nb0;
+ uint64_t nb1;
+ uint64_t nb2;
+ uint64_t nb3;
+ int32_t n_rot;
+} ggml_metal_kargs_dsv4_fp8_kv_quantize;
+
+typedef struct {
+ int64_t ne00;
+ int64_t ne01;
+ int64_t ne02;
+ int64_t ne03;
+ uint64_t nb00;
+ uint64_t nb01;
+ uint64_t nb02;
+ uint64_t nb03;
+ uint64_t nb0;
+ uint64_t nb1;
+ uint64_t nb2;
+ uint64_t nb3;
+ int32_t n_dims;
+ int32_t mode;
+ int32_t n_ctx_orig;
+ int32_t inverse;
+ float freq_base;
+ float freq_scale;
+ float ext_factor;
+ float attn_factor;
+ float beta_fast;
+ float beta_slow;
+ bool src2;
+} ggml_metal_kargs_dsv4_rope_tail;
+
typedef struct {
int64_t ne00;
int64_t ne01;
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index a114391c2e8c..b2f6ed37847d 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -320,6 +320,26 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
{
n_fuse = ggml_metal_op_soft_max(ctx, idx);
} break;
+ case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+ {
+ n_fuse = ggml_metal_op_dsv4_hc_split_sinkhorn(ctx, idx);
+ } break;
+ case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+ {
+ n_fuse = ggml_metal_op_dsv4_hc_weighted_sum(ctx, idx);
+ } break;
+ case GGML_OP_DSV4_HC_EXPAND:
+ {
+ n_fuse = ggml_metal_op_dsv4_hc_expand(ctx, idx);
+ } break;
+ case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+ {
+ n_fuse = ggml_metal_op_dsv4_fp8_kv_quantize(ctx, idx);
+ } break;
+ case GGML_OP_DSV4_ROPE_TAIL:
+ {
+ n_fuse = ggml_metal_op_dsv4_rope_tail(ctx, idx);
+ } break;
case GGML_OP_SSM_CONV:
{
n_fuse = ggml_metal_op_ssm_conv(ctx, idx);
@@ -1369,6 +1389,289 @@ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
return 1;
}
+int ggml_metal_op_dsv4_hc_split_sinkhorn(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[0]->ne[2] == 1);
+ GGML_ASSERT(op->src[0]->ne[3] == 1);
+
+ const int32_t n_hc = ggml_get_op_params_i32(op, 0);
+ const int32_t sinkhorn_iters = ggml_get_op_params_i32(op, 1);
+ const float eps = ggml_get_op_params_f32(op, 2);
+
+ GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
+
+ ggml_metal_kargs_dsv4_hc_split_sinkhorn args = {
+ /*.n_hc =*/ n_hc,
+ /*.sinkhorn_iters =*/ sinkhorn_iters,
+ /*.n_rows =*/ ne01*ne02*ne03,
+ /*.mix_hc =*/ ne00,
+ /*.nb01 =*/ nb01,
+ /*.nb1 =*/ nb1,
+ /*.eps =*/ eps,
+ };
+
+ auto pipeline = ggml_metal_library_get_pipeline_dsv4_hc_split_sinkhorn(lib, op);
+
+ const int nth = std::min(256, std::max(1, args.n_rows));
+ const int n_tg = (args.n_rows + nth - 1) / nth;
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, nth, 1, 1);
+
+ return 1;
+}
+
+int ggml_metal_op_dsv4_hc_weighted_sum(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ ggml_tensor * x = op->src[0];
+ ggml_tensor * weights = op->src[1];
+
+ GGML_TENSOR_LOCALS(int64_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
+ GGML_TENSOR_LOCALS(uint64_t, nb_x, x, nb);
+ GGML_TENSOR_LOCALS(uint64_t, nb_w, weights, nb);
+
+ ggml_metal_kargs_dsv4_hc_weighted_sum args = {
+ /*.n_embd =*/ ne0,
+ /*.n_hc =*/ x->ne[1],
+ /*.n_tokens =*/ ne1,
+ /*.nb_x0 =*/ nb_x0,
+ /*.nb_x1 =*/ nb_x1,
+ /*.nb_x2 =*/ nb_x2,
+ /*.nb_w0 =*/ nb_w0,
+ /*.nb_w1 =*/ nb_w1,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ };
+
+ auto pipeline = ggml_metal_library_get_pipeline_dsv4_hc_weighted_sum(lib, op);
+
+ const int64_t n_elem = ne0*ne1;
+ const int nth = std::min(256, std::max(1, n_elem));
+ const int n_tg = (n_elem + nth - 1) / nth;
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(x), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(weights), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 3);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, nth, 1, 1);
+
+ return 1;
+}
+
+int ggml_metal_op_dsv4_hc_expand(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[3]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ ggml_tensor * block_out = op->src[0];
+ ggml_tensor * residual = op->src[1];
+ ggml_tensor * post = op->src[2];
+ ggml_tensor * comb = op->src[3];
+
+ GGML_TENSOR_LOCALS(int64_t, ne, op, ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
+ GGML_TENSOR_LOCALS(uint64_t, nb_block, block_out, nb);
+ GGML_TENSOR_LOCALS(uint64_t, nb_res, residual, nb);
+ GGML_TENSOR_LOCALS(uint64_t, nb_post, post, nb);
+ GGML_TENSOR_LOCALS(uint64_t, nb_comb, comb, nb);
+
+ ggml_metal_kargs_dsv4_hc_expand args = {
+ /*.n_embd =*/ ne0,
+ /*.n_hc =*/ ne1,
+ /*.n_tokens =*/ ne2,
+ /*.nb_block0 =*/ nb_block0,
+ /*.nb_block1 =*/ nb_block1,
+ /*.nb_res0 =*/ nb_res0,
+ /*.nb_res1 =*/ nb_res1,
+ /*.nb_res2 =*/ nb_res2,
+ /*.nb_post0 =*/ nb_post0,
+ /*.nb_post1 =*/ nb_post1,
+ /*.nb_comb0 =*/ nb_comb0,
+ /*.nb_comb1 =*/ nb_comb1,
+ /*.nb_comb2 =*/ nb_comb2,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ };
+
+ auto pipeline = ggml_metal_library_get_pipeline_dsv4_hc_expand(lib, op);
+
+ const int64_t n_elem = ne0*ne1*ne2;
+ const int nth = std::min(256, std::max(1, n_elem));
+ const int n_tg = (n_elem + nth - 1) / nth;
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(block_out), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(residual), 2);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(post), 3);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(comb), 4);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 5);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, nth, 1, 1);
+
+ return 1;
+}
+
+int ggml_metal_op_dsv4_fp8_kv_quantize(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ const int32_t n_rot = ggml_get_op_params_i32(op, 0);
+
+ GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
+
+ ggml_metal_kargs_dsv4_fp8_kv_quantize args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ /*.n_rot =*/ n_rot,
+ };
+
+ auto pipeline = ggml_metal_library_get_pipeline_dsv4_fp8_kv_quantize(lib, op);
+
+ const int64_t n_rows = ne01*ne02*ne03;
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 2);
+ ggml_metal_encoder_set_threadgroup_memory_size(enc, pipeline.smem, 0);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, n_rows, 1, 1, 64, 1, 1);
+
+ return 1;
+}
+
+int ggml_metal_op_dsv4_rope_tail(ggml_metal_op_t ctx, int idx) {
+ ggml_tensor * op = ctx->node(idx);
+
+ ggml_metal_library_t lib = ctx->lib;
+ ggml_metal_encoder_t enc = ctx->enc;
+
+ GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+ GGML_ASSERT(op->src[1]->type == GGML_TYPE_I32);
+ GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+ const int32_t n_dims = ggml_get_op_params_i32(op, 0);
+ const int32_t mode = ggml_get_op_params_i32(op, 1);
+ const int32_t n_ctx_orig = ggml_get_op_params_i32(op, 2);
+ const int32_t inverse = ggml_get_op_params_i32(op, 3);
+
+ float freq_base;
+ float freq_scale;
+ float ext_factor;
+ float attn_factor;
+ float beta_fast;
+ float beta_slow;
+
+ memcpy(&freq_base, (const int32_t *) op->op_params + 4, sizeof(float));
+ memcpy(&freq_scale, (const int32_t *) op->op_params + 5, sizeof(float));
+ memcpy(&ext_factor, (const int32_t *) op->op_params + 6, sizeof(float));
+ memcpy(&attn_factor, (const int32_t *) op->op_params + 7, sizeof(float));
+ memcpy(&beta_fast, (const int32_t *) op->op_params + 8, sizeof(float));
+ memcpy(&beta_slow, (const int32_t *) op->op_params + 9, sizeof(float));
+
+ GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne);
+ GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+ GGML_TENSOR_LOCALS(uint64_t, nb, op, nb);
+
+ ggml_metal_kargs_dsv4_rope_tail args = {
+ /*.ne00 =*/ ne00,
+ /*.ne01 =*/ ne01,
+ /*.ne02 =*/ ne02,
+ /*.ne03 =*/ ne03,
+ /*.nb00 =*/ nb00,
+ /*.nb01 =*/ nb01,
+ /*.nb02 =*/ nb02,
+ /*.nb03 =*/ nb03,
+ /*.nb0 =*/ nb0,
+ /*.nb1 =*/ nb1,
+ /*.nb2 =*/ nb2,
+ /*.nb3 =*/ nb3,
+ /*.n_dims =*/ n_dims,
+ /*.mode =*/ mode,
+ /*.n_ctx_orig =*/ n_ctx_orig,
+ /*.inverse =*/ inverse,
+ /*.freq_base =*/ freq_base,
+ /*.freq_scale =*/ freq_scale,
+ /*.ext_factor =*/ ext_factor,
+ /*.attn_factor =*/ attn_factor,
+ /*.beta_fast =*/ beta_fast,
+ /*.beta_slow =*/ beta_slow,
+ /*.src2 =*/ op->src[2] != nullptr,
+ };
+
+ auto pipeline = ggml_metal_library_get_pipeline_dsv4_rope_tail(lib, op);
+
+ const int nth = std::min(256, std::max(1, ne00));
+
+ ggml_metal_encoder_set_pipeline(enc, pipeline);
+ ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+ if (op->src[2]) {
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[2]), 3);
+ } else {
+ ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 3);
+ }
+ ggml_metal_encoder_set_buffer (enc, ggml_metal_get_buffer_id(op), 4);
+
+ ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+ return 1;
+}
+
int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
ggml_tensor * op = ctx->node(idx);
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h
index 36c61071b4fa..e402a450e619 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.h
+++ b/ggml/src/ggml-metal/ggml-metal-ops.h
@@ -55,6 +55,11 @@ int ggml_metal_op_get_rows (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_set_rows (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_diag (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_soft_max (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_hc_split_sinkhorn(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_hc_weighted_sum(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_hc_expand (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_fp8_kv_quantize(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_rope_tail (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_ssm_conv (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_ssm_scan (ggml_metal_op_t ctx, int idx);
int ggml_metal_op_rwkv (ggml_metal_op_t ctx, int idx);
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index f6ffb2b3a1c6..37cabfcdb2bb 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2073,6 +2073,308 @@ template [[host_name("kernel_soft_max_f32")]] kernel kernel_soft_max_t kerne
template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4;
template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4;
+kernel void kernel_dsv4_hc_split_sinkhorn(
+ constant ggml_metal_kargs_dsv4_hc_split_sinkhorn & args,
+ device const float * mixes,
+ device const float * scale,
+ device const float * base,
+ device float * dst,
+ uint tid [[thread_position_in_grid]]) {
+ if ((int64_t) tid >= args.n_rows) {
+ return;
+ }
+
+ constexpr int HC_MAX = 16;
+ const int HC = args.n_hc;
+ if (HC <= 0 || HC > HC_MAX) {
+ return;
+ }
+
+ device const float * mix = mixes + ((int64_t) tid)*args.mix_hc;
+ device float * out = dst + ((int64_t) tid)*args.mix_hc;
+
+ const float epsv = args.eps;
+ const float pre_scale = scale[0];
+ const float post_scale = scale[1];
+ const float comb_scale = scale[2];
+
+ if (HC == 4) {
+ const float4 pre_z =
+ *((device const float4 *) mix) * pre_scale +
+ *((device const float4 *) base);
+ *((device float4 *) out) = 1.0f / (1.0f + exp(-pre_z)) + epsv;
+
+ const float4 post_z =
+ *((device const float4 *) (mix + 4)) * post_scale +
+ *((device const float4 *) (base + 4));
+ *((device float4 *) (out + 4)) = 2.0f / (1.0f + exp(-post_z));
+
+ float4 r0 =
+ *((device const float4 *) (mix + 8)) * comb_scale +
+ *((device const float4 *) (base + 8));
+ float4 r1 =
+ *((device const float4 *) (mix + 12)) * comb_scale +
+ *((device const float4 *) (base + 12));
+ float4 r2 =
+ *((device const float4 *) (mix + 16)) * comb_scale +
+ *((device const float4 *) (base + 16));
+ float4 r3 =
+ *((device const float4 *) (mix + 20)) * comb_scale +
+ *((device const float4 *) (base + 20));
+
+ const float m0 = max(max(r0.x, r0.y), max(r0.z, r0.w));
+ const float m1 = max(max(r1.x, r1.y), max(r1.z, r1.w));
+ const float m2 = max(max(r2.x, r2.y), max(r2.z, r2.w));
+ const float m3 = max(max(r3.x, r3.y), max(r3.z, r3.w));
+
+ r0 = exp(r0 - m0);
+ r1 = exp(r1 - m1);
+ r2 = exp(r2 - m2);
+ r3 = exp(r3 - m3);
+
+ r0 = r0 * (1.0f / (r0.x + r0.y + r0.z + r0.w)) + epsv;
+ r1 = r1 * (1.0f / (r1.x + r1.y + r1.z + r1.w)) + epsv;
+ r2 = r2 * (1.0f / (r2.x + r2.y + r2.z + r2.w)) + epsv;
+ r3 = r3 * (1.0f / (r3.x + r3.y + r3.z + r3.w)) + epsv;
+
+ float4 col_inv = 1.0f / (r0 + r1 + r2 + r3 + epsv);
+ r0 *= col_inv;
+ r1 *= col_inv;
+ r2 *= col_inv;
+ r3 *= col_inv;
+
+ for (int iter = 1; iter < args.sinkhorn_iters; ++iter) {
+ r0 *= 1.0f / (r0.x + r0.y + r0.z + r0.w + epsv);
+ r1 *= 1.0f / (r1.x + r1.y + r1.z + r1.w + epsv);
+ r2 *= 1.0f / (r2.x + r2.y + r2.z + r2.w + epsv);
+ r3 *= 1.0f / (r3.x + r3.y + r3.z + r3.w + epsv);
+
+ col_inv = 1.0f / (r0 + r1 + r2 + r3 + epsv);
+ r0 *= col_inv;
+ r1 *= col_inv;
+ r2 *= col_inv;
+ r3 *= col_inv;
+ }
+
+ *((device float4 *) (out + 8)) = r0;
+ *((device float4 *) (out + 12)) = r1;
+ *((device float4 *) (out + 16)) = r2;
+ *((device float4 *) (out + 20)) = r3;
+ return;
+ }
+
+ for (int i = 0; i < HC; ++i) {
+ const float z = mix[i] * pre_scale + base[i];
+ out[i] = 1.0f / (1.0f + exp(-z)) + epsv;
+ }
+
+ for (int i = 0; i < HC; ++i) {
+ const int off = HC + i;
+ const float z = mix[off] * post_scale + base[off];
+ out[off] = 2.0f / (1.0f + exp(-z));
+ }
+
+ float c[HC_MAX*HC_MAX];
+
+ for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+ float row_max = -INFINITY;
+ for (int src_hc = 0; src_hc < HC; ++src_hc) {
+ const int idx = src_hc + dst_hc*HC;
+ const int off = 2*HC + idx;
+ const float v = mix[off] * comb_scale + base[off];
+ c[idx] = v;
+ row_max = max(row_max, v);
+ }
+
+ float row_sum = 0.0f;
+ for (int src_hc = 0; src_hc < HC; ++src_hc) {
+ const int idx = src_hc + dst_hc*HC;
+ const float v = exp(c[idx] - row_max);
+ c[idx] = v;
+ row_sum += v;
+ }
+
+ const float inv_sum = 1.0f / row_sum;
+ for (int src_hc = 0; src_hc < HC; ++src_hc) {
+ const int idx = src_hc + dst_hc*HC;
+ c[idx] = c[idx] * inv_sum + epsv;
+ }
+ }
+
+ for (int src_hc = 0; src_hc < HC; ++src_hc) {
+ float sum = 0.0f;
+ for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+ sum += c[src_hc + dst_hc*HC];
+ }
+
+ const float inv_denom = 1.0f / (sum + epsv);
+ for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+ c[src_hc + dst_hc*HC] *= inv_denom;
+ }
+ }
+
+ for (int iter = 1; iter < args.sinkhorn_iters; ++iter) {
+ for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+ float sum = 0.0f;
+ for (int src_hc = 0; src_hc < HC; ++src_hc) {
+ sum += c[src_hc + dst_hc*HC];
+ }
+
+ const float inv_denom = 1.0f / (sum + epsv);
+ for (int src_hc = 0; src_hc < HC; ++src_hc) {
+ c[src_hc + dst_hc*HC] *= inv_denom;
+ }
+ }
+
+ for (int src_hc = 0; src_hc < HC; ++src_hc) {
+ float sum = 0.0f;
+ for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+ sum += c[src_hc + dst_hc*HC];
+ }
+
+ const float inv_denom = 1.0f / (sum + epsv);
+ for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+ c[src_hc + dst_hc*HC] *= inv_denom;
+ }
+ }
+ }
+
+ for (int i = 0; i < HC*HC; ++i) {
+ out[2*HC + i] = c[i];
+ }
+}
+
+kernel void kernel_dsv4_hc_expand(
+ constant ggml_metal_kargs_dsv4_hc_expand & args,
+ device const char * block_out,
+ device const char * residual,
+ device const char * post,
+ device const char * comb,
+ device char * dst,
+ uint gid [[thread_position_in_grid]]) {
+ const int64_t n_elem = args.n_embd * args.n_hc * args.n_tokens;
+ if ((int64_t) gid >= n_elem) {
+ return;
+ }
+
+ const int64_t d = ((int64_t) gid) % args.n_embd;
+ const int64_t tmp = ((int64_t) gid) / args.n_embd;
+ const int64_t dst_hc = tmp % args.n_hc;
+ const int64_t t = tmp / args.n_hc;
+
+ const float block_v = *((device const float *) (block_out + d*args.nb_block0 + t*args.nb_block1));
+ const float post_v = *((device const float *) (post + dst_hc*args.nb_post0 + t*args.nb_post1));
+
+ float acc = block_v * post_v;
+ for (int64_t src_hc = 0; src_hc < args.n_hc; ++src_hc) {
+ const float comb_v = *((device const float *) (comb + dst_hc*args.nb_comb0 + src_hc*args.nb_comb1 + t*args.nb_comb2));
+ const float res_v = *((device const float *) (residual + d*args.nb_res0 + src_hc*args.nb_res1 + t*args.nb_res2));
+ acc += comb_v * res_v;
+ }
+
+ *((device float *) (dst + d*args.nb0 + dst_hc*args.nb1 + t*args.nb2)) = acc;
+}
+
+kernel void kernel_dsv4_hc_weighted_sum(
+ constant ggml_metal_kargs_dsv4_hc_weighted_sum & args,
+ device const char * x,
+ device const char * weights,
+ device char * dst,
+ uint gid [[thread_position_in_grid]]) {
+ const int64_t n_elem = args.n_embd * args.n_tokens;
+ if ((int64_t) gid >= n_elem) {
+ return;
+ }
+
+ const int64_t d = ((int64_t) gid) % args.n_embd;
+ const int64_t t = ((int64_t) gid) / args.n_embd;
+
+ float acc = 0.0f;
+ for (int64_t h = 0; h < args.n_hc; ++h) {
+ const float xv = *((device const float *) (x + d*args.nb_x0 + h*args.nb_x1 + t*args.nb_x2));
+ const float wv = *((device const float *) (weights + h*args.nb_w0 + t*args.nb_w1));
+ acc += xv * wv;
+ }
+
+ *((device float *) (dst + d*args.nb0 + t*args.nb1)) = acc;
+}
+
+static inline float dsv4_e4m3fn_value(int i) {
+ const int exp = (i >> 3) & 0x0f;
+ const int mant = i & 0x07;
+ return exp == 0
+ ? float(mant) * 0.001953125f
+ : (1.0f + float(mant) * 0.125f) * exp2(float(exp - 7));
+}
+
+static inline float dsv4_e4m3fn_dequant(float x) {
+ const float sign = x < 0.0f ? -1.0f : 1.0f;
+ const float ax = min(abs(x), 448.0f);
+
+ int best = 0;
+ float best_diff = ax;
+ for (int i = 1; i < 127; ++i) {
+ const float val = dsv4_e4m3fn_value(i);
+ const float diff = abs(ax - val);
+ if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) {
+ best = i;
+ best_diff = diff;
+ }
+ }
+
+ return sign * dsv4_e4m3fn_value(best);
+}
+
+kernel void kernel_dsv4_fp8_kv_quantize_f32(
+ constant ggml_metal_kargs_dsv4_fp8_kv_quantize & args,
+ device const char * src0,
+ device char * dst,
+ threadgroup float * scratch [[threadgroup(0)]],
+ uint row [[threadgroup_position_in_grid]],
+ uint tid [[thread_position_in_threadgroup]]) {
+ const int64_t n_rows = args.ne01 * args.ne02 * args.ne03;
+ if ((int64_t) row >= n_rows) {
+ return;
+ }
+
+ const int64_t i1 = row % args.ne01;
+ const int64_t i2 = (row / args.ne01) % args.ne02;
+ const int64_t i3 = row / (args.ne01 * args.ne02);
+
+ device const char * src_base = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03;
+ device char * dst_base = dst + i1*args.nb1 + i2*args.nb2 + i3*args.nb3;
+
+ const int64_t n_nope = args.ne00 - args.n_rot;
+
+ for (int64_t off = 0; off < n_nope; off += 64) {
+ float v = 0.0f;
+ if (tid < 64) {
+ v = *((device const float *) (src_base + (off + tid)*args.nb00));
+ scratch[tid] = abs(v);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+
+ for (uint stride = 32; stride > 0; stride >>= 1) {
+ if (tid < stride) {
+ scratch[tid] = max(scratch[tid], scratch[tid + stride]);
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ }
+
+ const float amax = max(scratch[0], 1.0e-4f);
+ const float scale = exp2(ceil(log2(amax / 448.0f)));
+ if (tid < 64) {
+ const float q = dsv4_e4m3fn_dequant(clamp(v / scale, -448.0f, 448.0f)) * scale;
+ *((device float *) (dst_base + (off + tid)*args.nb0)) = q;
+ }
+ threadgroup_barrier(mem_flags::mem_threadgroup);
+ }
+
+ for (int64_t i = n_nope + tid; i < args.ne00; i += 64) {
+ *((device float *) (dst_base + i*args.nb0)) = *((device const float *) (src_base + i*args.nb00));
+ }
+}
+
// ref: ggml.c:ggml_compute_forward_ssm_conv_f32
kernel void kernel_ssm_conv_f32_f32(
constant ggml_metal_kargs_ssm_conv & args,
@@ -4632,6 +4934,95 @@ template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kerne
template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision;
template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision;
+kernel void kernel_dsv4_rope_tail_f32(
+ constant ggml_metal_kargs_dsv4_rope_tail & args,
+ device const char * src0,
+ device const char * src1,
+ device const char * src2,
+ device char * dst,
+ uint tid [[thread_index_in_threadgroup]],
+ ushort3 ntg [[threads_per_threadgroup]],
+ uint3 tgpig [[threadgroup_position_in_grid]]) {
+ const int i1 = tgpig[0];
+ const int i2 = tgpig[1];
+ const int i3 = tgpig[2];
+
+ const int n_nope = args.ne00 - args.n_dims;
+ if (n_nope < 0) {
+ return;
+ }
+
+ device const int32_t * pos = (device const int32_t *) src1;
+
+ float corr_dims[2];
+ rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
+
+ const float theta_base = (float) pos[i2];
+ const float inv_ndims = -1.f/args.n_dims;
+ const bool is_neox = args.mode == 2;
+
+ for (int i0 = tid; i0 < args.ne00; i0 += ntg.x) {
+ device const char * src_base = src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01;
+ device char * dst_base = dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1;
+
+ if (i0 < n_nope) {
+ *((device float *) (dst_base + i0*args.nb0)) = *((device const float *) (src_base + i0*args.nb00));
+ continue;
+ }
+
+ const int r = i0 - n_nope;
+ if (is_neox) {
+ const int n_half = args.n_dims/2;
+ if (r >= n_half) {
+ continue;
+ }
+
+ const int ic = r;
+ const int rel_i0 = 2*ic;
+ const float theta = theta_base * pow(args.freq_base, inv_ndims*rel_i0);
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
+
+ float cos_theta;
+ float sin_theta;
+ rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, rel_i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+ if (args.inverse) {
+ sin_theta = -sin_theta;
+ }
+
+ const int j0 = n_nope + ic;
+ const int j1 = n_nope + ic + n_half;
+ const float x0 = *((device const float *) (src_base + j0*args.nb00));
+ const float x1 = *((device const float *) (src_base + j1*args.nb00));
+
+ *((device float *) (dst_base + j0*args.nb0)) = x0*cos_theta - x1*sin_theta;
+ *((device float *) (dst_base + j1*args.nb0)) = x0*sin_theta + x1*cos_theta;
+ } else {
+ if ((r & 1) != 0) {
+ continue;
+ }
+
+ const int ic = r/2;
+ const float theta = theta_base * pow(args.freq_base, inv_ndims*r);
+ const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
+
+ float cos_theta;
+ float sin_theta;
+ rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, r, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+ if (args.inverse) {
+ sin_theta = -sin_theta;
+ }
+
+ const int j0 = n_nope + r;
+ const int j1 = j0 + 1;
+ const float x0 = *((device const float *) (src_base + j0*args.nb00));
+ const float x1 = *((device const float *) (src_base + j1*args.nb00));
+
+ *((device float *) (dst_base + j0*args.nb0)) = x0*cos_theta - x1*sin_theta;
+ *((device float *) (dst_base + j1*args.nb0)) = x0*sin_theta + x1*cos_theta;
+ }
+ }
+}
+
typedef void (im2col_t)(
constant ggml_metal_kargs_im2col & args,
device const float * x,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 476c30797956..8b06c0bd5a49 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1063,6 +1063,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"RWKV_WKV7",
"SOLVE_TRI",
"GATED_DELTA_NET",
+ "DSV4_HC_SPLIT_SINKHORN",
+ "DSV4_HC_WEIGHTED_SUM",
+ "DSV4_HC_EXPAND",
+ "DSV4_FP8_KV_QUANTIZE",
+ "DSV4_ROPE_TAIL",
"UNARY",
@@ -1080,7 +1085,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"GLU",
};
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@@ -1173,6 +1178,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"rwkv_wkv7(r, w, k, v, a, b, s)",
"A X = B, A triangular, solve X",
"gated_delta_net(q, k, v, g, beta, s)",
+ "dsv4_hc_split_sinkhorn(x)",
+ "dsv4_hc_weighted_sum(x)",
+ "dsv4_hc_expand(x)",
+ "dsv4_fp8_kv_quantize(x)",
+ "dsv4_rope_tail(x)",
"unary(x)",
@@ -1190,7 +1200,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"glu(x)",
};
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -6230,6 +6240,180 @@ struct ggml_tensor * ggml_gated_delta_net(
return result;
}
+// ggml_dsv4_hc_split_sinkhorn
+
+struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn(
+ struct ggml_context * ctx,
+ struct ggml_tensor * mixes,
+ struct ggml_tensor * scale,
+ struct ggml_tensor * base,
+ int n_hc,
+ int sinkhorn_iters,
+ float eps) {
+ GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+ GGML_ASSERT(scale->type == GGML_TYPE_F32);
+ GGML_ASSERT(base->type == GGML_TYPE_F32);
+
+ GGML_ASSERT(ggml_is_contiguous_rows(mixes));
+ GGML_ASSERT(ggml_is_contiguous(scale));
+ GGML_ASSERT(ggml_is_contiguous(base));
+
+ GGML_ASSERT(n_hc > 0);
+ GGML_ASSERT(n_hc <= 16); // CPU forward uses a fixed float c[16*16] scratch
+ GGML_ASSERT(sinkhorn_iters > 0);
+ GGML_ASSERT(mixes->ne[0] == (2 + n_hc) * n_hc);
+ GGML_ASSERT(mixes->ne[2] == 1);
+ GGML_ASSERT(mixes->ne[3] == 1);
+ GGML_ASSERT(ggml_nelements(scale) >= 3);
+ GGML_ASSERT(ggml_nelements(base) >= mixes->ne[0]);
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, mixes);
+
+ ggml_set_op_params_i32(result, 0, n_hc);
+ ggml_set_op_params_i32(result, 1, sinkhorn_iters);
+ ggml_set_op_params_f32(result, 2, eps);
+
+ result->op = GGML_OP_DSV4_HC_SPLIT_SINKHORN;
+ result->src[0] = mixes;
+ result->src[1] = scale;
+ result->src[2] = base;
+
+ return result;
+}
+
+// ggml_dsv4_hc_weighted_sum
+
+struct ggml_tensor * ggml_dsv4_hc_weighted_sum(
+ struct ggml_context * ctx,
+ struct ggml_tensor * x,
+ struct ggml_tensor * weights) {
+ GGML_ASSERT(x->type == GGML_TYPE_F32);
+ GGML_ASSERT(weights->type == GGML_TYPE_F32);
+
+ GGML_ASSERT(x->ne[1] == weights->ne[0]);
+ GGML_ASSERT(x->ne[2] == weights->ne[1]);
+ GGML_ASSERT(x->ne[3] == 1);
+ GGML_ASSERT(weights->ne[2] == 1);
+ GGML_ASSERT(weights->ne[3] == 1);
+
+ struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, x->ne[0], x->ne[2]);
+
+ result->op = GGML_OP_DSV4_HC_WEIGHTED_SUM;
+ result->src[0] = x;
+ result->src[1] = weights;
+
+ return result;
+}
+
+// ggml_dsv4_hc_expand
+
+struct ggml_tensor * ggml_dsv4_hc_expand(
+ struct ggml_context * ctx,
+ struct ggml_tensor * block_out,
+ struct ggml_tensor * residual,
+ struct ggml_tensor * post,
+ struct ggml_tensor * comb) {
+ GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+ GGML_ASSERT(residual->type == GGML_TYPE_F32);
+ GGML_ASSERT(post->type == GGML_TYPE_F32);
+ GGML_ASSERT(comb->type == GGML_TYPE_F32);
+
+ GGML_ASSERT(block_out->ne[0] == residual->ne[0]);
+ GGML_ASSERT(block_out->ne[1] == residual->ne[2]);
+ GGML_ASSERT(block_out->ne[2] == 1);
+ GGML_ASSERT(block_out->ne[3] == 1);
+ GGML_ASSERT(post->ne[0] == residual->ne[1]);
+ GGML_ASSERT(post->ne[1] == residual->ne[2]);
+ GGML_ASSERT(post->ne[2] == 1);
+ GGML_ASSERT(post->ne[3] == 1);
+ GGML_ASSERT(comb->ne[0] == residual->ne[1]);
+ GGML_ASSERT(comb->ne[1] == residual->ne[1]);
+ GGML_ASSERT(comb->ne[2] == residual->ne[2]);
+ GGML_ASSERT(comb->ne[3] == 1);
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, residual);
+
+ result->op = GGML_OP_DSV4_HC_EXPAND;
+ result->src[0] = block_out;
+ result->src[1] = residual;
+ result->src[2] = post;
+ result->src[3] = comb;
+
+ return result;
+}
+
+// ggml_dsv4_fp8_kv_quantize
+
+struct ggml_tensor * ggml_dsv4_fp8_kv_quantize(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ int n_rot) {
+ GGML_ASSERT(a->type == GGML_TYPE_F32);
+ GGML_ASSERT(n_rot >= 0);
+ GGML_ASSERT(a->ne[0] > n_rot);
+ GGML_ASSERT((a->ne[0] - n_rot) % 64 == 0);
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+ ggml_set_op_params_i32(result, 0, n_rot);
+
+ result->op = GGML_OP_DSV4_FP8_KV_QUANTIZE;
+ result->src[0] = a;
+
+ return result;
+}
+
+// ggml_dsv4_rope_tail
+
+struct ggml_tensor * ggml_dsv4_rope_tail(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * pos,
+ struct ggml_tensor * freq_factors,
+ int n_dims,
+ int mode,
+ int n_ctx_orig,
+ float freq_base,
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow,
+ bool inverse) {
+ GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+ GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX);
+ GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
+ GGML_ASSERT(pos->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_is_vector(pos));
+ GGML_ASSERT(a->ne[2] == pos->ne[0]);
+ GGML_ASSERT(n_dims > 0);
+ GGML_ASSERT(n_dims <= a->ne[0]);
+ GGML_ASSERT(n_dims % 2 == 0);
+
+ if (freq_factors) {
+ GGML_ASSERT(freq_factors->type == GGML_TYPE_F32);
+ GGML_ASSERT(freq_factors->ne[0] >= n_dims / 2);
+ }
+
+ struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+ int32_t params[16] = { n_dims, mode, n_ctx_orig, inverse ? 1 : 0 };
+ memcpy(params + 4, &freq_base, sizeof(float));
+ memcpy(params + 5, &freq_scale, sizeof(float));
+ memcpy(params + 6, &ext_factor, sizeof(float));
+ memcpy(params + 7, &attn_factor, sizeof(float));
+ memcpy(params + 8, &beta_fast, sizeof(float));
+ memcpy(params + 9, &beta_slow, sizeof(float));
+ ggml_set_op_params(result, params, sizeof(params));
+
+ result->op = GGML_OP_DSV4_ROPE_TAIL;
+ result->src[0] = a;
+ result->src[1] = pos;
+ result->src[2] = freq_factors;
+
+ return result;
+}
+
////////////////////////////////////////////////////////////////////////////////
struct ggml_hash_set ggml_hash_set_new(size_t size) {
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c25f217f990e..8f44c7965e87 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -145,6 +145,10 @@ class LLM:
INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
FULL_ATTENTION_INTERVAL = "{arch}.full_attention_interval"
ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
+ HASH_LAYER_COUNT = "{arch}.hash_layer_count"
+ HYPER_CONNECTION_COUNT = "{arch}.hyper_connection.count"
+ HYPER_CONNECTION_SINKHORN_ITERS = "{arch}.hyper_connection.sinkhorn_iterations"
+ HYPER_CONNECTION_EPS = "{arch}.hyper_connection.epsilon"
ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
@@ -184,6 +188,10 @@ class Attention:
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
TEMPERATURE_SCALE = "{arch}.attention.temperature_scale"
+ COMPRESS_RATIOS = "{arch}.attention.compress_ratios"
+ COMPRESS_ROPE_FREQ_BASE = "{arch}.attention.compress_rope_freq_base"
+ OUTPUT_LORA_RANK = "{arch}.attention.output_lora_rank"
+ OUTPUT_GROUP_COUNT = "{arch}.attention.output_group_count"
class Indexer:
HEAD_COUNT = "{arch}.attention.indexer.head_count"
@@ -451,6 +459,7 @@ class MODEL_ARCH(IntEnum):
DEEPSEEK = auto()
DEEPSEEK2 = auto()
DEEPSEEK2OCR = auto()
+ DEEPSEEK4 = auto()
CHATGLM = auto()
GLM4 = auto()
GLM4_MOE = auto()
@@ -527,6 +536,9 @@ class MODEL_TENSOR(IntEnum):
TOKEN_TYPES = auto()
POS_EMBD = auto()
OUTPUT = auto()
+ OUTPUT_HC_BASE = auto() # deepseek4 hyper-connection output
+ OUTPUT_HC_FN = auto() # deepseek4 hyper-connection output
+ OUTPUT_HC_SCALE = auto() # deepseek4 hyper-connection output
DENSE_2_OUT = auto() # embeddinggemma 2_Dense
DENSE_3_OUT = auto() # embeddinggemma 3_Dense
OUTPUT_NORM = auto()
@@ -650,12 +662,19 @@ class MODEL_TENSOR(IntEnum):
CHANNEL_MIX_VALUE = auto()
ATTN_Q_A = auto()
ATTN_Q_B = auto()
+ ATTN_KV = auto() # deepseek4 single-tensor combined KV projection
ATTN_KV_A_MQA = auto()
ATTN_KV_B = auto()
ATTN_K_B = auto()
ATTN_V_B = auto()
+ ATTN_OUT_A = auto() # deepseek4 attention output LoRA
+ ATTN_OUT_B = auto() # deepseek4 attention output LoRA
ATTN_Q_A_NORM = auto()
ATTN_KV_A_NORM = auto()
+ ATTN_COMPRESSOR_APE = auto() # deepseek4 attention compressor
+ ATTN_COMPRESSOR_KV = auto() # deepseek4 attention compressor
+ ATTN_COMPRESSOR_GATE = auto() # deepseek4 attention compressor
+ ATTN_COMPRESSOR_NORM = auto() # deepseek4 attention compressor
FFN_SUB_NORM = auto()
ATTN_SUB_NORM = auto()
DEC_ATTN_NORM = auto()
@@ -717,6 +736,17 @@ class MODEL_TENSOR(IntEnum):
INDEXER_PROJ = auto()
INDEXER_ATTN_K = auto()
INDEXER_ATTN_Q_B = auto()
+ INDEXER_COMPRESSOR_APE = auto() # deepseek4 indexer compressor
+ INDEXER_COMPRESSOR_KV = auto() # deepseek4 indexer compressor
+ INDEXER_COMPRESSOR_GATE = auto() # deepseek4 indexer compressor
+ INDEXER_COMPRESSOR_NORM = auto() # deepseek4 indexer compressor
+ HC_ATTN_BASE = auto() # deepseek4 hyper-connection attention
+ HC_ATTN_FN = auto() # deepseek4 hyper-connection attention
+ HC_ATTN_SCALE = auto() # deepseek4 hyper-connection attention
+ HC_FFN_BASE = auto() # deepseek4 hyper-connection ffn
+ HC_FFN_FN = auto() # deepseek4 hyper-connection ffn
+ HC_FFN_SCALE = auto() # deepseek4 hyper-connection ffn
+ FFN_GATE_TID2EID = auto() # deepseek4 token-id-to-expert-id gating
# vision
V_MMPROJ = auto()
V_MMPROJ_FC = auto()
@@ -966,6 +996,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.DEEPSEEK: "deepseek",
MODEL_ARCH.DEEPSEEK2: "deepseek2",
MODEL_ARCH.DEEPSEEK2OCR: "deepseek2-ocr",
+ MODEL_ARCH.DEEPSEEK4: "deepseek4",
MODEL_ARCH.CHATGLM: "chatglm",
MODEL_ARCH.GLM4: "glm4",
MODEL_ARCH.GLM4_MOE: "glm4moe",
@@ -1042,6 +1073,9 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.POS_EMBD: "position_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
+ MODEL_TENSOR.OUTPUT_HC_BASE: "output_hc_base",
+ MODEL_TENSOR.OUTPUT_HC_FN: "output_hc_fn",
+ MODEL_TENSOR.OUTPUT_HC_SCALE: "output_hc_scale",
MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense
MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
@@ -1164,12 +1198,19 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
+ MODEL_TENSOR.ATTN_KV: "blk.{bid}.attn_kv",
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b",
MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b",
+ MODEL_TENSOR.ATTN_OUT_A: "blk.{bid}.attn_output_a",
+ MODEL_TENSOR.ATTN_OUT_B: "blk.{bid}.attn_output_b",
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
+ MODEL_TENSOR.ATTN_COMPRESSOR_APE: "blk.{bid}.attn_compressor_ape",
+ MODEL_TENSOR.ATTN_COMPRESSOR_KV: "blk.{bid}.attn_compressor_kv",
+ MODEL_TENSOR.ATTN_COMPRESSOR_GATE: "blk.{bid}.attn_compressor_gate",
+ MODEL_TENSOR.ATTN_COMPRESSOR_NORM: "blk.{bid}.attn_compressor_norm",
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
@@ -1231,6 +1272,17 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.INDEXER_PROJ: "blk.{bid}.indexer.proj",
MODEL_TENSOR.INDEXER_ATTN_K: "blk.{bid}.indexer.attn_k",
MODEL_TENSOR.INDEXER_ATTN_Q_B: "blk.{bid}.indexer.attn_q_b",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_APE: "blk.{bid}.indexer_compressor_ape",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_KV: "blk.{bid}.indexer_compressor_kv",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_GATE: "blk.{bid}.indexer_compressor_gate",
+ MODEL_TENSOR.INDEXER_COMPRESSOR_NORM: "blk.{bid}.indexer_compressor_norm",
+ MODEL_TENSOR.HC_ATTN_BASE: "blk.{bid}.hc_attn_base",
+ MODEL_TENSOR.HC_ATTN_FN: "blk.{bid}.hc_attn_fn",
+ MODEL_TENSOR.HC_ATTN_SCALE: "blk.{bid}.hc_attn_scale",
+ MODEL_TENSOR.HC_FFN_BASE: "blk.{bid}.hc_ffn_base",
+ MODEL_TENSOR.HC_FFN_FN: "blk.{bid}.hc_ffn_fn",
+ MODEL_TENSOR.HC_FFN_SCALE: "blk.{bid}.hc_ffn_scale",
+ MODEL_TENSOR.FFN_GATE_TID2EID: "blk.{bid}.ffn_gate_tid2eid",
# vision
MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
@@ -2928,6 +2980,49 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
],
+ MODEL_ARCH.DEEPSEEK4: [
+ MODEL_TENSOR.TOKEN_EMBD,
+ MODEL_TENSOR.OUTPUT_NORM,
+ MODEL_TENSOR.OUTPUT,
+ MODEL_TENSOR.OUTPUT_HC_BASE,
+ MODEL_TENSOR.OUTPUT_HC_FN,
+ MODEL_TENSOR.OUTPUT_HC_SCALE,
+ MODEL_TENSOR.ATTN_NORM,
+ MODEL_TENSOR.ATTN_SINKS,
+ MODEL_TENSOR.ATTN_Q_A,
+ MODEL_TENSOR.ATTN_Q_B,
+ MODEL_TENSOR.ATTN_Q_A_NORM,
+ MODEL_TENSOR.ATTN_KV,
+ MODEL_TENSOR.ATTN_KV_A_NORM,
+ MODEL_TENSOR.ATTN_OUT_A,
+ MODEL_TENSOR.ATTN_OUT_B,
+ MODEL_TENSOR.ATTN_COMPRESSOR_APE,
+ MODEL_TENSOR.ATTN_COMPRESSOR_KV,
+ MODEL_TENSOR.ATTN_COMPRESSOR_GATE,
+ MODEL_TENSOR.ATTN_COMPRESSOR_NORM,
+ MODEL_TENSOR.INDEXER_PROJ,
+ MODEL_TENSOR.INDEXER_ATTN_Q_B,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_APE,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_KV,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_GATE,
+ MODEL_TENSOR.INDEXER_COMPRESSOR_NORM,
+ MODEL_TENSOR.FFN_GATE_INP,
+ MODEL_TENSOR.FFN_NORM,
+ MODEL_TENSOR.FFN_GATE_EXP,
+ MODEL_TENSOR.FFN_DOWN_EXP,
+ MODEL_TENSOR.FFN_UP_EXP,
+ MODEL_TENSOR.FFN_GATE_SHEXP,
+ MODEL_TENSOR.FFN_DOWN_SHEXP,
+ MODEL_TENSOR.FFN_UP_SHEXP,
+ MODEL_TENSOR.FFN_EXP_PROBS_B,
+ MODEL_TENSOR.FFN_GATE_TID2EID,
+ MODEL_TENSOR.HC_ATTN_BASE,
+ MODEL_TENSOR.HC_ATTN_FN,
+ MODEL_TENSOR.HC_ATTN_SCALE,
+ MODEL_TENSOR.HC_FFN_BASE,
+ MODEL_TENSOR.HC_FFN_FN,
+ MODEL_TENSOR.HC_FFN_SCALE,
+ ],
MODEL_ARCH.ERNIE4_5_MOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
@@ -4147,6 +4242,8 @@ class GGMLQuantizationType(IntEnum):
class ExpertGatingFuncType(IntEnum):
SOFTMAX = 1
SIGMOID = 2
+ SOFTMAX_WEIGHT = 3
+ SQRTSOFTPLUS = 4
# TODO: add GGMLFileType from ggml_ftype in ggml.h
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index a101382719d0..4e3c94de12b9 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -868,6 +868,18 @@ def add_moe_latent_size(self, value: int) -> None:
def add_nextn_predict_layers(self, count: int) -> None:
self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
+ def add_hash_layer_count(self, count: int) -> None:
+ self.add_uint32(Keys.LLM.HASH_LAYER_COUNT.format(arch=self.arch), count)
+
+ def add_hyper_connection_count(self, count: int) -> None:
+ self.add_uint32(Keys.LLM.HYPER_CONNECTION_COUNT.format(arch=self.arch), count)
+
+ def add_hyper_connection_sinkhorn_iters(self, count: int) -> None:
+ self.add_uint32(Keys.LLM.HYPER_CONNECTION_SINKHORN_ITERS.format(arch=self.arch), count)
+
+ def add_hyper_connection_eps(self, value: float) -> None:
+ self.add_float32(Keys.LLM.HYPER_CONNECTION_EPS.format(arch=self.arch), value)
+
def add_swin_norm(self, value: bool) -> None:
self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
@@ -952,6 +964,18 @@ def add_attn_temperature_length(self, value: int) -> None:
def add_attn_temperature_scale(self, value: float) -> None:
self.add_float32(Keys.Attention.TEMPERATURE_SCALE.format(arch=self.arch), value)
+ def add_attention_compress_ratios(self, values: Sequence[int]) -> None:
+ self.add_array(Keys.Attention.COMPRESS_RATIOS.format(arch=self.arch), values)
+
+ def add_attention_compress_rope_freq_base(self, value: float) -> None:
+ self.add_float32(Keys.Attention.COMPRESS_ROPE_FREQ_BASE.format(arch=self.arch), value)
+
+ def add_attention_output_lora_rank(self, value: int) -> None:
+ self.add_uint32(Keys.Attention.OUTPUT_LORA_RANK.format(arch=self.arch), value)
+
+ def add_attention_output_group_count(self, value: int) -> None:
+ self.add_uint32(Keys.Attention.OUTPUT_GROUP_COUNT.format(arch=self.arch), value)
+
def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
diff --git a/models/templates/deepseek-ai-DeepSeek-V4.jinja b/models/templates/deepseek-ai-DeepSeek-V4.jinja
new file mode 100644
index 000000000000..44d5b785ec04
--- /dev/null
+++ b/models/templates/deepseek-ai-DeepSeek-V4.jinja
@@ -0,0 +1,96 @@
+{%- if not add_generation_prompt is defined -%}
+ {%- set add_generation_prompt = false -%}
+{%- endif -%}
+{%- if not thinking is defined -%}
+ {%- if enable_thinking is defined -%}
+ {%- set thinking = enable_thinking -%}
+ {%- else -%}
+ {%- set thinking = false -%}
+ {%- endif -%}
+{%- endif -%}
+{%- set dsml_token = '|DSML|' -%}
+{%- set thinking_start_token = '' -%}
+{%- set thinking_end_token = '' -%}
+{%- set tools_header = '## Tools\n\nYou have access to a set of tools to help answer the user question. You can invoke tools by writing a "<' + dsml_token + 'tool_calls>" block like the following:\n\n<' + dsml_token + 'tool_calls>\n<' + dsml_token + 'invoke name="$TOOL_NAME">\n<' + dsml_token + 'parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE' + dsml_token + 'parameter>\n...\n' + dsml_token + 'invoke>\n<' + dsml_token + 'invoke name="$TOOL_NAME2">\n...\n' + dsml_token + 'invoke>\n' + dsml_token + 'tool_calls>\n\nString parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string="false"`.\n\nIf thinking_mode is enabled (triggered by ' + thinking_start_token + '), you MUST output your complete reasoning inside ' + thinking_start_token + '...' + thinking_end_token + ' BEFORE any tool calls or final response.\n\nOtherwise, output directly after ' + thinking_end_token + ' with tool calls or final response.\n\n### Available Tool Schemas\n\n' -%}
+{%- set tools_footer = '\n\nYou MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.' -%}
+{%- set ns = namespace(system_prompt='', is_first_system=true, pending_assistant=false, pending_tool_result=false) -%}
+{%- for message in messages -%}
+ {%- if message['role'] == 'system' -%}
+ {%- if ns.is_first_system -%}
+ {%- set ns.system_prompt = ns.system_prompt + (message['content'] or '') -%}
+ {%- set ns.is_first_system = false -%}
+ {%- else -%}
+ {%- set ns.system_prompt = ns.system_prompt + '\n\n' + (message['content'] or '') -%}
+ {%- endif -%}
+ {%- endif -%}
+{%- endfor -%}
+{%- if tools is defined and tools -%}
+ {%- set ts = namespace(schemas='') -%}
+ {%- for tool in tools -%}
+ {%- if tool['type'] == 'function' -%}
+ {%- set ts.schemas = ts.schemas + (tool['function'] | tojson) + '\n' -%}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- if ns.system_prompt -%}
+ {%- set ns.system_prompt = ns.system_prompt + '\n\n' + tools_header + ts.schemas + tools_footer -%}
+ {%- else -%}
+ {%- set ns.system_prompt = tools_header + ts.schemas + tools_footer -%}
+ {%- endif -%}
+{%- endif -%}
+{{- bos_token -}}
+{{- ns.system_prompt -}}
+{%- for message in messages -%}
+ {%- if message['role'] == 'user' -%}
+ {{- '<|User|>' + (message['content'] or '') -}}
+ {%- set ns.pending_assistant = true -%}
+ {%- set ns.pending_tool_result = true -%}
+ {%- elif message['role'] == 'tool' -%}
+ {%- if not ns.pending_tool_result -%}
+ {{- '<|User|>' -}}
+ {%- endif -%}
+ {{- '' + (message['content'] or '') + '' -}}
+ {%- set ns.pending_assistant = true -%}
+ {%- set ns.pending_tool_result = true -%}
+ {%- elif message['role'] == 'assistant' -%}
+ {%- if ns.pending_assistant -%}
+ {{- '<|Assistant|>' -}}
+ {%- if thinking and message['reasoning_content'] is defined and message['reasoning_content'] -%}
+ {{- thinking_start_token + message['reasoning_content'] + thinking_end_token -}}
+ {%- else -%}
+ {{- thinking_end_token -}}
+ {%- endif -%}
+ {%- endif -%}
+ {{- (message['content'] or '') -}}
+ {%- if message['tool_calls'] -%}
+ {{- '\n\n<' + dsml_token + 'tool_calls>\n' -}}
+ {%- for tool in message['tool_calls'] -%}
+ {%- set func = tool['function'] -%}
+ {{- '<' + dsml_token + 'invoke name="' + func['name'] + '">\n' -}}
+ {%- set args = func['arguments'] -%}
+ {%- if args is string -%}
+ {%- set args = args | from_json -%}
+ {%- endif -%}
+ {%- for key, val in args.items() -%}
+ {%- if val is string -%}
+ {{- '<' + dsml_token + 'parameter name="' + key + '" string="true">' + val + '' + dsml_token + 'parameter>\n' -}}
+ {%- else -%}
+ {{- '<' + dsml_token + 'parameter name="' + key + '" string="false">' + (val | tojson) + '' + dsml_token + 'parameter>\n' -}}
+ {%- endif -%}
+ {%- endfor -%}
+ {{- '' + dsml_token + 'invoke>\n' -}}
+ {%- endfor -%}
+ {{- '' + dsml_token + 'tool_calls>' -}}
+ {%- endif -%}
+ {{- '<|end▁of▁sentence|>' -}}
+ {%- set ns.pending_assistant = false -%}
+ {%- set ns.pending_tool_result = false -%}
+ {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt and ns.pending_assistant -%}
+ {{- '<|Assistant|>' -}}
+ {%- if thinking -%}
+ {{- thinking_start_token -}}
+ {%- else -%}
+ {{- thinking_end_token -}}
+ {%- endif -%}
+{%- endif -%}
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c9eead18aa39..e789e5a681ae 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -75,6 +75,7 @@ static const std::map LLM_ARCH_NAMES = {
{ LLM_ARCH_DEEPSEEK, "deepseek" },
{ LLM_ARCH_DEEPSEEK2, "deepseek2" },
{ LLM_ARCH_DEEPSEEK2OCR, "deepseek2-ocr" },
+ { LLM_ARCH_DEEPSEEK4, "deepseek4" },
{ LLM_ARCH_CHATGLM, "chatglm" },
{ LLM_ARCH_GLM4, "glm4" },
{ LLM_ARCH_GLM4_MOE, "glm4moe" },
@@ -209,6 +210,10 @@ static const std::map LLM_KV_NAMES = {
{ LLM_KV_TOKEN_SHIFT_COUNT, "%s.token_shift_count" },
{ LLM_KV_INTERLEAVE_MOE_LAYER_STEP, "%s.interleave_moe_layer_step" },
{ LLM_KV_FULL_ATTENTION_INTERVAL, "%s.full_attention_interval" },
+ { LLM_KV_HASH_LAYER_COUNT, "%s.hash_layer_count" },
+ { LLM_KV_HYPER_CONNECTION_COUNT, "%s.hyper_connection.count" },
+ { LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS, "%s.hyper_connection.sinkhorn_iterations" },
+ { LLM_KV_HYPER_CONNECTION_EPS, "%s.hyper_connection.epsilon" },
{ LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
{ LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -243,6 +248,10 @@ static const std::map LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_INDEXER_KEY_LENGTH, "%s.attention.indexer.key_length" },
{ LLM_KV_ATTENTION_INDEXER_TOP_K, "%s.attention.indexer.top_k" },
{ LLM_KV_ATTENTION_SHARED_KV_LAYERS, "%s.attention.shared_kv_layers" },
+ { LLM_KV_ATTENTION_COMPRESS_RATIOS, "%s.attention.compress_ratios" },
+ { LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE, "%s.attention.compress_rope_freq_base" },
+ { LLM_KV_ATTENTION_OUTPUT_LORA_RANK, "%s.attention.output_lora_rank" },
+ { LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT, "%s.attention.output_group_count" },
{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_DIMENSION_COUNT_SWA, "%s.rope.dimension_count_swa" },
@@ -346,6 +355,9 @@ static const std::map LLM_TENSOR_NAMES = {
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT_NORM_LFM2, "token_embd_norm" }, // fix for wrong tensor name
{ LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_OUTPUT_HC_BASE, "output_hc_base" },
+ { LLM_TENSOR_OUTPUT_HC_FN, "output_hc_fn" },
+ { LLM_TENSOR_OUTPUT_HC_SCALE, "output_hc_scale" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
@@ -422,8 +434,15 @@ static const std::map LLM_TENSOR_NAMES = {
{ LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
{ LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
{ LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
+ { LLM_TENSOR_ATTN_KV, "blk.%d.attn_kv" },
{ LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
{ LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
+ { LLM_TENSOR_ATTN_OUT_A, "blk.%d.attn_output_a" },
+ { LLM_TENSOR_ATTN_OUT_B, "blk.%d.attn_output_b" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_APE, "blk.%d.attn_compressor_ape" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_KV, "blk.%d.attn_compressor_kv" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_GATE, "blk.%d.attn_compressor_gate" },
+ { LLM_TENSOR_ATTN_COMPRESSOR_NORM, "blk.%d.attn_compressor_norm" },
{ LLM_TENSOR_PER_LAYER_TOKEN_EMBD, "per_layer_token_embd" },
{ LLM_TENSOR_PER_LAYER_MODEL_PROJ, "per_layer_model_proj" },
{ LLM_TENSOR_PER_LAYER_PROJ_NORM, "per_layer_proj_norm" },
@@ -548,6 +567,17 @@ static const std::map LLM_TENSOR_NAMES = {
{ LLM_TENSOR_INDEXER_PROJ, "blk.%d.indexer.proj" },
{ LLM_TENSOR_INDEXER_ATTN_K, "blk.%d.indexer.attn_k" },
{ LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_APE, "blk.%d.indexer_compressor_ape" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_KV, "blk.%d.indexer_compressor_kv" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_GATE, "blk.%d.indexer_compressor_gate" },
+ { LLM_TENSOR_INDEXER_COMPRESSOR_NORM, "blk.%d.indexer_compressor_norm" },
+ { LLM_TENSOR_HC_ATTN_BASE, "blk.%d.hc_attn_base" },
+ { LLM_TENSOR_HC_ATTN_FN, "blk.%d.hc_attn_fn" },
+ { LLM_TENSOR_HC_ATTN_SCALE, "blk.%d.hc_attn_scale" },
+ { LLM_TENSOR_HC_FFN_BASE, "blk.%d.hc_ffn_base" },
+ { LLM_TENSOR_HC_FFN_FN, "blk.%d.hc_ffn_fn" },
+ { LLM_TENSOR_HC_FFN_SCALE, "blk.%d.hc_ffn_scale" },
+ { LLM_TENSOR_FFN_GATE_TID2EID, "blk.%d.ffn_gate_tid2eid" },
};
// declare information about the model weight tensors:
@@ -566,6 +596,9 @@ static const std::map LLM_TENSOR_INFOS = {
{LLM_TENSOR_TOKEN_TYPES, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
{LLM_TENSOR_TOKEN_EMBD_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // do the norms on the first layer (not the input layer)
{LLM_TENSOR_OUTPUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_OUTPUT_HC_BASE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}},
+ {LLM_TENSOR_OUTPUT_HC_FN, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_OUTPUT_HC_SCALE, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_SCALE}},
{LLM_TENSOR_CLS, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CLS_OUT, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}},
{LLM_TENSOR_CLS_NORM, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}},
@@ -592,10 +625,15 @@ static const std::map LLM_TENSOR_INFOS = {
{LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_OUT_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_OUT_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_SINKS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
{LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -757,6 +795,19 @@ static const std::map LLM_TENSOR_INFOS = {
{LLM_TENSOR_INDEXER_PROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_INDEXER_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_INDEXER_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_KV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+ {LLM_TENSOR_HC_ATTN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_HC_ATTN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_HC_ATTN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
+ {LLM_TENSOR_HC_FFN_BASE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_HC_FFN_FN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+ {LLM_TENSOR_HC_FFN_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
+ {LLM_TENSOR_FFN_GATE_TID2EID, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+ {LLM_TENSOR_ATTN_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+ {LLM_TENSOR_INDEXER_COMPRESSOR_APE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
// NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
// last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
// the model loader doesn't fault on the block index.
@@ -902,6 +953,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
case LLM_ARCH_OLMO2:
case LLM_ARCH_OLMOE:
case LLM_ARCH_DEEPSEEK2:
+ case LLM_ARCH_DEEPSEEK4:
case LLM_ARCH_GLM_DSA:
case LLM_ARCH_BITNET:
case LLM_ARCH_T5:
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 89cf16cc37cf..a1dcb037c7a2 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -79,6 +79,7 @@ enum llm_arch {
LLM_ARCH_DEEPSEEK,
LLM_ARCH_DEEPSEEK2,
LLM_ARCH_DEEPSEEK2OCR,
+ LLM_ARCH_DEEPSEEK4,
LLM_ARCH_CHATGLM,
LLM_ARCH_GLM4,
LLM_ARCH_GLM4_MOE,
@@ -213,6 +214,10 @@ enum llm_kv {
LLM_KV_TOKEN_SHIFT_COUNT,
LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
LLM_KV_FULL_ATTENTION_INTERVAL,
+ LLM_KV_HASH_LAYER_COUNT,
+ LLM_KV_HYPER_CONNECTION_COUNT,
+ LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS,
+ LLM_KV_HYPER_CONNECTION_EPS,
LLM_KV_ATTENTION_HEAD_COUNT,
LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -247,6 +252,10 @@ enum llm_kv {
LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
LLM_KV_ATTENTION_INDEXER_TOP_K,
LLM_KV_ATTENTION_SHARED_KV_LAYERS,
+ LLM_KV_ATTENTION_COMPRESS_RATIOS,
+ LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE,
+ LLM_KV_ATTENTION_OUTPUT_LORA_RANK,
+ LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_DIMENSION_COUNT_SWA,
@@ -354,6 +363,9 @@ enum llm_tensor {
LLM_TENSOR_DENSE_2_OUT,
LLM_TENSOR_DENSE_3_OUT,
LLM_TENSOR_OUTPUT,
+ LLM_TENSOR_OUTPUT_HC_BASE,
+ LLM_TENSOR_OUTPUT_HC_FN,
+ LLM_TENSOR_OUTPUT_HC_SCALE,
LLM_TENSOR_OUTPUT_NORM,
LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
LLM_TENSOR_ROPE_FREQS,
@@ -482,12 +494,19 @@ enum llm_tensor {
LLM_TENSOR_CHANNEL_MIX_VALUE,
LLM_TENSOR_ATTN_Q_A,
LLM_TENSOR_ATTN_Q_B,
+ LLM_TENSOR_ATTN_KV,
LLM_TENSOR_ATTN_KV_A_MQA,
LLM_TENSOR_ATTN_KV_B,
LLM_TENSOR_ATTN_K_B,
LLM_TENSOR_ATTN_V_B,
+ LLM_TENSOR_ATTN_OUT_A,
+ LLM_TENSOR_ATTN_OUT_B,
LLM_TENSOR_ATTN_Q_A_NORM,
LLM_TENSOR_ATTN_KV_A_NORM,
+ LLM_TENSOR_ATTN_COMPRESSOR_APE,
+ LLM_TENSOR_ATTN_COMPRESSOR_KV,
+ LLM_TENSOR_ATTN_COMPRESSOR_GATE,
+ LLM_TENSOR_ATTN_COMPRESSOR_NORM,
LLM_TENSOR_ATTN_SUB_NORM,
LLM_TENSOR_FFN_SUB_NORM,
LLM_TENSOR_DEC_ATTN_NORM,
@@ -549,6 +568,17 @@ enum llm_tensor {
LLM_TENSOR_INDEXER_PROJ,
LLM_TENSOR_INDEXER_ATTN_K,
LLM_TENSOR_INDEXER_ATTN_Q_B,
+ LLM_TENSOR_INDEXER_COMPRESSOR_APE,
+ LLM_TENSOR_INDEXER_COMPRESSOR_KV,
+ LLM_TENSOR_INDEXER_COMPRESSOR_GATE,
+ LLM_TENSOR_INDEXER_COMPRESSOR_NORM,
+ LLM_TENSOR_HC_ATTN_BASE,
+ LLM_TENSOR_HC_ATTN_FN,
+ LLM_TENSOR_HC_ATTN_SCALE,
+ LLM_TENSOR_HC_FFN_BASE,
+ LLM_TENSOR_HC_FFN_FN,
+ LLM_TENSOR_HC_FFN_SCALE,
+ LLM_TENSOR_FFN_GATE_TID2EID,
LLM_TENSOR_NEXTN_EH_PROJ,
LLM_TENSOR_NEXTN_EMBED_TOKENS,
LLM_TENSOR_NEXTN_ENORM,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index d62abc4009b8..b1b6aa350735 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -420,7 +420,7 @@ void llama_context::sched_reserve() {
const int64_t t_start_us = ggml_time_us();
- const uint32_t n_seqs = cparams.n_seq_max;
+ const uint32_t n_seqs = model.arch == LLM_ARCH_DEEPSEEK4 ? 1 : cparams.n_seq_max;
const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
const size_t max_nodes = this->graph_max_nodes(n_tokens);
@@ -596,6 +596,22 @@ void llama_context::sched_reserve() {
n_nodes_pp = ggml_graph_n_nodes(gf);
}
+ // DeepSeek V4 resumed-prompt chunks use the compressed-attention decode
+ // graph, which is larger than the position-zero prefill graph.
+ if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) {
+ const llama_pos reserve_pos0 = std::min(
+ cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens,
+ std::max(cparams.n_batch, 8u*n_tokens));
+ auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+ model.hparams.no_alloc, nullptr, reserve_pos0);
+ if (!gf) {
+ throw std::runtime_error("failed to allocate DeepSeek V4 resumed pp buffers");
+ }
+
+ n_splits_pp = std::max(n_splits_pp, ggml_backend_sched_get_n_splits(sched.get()));
+ n_nodes_pp = std::max(n_nodes_pp, ggml_graph_n_nodes(gf));
+ }
+
// reserve with tg (token generation) graph to get the number of splits and nodes
{
auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
@@ -2171,6 +2187,15 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
return std::max(n_tokens * 40, 32u * model.n_tensors());
}
+ if (model.arch == LLM_ARCH_DEEPSEEK4) {
+ // DeepSeek V4 has a position-dependent compressed-attention decode path
+ // that creates many temporary tensor objects, especially when a long
+ // prompt is split into non-prefill ubatches. The visible graph node
+ // count is much smaller than the number of GGML objects allocated while
+ // building those graphs, so reserve a larger metadata arena than the
+ // generic tensor-count heuristic would provide.
+ return std::max(524288u, n_tokens * 192 + 64u * model.n_tensors());
+ }
uint32_t res = std::max(1024u, 8u*model.n_tensors());
for (const auto & lora : model.loras) {
res += lora->get_n_nodes();
@@ -2183,7 +2208,7 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
}
ggml_cgraph * llama_context::graph_reserve(
- uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
+ uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes, llama_pos pos0) {
LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
GGML_ASSERT(n_outputs >= 1);
@@ -2207,6 +2232,14 @@ ggml_cgraph * llama_context::graph_reserve(
llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
+ if (pos0 != 0 && ubatch.pos != nullptr) {
+ for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+ ubatch.pos[i*ubatch.n_pos] = pos0 + i;
+ for (uint32_t j = 1; j < ubatch.n_pos; ++j) {
+ ubatch.pos[i*ubatch.n_pos + j] = 0;
+ }
+ }
+ }
// set one output token per sequence in order to activate all backend samplers
std::vector seq_ids(n_seqs);
@@ -3357,6 +3390,29 @@ llama_context * llama_init_from_model(
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
}
+ // V4 (DeepSeek4) requires fp16 KV cache: V4's standard SWA K cache,
+ // compressed-attention K cache (cache.attn_k), and indexer K cache
+ // (cache.index_k) all share the same `type_k` and must agree in dtype
+ // because src/models/deepseek4.cpp concatenates the SWA K view with the
+ // compressed K view via ggml_concat (which asserts a->type == b->type).
+ // Furthermore, V4's K activations are post-fp8-quantized
+ // (ggml_dsv4_fp8_kv_quantize), and q8_0's single fp16 scale per 32-element
+ // block cannot faithfully reproduce fp8-quantized value distributions --
+ // pinning to q8_0 corrupts decode silently ("=" loops, "Mirror ..."
+ // garbage). Coerce here, before the SPLIT_MODE_TENSOR / FA / V-quant
+ // shared validations below and before the constructor's flash_attn check,
+ // so those validations see the effective fp16 types and won't reject V4
+ // requests with --cache-type-k|v q8_0. See
+ // docs/plans/v4-port-kv-q8-completion.md.
+ if (model->arch == LLM_ARCH_DEEPSEEK4) {
+ if (params.type_k != GGML_TYPE_F16 || params.type_v != GGML_TYPE_F16) {
+ LLAMA_LOG_WARN("DeepSeek4: forcing fp16 KV cache (--cache-type-k|v are ignored for V4 because compressed/indexer K caches require fp16; "
+ "see docs/plans/v4-port-kv-q8-completion.md)\n");
+ params.type_k = GGML_TYPE_F16;
+ params.type_v = GGML_TYPE_F16;
+ }
+ }
+
if (model->split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
LLAMA_LOG_INFO("%s: enabling flash_attn since it is required for SPLIT_MODE_TENSOR\n", __func__);
diff --git a/src/llama-context.h b/src/llama-context.h
index e16ac4c618ba..999ba5a800c5 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -240,7 +240,8 @@ struct llama_context {
// reserve a graph with a dummy ubatch of the specified size
ggml_cgraph * graph_reserve(
- uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
+ uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr,
+ llama_pos pos0 = 0);
bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 858c297dd762..bbb74a0661b4 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -500,29 +500,41 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
}
void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
- mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
- mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+ if (self_k_idxs && self_k_idxs->buffer) {
+ mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
+ }
+ if (self_v_idxs && self_v_idxs->buffer) {
+ mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+ }
- mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+ if (self_kq_mask && self_kq_mask->buffer) {
+ mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+ }
- mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
- mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+ if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+ mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
+ }
+ if (self_v_idxs_swa && self_v_idxs_swa->buffer) {
+ mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+ }
- mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+ if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
+ mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+ }
- if (self_k_rot) {
+ if (self_k_rot && self_k_rot->buffer) {
mctx->get_base()->set_input_k_rot(self_k_rot);
}
- if (self_v_rot) {
+ if (self_v_rot && self_v_rot->buffer) {
mctx->get_base()->set_input_v_rot(self_v_rot);
}
- if (self_k_rot_swa) {
+ if (self_k_rot_swa && self_k_rot_swa->buffer) {
mctx->get_swa()->set_input_k_rot(self_k_rot_swa);
}
- if (self_v_rot_swa) {
+ if (self_v_rot_swa && self_v_rot_swa->buffer) {
mctx->get_swa()->set_input_v_rot(self_v_rot_swa);
}
}
@@ -534,14 +546,19 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
bool res = true;
- res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
- //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+ if (self_k_idxs && self_k_idxs->buffer) {
+ res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+ //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+ res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
+ }
- res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
- //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+ if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+ res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+ //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
- res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
- res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
+ res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
+ }
return res;
}
@@ -591,7 +608,7 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
const int64_t n_rs = mctx->get_recr()->get_n_rs();
- if (inp_rs->s_copy) {
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
@@ -614,10 +631,12 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
- res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
- res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
- res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ }
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -635,7 +654,7 @@ void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) {
const int64_t n_rs = mctx->get_recr()->get_n_rs();
- if (inp_rs->s_copy) {
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
@@ -657,10 +676,12 @@ bool llm_graph_input_mem_hybrid_k::can_reuse(const llm_graph_params & params) {
res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
- res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
- res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
- res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ }
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -674,38 +695,46 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
// base tensors may not be allocated if there are no non-SWA attention layers
if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
- attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+ if (inp_attn->self_v_idxs && inp_attn->self_v_idxs->buffer) {
+ attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+ }
- attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+ if (inp_attn->self_kq_mask && inp_attn->self_kq_mask->buffer) {
+ attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+ }
}
// swa tensors may not be allocated if there are no SWA attention layers
if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
- attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+ if (inp_attn->self_v_idxs_swa && inp_attn->self_v_idxs_swa->buffer) {
+ attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+ }
- attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+ if (inp_attn->self_kq_mask_swa && inp_attn->self_kq_mask_swa->buffer) {
+ attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+ }
}
- if (inp_attn->self_k_rot) {
+ if (inp_attn->self_k_rot && inp_attn->self_k_rot->buffer) {
attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot);
}
- if (inp_attn->self_v_rot) {
+ if (inp_attn->self_v_rot && inp_attn->self_v_rot->buffer) {
attn_ctx->get_base()->set_input_v_rot(inp_attn->self_v_rot);
}
- if (inp_attn->self_k_rot_swa) {
+ if (inp_attn->self_k_rot_swa && inp_attn->self_k_rot_swa->buffer) {
attn_ctx->get_swa()->set_input_k_rot(inp_attn->self_k_rot_swa);
}
- if (inp_attn->self_v_rot_swa) {
+ if (inp_attn->self_v_rot_swa && inp_attn->self_v_rot_swa->buffer) {
attn_ctx->get_swa()->set_input_v_rot(inp_attn->self_v_rot_swa);
}
const int64_t n_rs = mctx->get_recr()->get_n_rs();
- if (inp_rs->s_copy) {
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
int32_t * data = (int32_t *) inp_rs->s_copy->data;
@@ -741,10 +770,12 @@ bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params)
res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams);
}
- res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+ if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+ res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
- res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
- res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_main->ne[0] == params.ubatch.n_seqs;
+ res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+ }
res &= inp_rs->head == mctx->get_recr()->get_head();
res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -1325,7 +1356,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * gate_up_exps,
ggml_tensor * up_exps_s,
ggml_tensor * gate_exps_s,
- ggml_tensor * down_exps_s) const {
+ ggml_tensor * down_exps_s,
+ ggml_tensor * selected_experts_in) const {
return build_moe_ffn(
cur,
gate_inp, /* gate_inp_b */ nullptr,
@@ -1345,7 +1377,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
/* gate_up_exps_b */ nullptr,
up_exps_s,
gate_exps_s,
- down_exps_s
+ down_exps_s,
+ selected_experts_in
);
}
@@ -1372,10 +1405,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
ggml_tensor * gate_up_exps_b,
ggml_tensor * up_exps_s,
ggml_tensor * gate_exps_s,
- ggml_tensor * down_exps_s) const {
+ ggml_tensor * down_exps_s,
+ ggml_tensor * selected_experts_in) const {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
+ const bool weight_before_down = arch == LLM_ARCH_DEEPSEEK4; // DeepSeek V4 applies routed weights after SwiGLU and before w2
ggml_tensor * logits = nullptr;
@@ -1401,6 +1436,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
{
probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
} break;
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS:
+ {
+ probs = ggml_sqrt(ctx0, ggml_softplus(ctx0, logits)); // [n_expert, n_tokens]
+ } break;
case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT:
{
probs = logits; // [n_expert, n_tokens]
@@ -1455,8 +1494,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
}
// select experts
- ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
- cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ ggml_tensor * selected_experts = selected_experts_in;
+ if (selected_experts == nullptr) {
+ selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+ cb(selected_experts->src[0], "ffn_moe_argsort", il);
+ }
cb(selected_experts, "ffn_moe_topk", il);
if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
@@ -1584,6 +1626,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
switch (type_op) {
case LLM_FFN_SILU:
if (gate_exps) {
+ if (arch == LLM_ARCH_DEEPSEEK4 && il >= 0) {
+ const float limit = hparams.swiglu_clamp_exp[il];
+ constexpr float eps = 1e-6f;
+ if (limit > eps) {
+ cur = ggml_clamp(ctx0, cur, -INFINITY, limit);
+ cb(cur, "ffn_moe_gate_clamped", il);
+
+ ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+ cb(gate_act, "ffn_moe_silu", il);
+
+ up = ggml_clamp(ctx0, up, -limit, limit);
+ cb(up, "ffn_moe_up_clamped", il);
+
+ cur = ggml_mul(ctx0, gate_act, up);
+ cb(cur, "ffn_moe_swiglu_limited", il);
+ break;
+ }
+ }
+
// Step35: per-layer clamp for routed experts
if (arch == LLM_ARCH_STEP35 && il >= 0) {
const float limit = hparams.swiglu_clamp_exp[il];
@@ -1648,6 +1709,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
GGML_ABORT("fatal error");
}
+ if (weight_before_down) {
+ cur = ggml_mul(ctx0, cur, weights);
+ cb(cur, "ffn_moe_weighted_swiglu", il);
+ }
+
experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
cb(experts, "ffn_moe_down", il);
@@ -1665,7 +1731,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
cb(experts, "ffn_moe_down_scaled", il);
}
- if (!weight_before_ffn) {
+ if (!weight_before_ffn && !weight_before_down) {
experts = ggml_mul(ctx0, experts, weights);
cb(experts, "ffn_moe_weighted", il);
}
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 9e55d0a675e0..260334f7302f 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -849,7 +849,8 @@ struct llm_graph_context {
ggml_tensor * gate_up_exps = nullptr,
ggml_tensor * up_exps_s = nullptr,
ggml_tensor * gate_exps_s = nullptr,
- ggml_tensor * down_exps_s = nullptr) const;
+ ggml_tensor * down_exps_s = nullptr,
+ ggml_tensor * selected_experts_in = nullptr) const;
ggml_tensor * build_moe_ffn(
ggml_tensor * cur,
@@ -874,7 +875,8 @@ struct llm_graph_context {
ggml_tensor * gate_up_exps_b = nullptr,
ggml_tensor * up_exps_s = nullptr,
ggml_tensor * gate_exps_s = nullptr,
- ggml_tensor * down_exps_s = nullptr) const;
+ ggml_tensor * down_exps_s = nullptr,
+ ggml_tensor * selected_experts_in = nullptr) const;
//
// inputs
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 2239309c8fb4..44eaf501f7dc 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -153,6 +153,10 @@ uint32_t llama_hparams::n_embd_v_gqa_max() const {
}
uint32_t llama_hparams::n_embd_r() const {
+ if (dsv4_state_size != 0) {
+ return dsv4_state_size;
+ }
+
if (wkv_head_size != 0) {
// for RWKV models
return token_shift_count * n_embd;
@@ -177,6 +181,10 @@ uint32_t llama_hparams::n_embd_r() const {
}
uint32_t llama_hparams::n_embd_s() const {
+ if (dsv4_state_size != 0) {
+ return dsv4_state_size;
+ }
+
if (wkv_head_size != 0) {
// corresponds to RWKV's wkv_states size
return n_embd * wkv_head_size;
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index e2d051edc6cd..3a0438283e77 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -14,6 +14,7 @@ enum llama_expert_gating_func_type {
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
+ LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS = 4,
};
enum llama_swa_type {
@@ -75,6 +76,8 @@ struct llama_hparams {
uint32_t n_layer_dense_lead = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
+ uint32_t n_lora_o = 0;
+ uint32_t n_attn_out_groups = 0;
uint32_t n_ff_exp = 0;
uint32_t n_ff_shexp = 0;
uint32_t n_ff_chexp = 0;
@@ -91,6 +94,7 @@ struct llama_hparams {
uint32_t moe_every_n_layers = 0;
uint32_t moe_latent_size = 0;
uint32_t nextn_predict_layers = 0;
+ uint32_t n_hash_layers = 0;
bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)
@@ -211,6 +215,14 @@ struct llama_hparams {
uint32_t indexer_head_size = 0;
uint32_t indexer_top_k = 0;
+ // DeepSeek V4 hyper-connections and sparse KV compression
+ uint32_t n_hc = 1;
+ uint32_t hc_sinkhorn_iters = 0;
+ float hc_eps = 0.0f;
+ float compress_rope_freq_base = 0.0f;
+ uint32_t dsv4_state_size = 0;
+ std::array attn_compress_ratio;
+
// qwen3vl deepstack
uint32_t n_deepstack_layers = 0;
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
index 26e2cb4270b0..9b9f17903637 100644
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
kv_base = std::make_unique(
- model, type_k, type_v,
+ model, hparams, type_k, type_v,
v_trans, offload, unified, size_base, n_seq_max, n_pad,
0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
kv_swa = std::make_unique(
- model, type_k, type_v,
+ model, hparams, type_k, type_v,
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
hparams.n_swa, hparams.swa_type, filter_swa, reuse);
}
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a49a055a6304..92585b671b55 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -79,6 +79,7 @@ static ggml_tensor * ggml_mul_mat_aux(
llama_kv_cache::llama_kv_cache(
const llama_model & model,
+ const llama_hparams & hparams,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
@@ -91,7 +92,7 @@ llama_kv_cache::llama_kv_cache(
llama_swa_type swa_type,
const layer_filter_cb & filter,
const layer_reuse_cb & reuse) :
- model(model), hparams(model.hparams), v_trans(v_trans),
+ model(model), hparams(hparams), v_trans(v_trans),
n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
GGML_ASSERT(kv_size % n_pad == 0);
@@ -205,7 +206,7 @@ llama_kv_cache::llama_kv_cache(
}
const bool has_k = true;
- const bool has_v = !is_mla;
+ const bool has_v = !is_mla && model.arch != LLM_ARCH_DEEPSEEK4;
ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
@@ -253,7 +254,7 @@ llama_kv_cache::llama_kv_cache(
// allocate tensors and initialize the buffers to avoid NaNs in the padding
for (auto & [buft, ctx] : ctx_map) {
ggml_backend_buffer_t buf;
- if (model.hparams.no_alloc) {
+ if (hparams.no_alloc) {
buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 0b62dc7b2320..0b0a56ce92f4 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -95,6 +95,7 @@ class llama_kv_cache : public llama_memory_i {
llama_kv_cache(
const llama_model & model,
+ const llama_hparams & hparams,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp
index a59561ea54dd..58dadabc9f62 100644
--- a/src/llama-memory-hybrid-iswa.cpp
+++ b/src/llama-memory-hybrid-iswa.cpp
@@ -1,9 +1,113 @@
#include "llama-memory-hybrid-iswa.h"
+#include "ggml-backend.h"
+
#include "llama-impl.h"
+#include "llama-io.h"
#include "llama-model.h"
#include "llama-context.h"
+#include
+#include
+#include
+#include