diff --git a/common/chat.cpp b/common/chat.cpp
index 70b9f5dc2c58..38f7a2ed744a 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1661,6 +1661,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
 static common_chat_params common_chat_params_init_deepseek_v3_2(const common_chat_template &    tmpl,
                                                                  const autoparser::generation_params & inputs) {
     common_chat_params data;
+    const auto & src = tmpl.source();
 
     data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
     data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
@@ -1681,8 +1682,9 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
     const std::string DSML         = "｜DSML｜";
     const std::string THINK_START  = "<think>";
     const std::string THINK_END    = "</think>";
-    const std::string FC_START     = "<" + DSML + "function_calls>";
-    const std::string FC_END       = "</" + DSML + "function_calls>";
+    const std::string FC_NAME      = src.find("function_calls") != std::string::npos ? "function_calls" : "tool_calls";
+    const std::string FC_START     = "<" + DSML + FC_NAME + ">";
+    const std::string FC_END       = "</" + DSML + FC_NAME + ">";
     const std::string INVOKE_START = "<" + DSML + "invoke";
     const std::string INVOKE_END   = "</" + DSML + "invoke>";
     const std::string PARAM_START  = "<" + DSML + "parameter";
@@ -2093,12 +2095,12 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
         return common_chat_params_init_gigachat_v3(tmpl, params);
     }
 
-    // DeepSeek V3.2 format detection: template defines dsml_token and uses it for tool calls.
+    // DeepSeek DSML format detection: template defines dsml_token and uses it for tool calls.
     // The template source contains the token as a variable assignment, not as a literal in markup.
     if (src.find("dsml_token") != std::string::npos &&
-        src.find("function_calls") != std::string::npos &&
+        (src.find("function_calls") != std::string::npos || src.find("tool_calls") != std::string::npos) &&
         src.find("DSML") != std::string::npos) {
-        LOG_DBG("Using specialized template: DeepSeek V3.2\n");
+        LOG_DBG("Using specialized template: DeepSeek DSML\n");
         return common_chat_params_init_deepseek_v3_2(tmpl, params);
     }
 
diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2c38123dff8d..bba37a5cbbc7 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -47,6 +47,7 @@
     "DeepseekForCausalLM": "deepseek",
     "DeepseekV2ForCausalLM": "deepseek",
     "DeepseekV3ForCausalLM": "deepseek",
+    "DeepseekV4ForCausalLM": "deepseek",
     "DistilBertForMaskedLM": "bert",
     "DistilBertForSequenceClassification": "bert",
     "DistilBertModel": "bert",
diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index e149fcbf752e..86a3046b9e98 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -1,18 +1,26 @@
 from __future__ import annotations
 
+import concurrent.futures
+import ctypes
+import math
+import os
 import re
 
-from typing import Any, Callable, Iterable, TYPE_CHECKING
+from pathlib import Path
+from typing import Any, Callable, Iterable, Sequence, TYPE_CHECKING
 
+import numpy as np
 import torch
 
 if TYPE_CHECKING:
     from torch import Tensor
 
-from .base import MmprojModel, ModelBase, TextModel, gguf, logger
+from .base import LazyTorchTensor, MmprojModel, ModelBase, TextModel, gguf, logger
 
 from .qwen import QwenModel
 
+TORCH_FLOAT8_E8M0FNU = getattr(torch, "float8_e8m0fnu", None)
+
 
 @ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
@@ -386,3 +394,648 @@ def prepare_tensors(self):
             experts = [k for d in self._experts for k in d.keys()]
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("DeepseekV4ForCausalLM")
+class DeepseekV4Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK4
+
+    # Optional DeepSeek V4 debug / expert-quant knobs. In the pre-#17114
+    # monolithic convert_hf_to_gguf.py these were ModelBase.__init__ params
+    # wired to --deepseek4-* CLI flags. The refactored conversion/base.py
+    # ModelBase.__init__ does not accept them, so they default here; the
+    # standard DeepseekV4ForCausalLM conversion path does not require them.
+    deepseek4_max_layers: int | None = None
+    deepseek4_expert_outtypes: str | None = None
+    deepseek4_expert_workers: int = 1
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    _fp4_table = torch.tensor([
+        0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
+        0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,
+    ], dtype=torch.float32)
+
+    _qtype_aliases: dict[str, gguf.GGMLQuantizationType] = {
+        "q8_0": gguf.GGMLQuantizationType.Q8_0,
+        "q2_k": gguf.GGMLQuantizationType.Q2_K,
+        "iq2_xxs": gguf.GGMLQuantizationType.IQ2_XXS,
+        "iq2_xs": gguf.GGMLQuantizationType.IQ2_XS,
+        "tq1_0": gguf.GGMLQuantizationType.TQ1_0,
+        "tq2_0": gguf.GGMLQuantizationType.TQ2_0,
+    }
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self._deepseek4_original_block_count = self.block_count
+        if self.deepseek4_max_layers is not None:
+            if self.deepseek4_max_layers <= 0:
+                raise ValueError("--deepseek4-max-layers must be positive")
+            if self.deepseek4_max_layers > self.block_count:
+                raise ValueError(
+                    f"--deepseek4-max-layers={self.deepseek4_max_layers} exceeds model layer count {self.block_count}"
+                )
+            self.block_count = self.deepseek4_max_layers
+            self.hparams["num_hidden_layers"] = self.block_count
+            self.hparams["n_layers"] = self.block_count
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
+            logger.warning(
+                "DeepSeek V4 debug export: writing only the first %d/%d transformer layers",
+                self.block_count,
+                self._deepseek4_original_block_count,
+            )
+
+        self._deepseek4_expert_qtypes = self._parse_expert_outtype_spec(self.deepseek4_expert_outtypes)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):
+        self.hparams["num_key_value_heads"] = self.hparams.get("num_key_value_heads", 1)
+
+        super().set_gguf_parameters()
+        hparams = self.hparams
+
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_attention_output_lora_rank(hparams["o_lora_rank"])
+        self.gguf_writer.add_attention_output_group_count(hparams["o_groups"])
+        self.gguf_writer.add_attention_compress_ratios(hparams["compress_ratios"])
+        self.gguf_writer.add_attention_compress_rope_freq_base(hparams["compress_rope_theta"])
+
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams.get("routed_scaling_factor", 1.0))
+        self.gguf_writer.add_hash_layer_count(min(hparams["num_hash_layers"], self.block_count))
+        if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+        if (swiglu_limit := hparams.get("swiglu_limit")) is not None and float(swiglu_limit) > 0.0:
+            self.gguf_writer.add_swiglu_clamp_exp([float(swiglu_limit)] * self.block_count)
+
+        if (sliding_window := hparams.get("sliding_window")) is not None:
+            self.gguf_writer.add_sliding_window(sliding_window)
+
+        self.gguf_writer.add_indexer_head_count(hparams["index_n_heads"])
+        self.gguf_writer.add_indexer_key_length(hparams["index_head_dim"])
+        self.gguf_writer.add_indexer_top_k(hparams["index_topk"])
+
+        if self.deepseek4_max_layers is None and (num_nextn_predict_layers := hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+
+        self.gguf_writer.add_hyper_connection_count(hparams["hc_mult"])
+        self.gguf_writer.add_hyper_connection_sinkhorn_iters(hparams["hc_sinkhorn_iters"])
+        self.gguf_writer.add_hyper_connection_eps(hparams["hc_eps"])
+
+    @staticmethod
+    def _strip_model_prefix(name: str) -> str:
+        return name.removeprefix("model.")
+
+    def _skip_layer_tensor(self, stripped_name: str) -> bool:
+        if self.deepseek4_max_layers is None:
+            return False
+        match = re.match(r"layers\.(\d+)\.", stripped_name)
+        return match is not None and int(match.group(1)) >= self.block_count
+
+    @staticmethod
+    def _is_low_bit_ftype(ftype: gguf.LlamaFileType) -> bool:
+        return ftype in (
+            gguf.LlamaFileType.MOSTLY_TQ1_0,
+            gguf.LlamaFileType.MOSTLY_TQ2_0,
+            gguf.LlamaFileType.MOSTLY_Q2_K,
+            gguf.LlamaFileType.MOSTLY_IQ2_XXS,
+            gguf.LlamaFileType.MOSTLY_IQ2_XS,
+        )
+
+    @staticmethod
+    def _qtype_for_ftype(ftype: gguf.LlamaFileType) -> gguf.GGMLQuantizationType | None:
+        return {
+            gguf.LlamaFileType.MOSTLY_TQ1_0: gguf.GGMLQuantizationType.TQ1_0,
+            gguf.LlamaFileType.MOSTLY_TQ2_0: gguf.GGMLQuantizationType.TQ2_0,
+            gguf.LlamaFileType.MOSTLY_Q2_K: gguf.GGMLQuantizationType.Q2_K,
+            gguf.LlamaFileType.MOSTLY_IQ2_XXS: gguf.GGMLQuantizationType.IQ2_XXS,
+            gguf.LlamaFileType.MOSTLY_IQ2_XS: gguf.GGMLQuantizationType.IQ2_XS,
+            gguf.LlamaFileType.MOSTLY_Q8_0: gguf.GGMLQuantizationType.Q8_0,
+        }.get(ftype)
+
+    @classmethod
+    def _parse_qtype_name(cls, name: str) -> gguf.GGMLQuantizationType:
+        qtype = cls._qtype_aliases.get(name.strip().lower())
+        if qtype is None:
+            allowed = ", ".join(sorted(cls._qtype_aliases))
+            raise ValueError(f"unknown DeepSeek V4 expert outtype {name!r}; expected one of: {allowed}")
+        return qtype
+
+    @classmethod
+    def _parse_expert_outtype_spec(cls, spec: str | None) -> dict[str, gguf.GGMLQuantizationType]:
+        if spec is None:
+            return {}
+
+        result: dict[str, gguf.GGMLQuantizationType] = {}
+        for item in spec.split(","):
+            item = item.strip()
+            if not item:
+                continue
+            if "=" not in item:
+                qtype = cls._parse_qtype_name(item)
+                result.update({"w1": qtype, "w2": qtype, "w3": qtype})
+                continue
+            key, value = (part.strip().lower() for part in item.split("=", 1))
+            if key not in ("w1", "w2", "w3", "gate", "down", "up"):
+                raise ValueError(f"unknown DeepSeek V4 expert tensor selector {key!r}")
+            wid = {"gate": "w1", "down": "w2", "up": "w3"}.get(key, key)
+            result[wid] = cls._parse_qtype_name(value)
+        return result
+
+    @staticmethod
+    def _scale_to_float(scale: Tensor) -> Tensor:
+        if TORCH_FLOAT8_E8M0FNU is not None and scale.dtype == TORCH_FLOAT8_E8M0FNU:
+            return scale.float()
+
+        if scale.dtype in (torch.uint8, torch.int8):
+            e = scale.view(torch.uint8).to(torch.int32)
+            bits = torch.where(
+                e == 0,
+                torch.full_like(e, 0x00400000),
+                e << 23,
+            )
+            return bits.view(torch.float32)
+
+        return scale.float()
+
+    @staticmethod
+    def _scale_to_e8m0_bytes(scale: Tensor) -> Tensor:
+        if TORCH_FLOAT8_E8M0FNU is not None and scale.dtype == TORCH_FLOAT8_E8M0FNU:
+            return scale.view(torch.uint8)
+        if scale.dtype in (torch.uint8, torch.int8):
+            return scale.view(torch.uint8)
+
+        scale = scale.float()
+        e = torch.where(
+            scale > 0,
+            torch.floor(torch.log2(scale)).to(torch.int32) + 127,
+            torch.zeros_like(scale, dtype=torch.int32),
+        )
+        return torch.clamp(e, 0, 255).to(torch.uint8)
+
+    @classmethod
+    def _dequant_fp8_weight(cls, weight: Tensor, scale: Tensor, block_size: Sequence[int]) -> Tensor:
+        if len(block_size) != 2:
+            raise ValueError(f"DeepSeek V4 expects 2D FP8 block scales, got block size {block_size}")
+
+        block_out, block_in = block_size
+        out_dim, in_dim = weight.shape
+        if out_dim % block_out != 0 or in_dim % block_in != 0:
+            raise ValueError(f"FP8 tensor shape {tuple(weight.shape)} is not divisible by block size {block_size}")
+
+        scale = cls._scale_to_float(scale)
+        expected_scale = (out_dim // block_out, in_dim // block_in)
+        if tuple(scale.shape) != expected_scale:
+            raise ValueError(f"FP8 scale shape {tuple(scale.shape)} does not match expected {expected_scale}")
+
+        weight = weight.reshape(out_dim // block_out, block_out, in_dim // block_in, block_in)
+        weight = weight.float() * scale[:, None, :, None]
+        return weight.reshape(out_dim, in_dim)
+
+    @classmethod
+    def _dequant_fp4_weight(cls, weight: Tensor, scale: Tensor) -> Tensor:
+        weight = weight.view(torch.uint8)
+        out_dim, packed_in_dim = weight.shape
+        in_dim = packed_in_dim * 2
+        if in_dim % 32 != 0:
+            raise ValueError(f"FP4 packed tensor shape {tuple(weight.shape)} does not contain 32-value blocks")
+
+        n_blocks = in_dim // 32
+        scale = cls._scale_to_float(scale)
+        if tuple(scale.shape) != (out_dim, n_blocks):
+            raise ValueError(f"FP4 scale shape {tuple(scale.shape)} does not match expected {(out_dim, n_blocks)}")
+
+        fp4_table = cls._fp4_table.to(weight.device)
+        packed = weight.reshape(out_dim, n_blocks, 16)
+        low = packed & 0x0F
+        high = (packed >> 4) & 0x0F
+        vals = torch.stack((low, high), dim=-1).reshape(out_dim, n_blocks, 32)
+        vals = fp4_table[vals.long()] * scale.unsqueeze(-1)
+        return vals.reshape(out_dim, in_dim)
+
+    @classmethod
+    def _pack_fp4_as_mxfp4(cls, weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:
+        weight = weight.view(torch.uint8)
+        out_dim, packed_in_dim = weight.shape
+        in_dim = packed_in_dim * 2
+        if in_dim % 32 != 0:
+            raise ValueError(f"FP4 packed tensor shape {tuple(weight.shape)} does not contain 32-value blocks")
+
+        n_blocks = in_dim // 32
+        scale_e = cls._scale_to_e8m0_bytes(scale)
+        if tuple(scale_e.shape) != (out_dim, n_blocks):
+            raise ValueError(f"FP4 scale shape {tuple(scale_e.shape)} does not match expected {(out_dim, n_blocks)}")
+
+        packed = weight.reshape(out_dim, n_blocks, 16)
+        low = packed & 0x0F
+        high = (packed >> 4) & 0x0F
+        vals = torch.stack((low, high), dim=-1).reshape(out_dim, n_blocks, 32)
+        qs = vals[:, :, :16] | (vals[:, :, 16:] << 4)
+        raw = torch.cat((scale_e.unsqueeze(-1), qs), dim=-1).reshape(out_dim, n_blocks * 17)
+        return raw.numpy(), [out_dim, in_dim]
+
+    _ggml_quant_lib: Any = None
+
+    @classmethod
+    def _load_ggml_quant_lib(cls):
+        if cls._ggml_quant_lib is not None:
+            return cls._ggml_quant_lib
+
+        # This module lives in the conversion/ package; the repo root (where
+        # build/bin/libggml.* lands) is its parent's parent. In the pre-#17114
+        # monolithic convert_hf_to_gguf.py, __file__ was the repo-root script,
+        # so .parent alone was the repo root -- search both so the lookup is
+        # correct regardless of package layout.
+        repo_root = Path(__file__).resolve().parent.parent
+        pkg_root  = Path(__file__).resolve().parent
+        candidates = [
+            os.environ.get("LLAMA_CPP_LIBGGML"),
+            repo_root / "build" / "bin" / "libggml.dylib",
+            repo_root / "build" / "bin" / "libggml.so",
+            repo_root / "build" / "bin" / "ggml.dll",
+            pkg_root  / "build" / "bin" / "libggml.dylib",
+            pkg_root  / "build" / "bin" / "libggml.so",
+            pkg_root  / "build" / "bin" / "ggml.dll",
+        ]
+        for candidate in candidates:
+            if candidate is None:
+                continue
+            path = Path(candidate)
+            if not path.is_file():
+                continue
+            lib = ctypes.CDLL(str(path))
+            lib.ggml_quantize_chunk.restype = ctypes.c_size_t
+            lib.ggml_quantize_chunk.argtypes = (
+                ctypes.c_int,
+                ctypes.POINTER(ctypes.c_float),
+                ctypes.c_void_p,
+                ctypes.c_int64,
+                ctypes.c_int64,
+                ctypes.c_int64,
+                ctypes.POINTER(ctypes.c_float),
+            )
+            lib.ggml_quantize_requires_imatrix.restype = ctypes.c_bool
+            lib.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,)
+            cls._ggml_quant_lib = lib
+            return lib
+
+        raise RuntimeError(
+            "DeepSeek V4 low-bit expert conversion needs llama.cpp's libggml. "
+            "Build llama.cpp first or set LLAMA_CPP_LIBGGML to libggml."
+        )
+
+    @classmethod
+    def _quantize_deepseek4_expert(cls, data: np.ndarray, qtype: gguf.GGMLQuantizationType) -> np.ndarray:
+        c_quantized_types = {
+            gguf.GGMLQuantizationType.Q2_K,
+            gguf.GGMLQuantizationType.IQ2_XXS,
+            gguf.GGMLQuantizationType.IQ2_XS,
+        }
+        if qtype not in c_quantized_types:
+            return gguf.quants.quantize(data, qtype)
+
+        data = np.ascontiguousarray(data, dtype=np.float32)
+        out = np.zeros(gguf.quant_shape_to_byte_shape(data.shape, qtype), dtype=np.uint8, order="C")
+        lib = cls._load_ggml_quant_lib()
+        nrows = math.prod(data.shape[:-1])
+        n_per_row = data.shape[-1]
+        imatrix = ctypes.cast(0, ctypes.POINTER(ctypes.c_float))
+        if lib.ggml_quantize_requires_imatrix(qtype.value):
+            qw = np.ascontiguousarray(np.sum(data.reshape(-1, n_per_row) ** 2, axis=0), dtype=np.float32)
+            imatrix = qw.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
+        result_size = lib.ggml_quantize_chunk(
+            qtype.value,
+            data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+            out.ctypes.data_as(ctypes.c_void_p),
+            0,
+            nrows,
+            n_per_row,
+            imatrix,
+        )
+        if result_size != out.size:
+            raise RuntimeError(f"ggml_quantize_chunk wrote {result_size} bytes, expected {out.size}")
+        return out
+
+    def _write_deepseek4_tid2eid_tensors(self) -> set[str]:
+        consumed: set[str] = set()
+        for name in list(self.model_tensors.keys()):
+            stripped = self._strip_model_prefix(name)
+            if self._skip_layer_tensor(stripped):
+                consumed.add(name)
+                continue
+            if re.match(r"layers\.\d+\.ffn\.gate\.tid2eid$", stripped) is None:
+                continue
+
+            data = LazyTorchTensor.to_eager(self.model_tensors[name]()).to(torch.int32).numpy()
+            new_name = self.map_tensor_name(stripped)
+            logger.info(f"{new_name}, int32 --> I32, shape = {{{', '.join(str(n) for n in reversed(data.shape))}}}")
+            self.gguf_writer.add_tensor(new_name, data)
+            consumed.add(name)
+        return consumed
+
+    def _write_deepseek4_expert_tensors(self) -> set[str]:
+        default_qtype = self._qtype_for_ftype(self.ftype)
+        if default_qtype is None and not self._deepseek4_expert_qtypes:
+            if any(re.match(r"(?:model\.)?layers\.\d+\.ffn\.experts\.\d+\.w[123]\.weight$", name) for name in self.model_tensors):
+                raise NotImplementedError(
+                    "DeepSeek V4 routed FP4 experts must be converted directly to a compact GGUF type. "
+                    "Use --outtype iq2_xxs, iq2_xs, q2_k, tq2_0, tq1_0, or q8_0."
+                )
+            return set()
+
+        n_experts = self.hparams["n_routed_experts"]
+        consumed: set[str] = set()
+        groups: dict[tuple[int, str], dict[int, tuple[str, str]]] = {}
+
+        for name in list(self.model_tensors.keys()):
+            stripped = self._strip_model_prefix(name)
+            if self._skip_layer_tensor(stripped):
+                consumed.add(name)
+                continue
+            match = re.match(r"layers\.(\d+)\.ffn\.experts\.(\d+)\.(w[123])\.weight$", stripped)
+            if match is None:
+                continue
+
+            bid = int(match.group(1))
+            xid = int(match.group(2))
+            wid = match.group(3)
+            qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype)
+            if qtype is None:
+                raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}")
+            scale_name = f"{stripped.removesuffix('.weight')}.scale"
+            model_scale_name = scale_name if scale_name in self.model_tensors else f"model.{scale_name}"
+            if model_scale_name not in self.model_tensors:
+                raise ValueError(f"Missing DeepSeek V4 FP4 scale tensor for {stripped}")
+
+            groups.setdefault((bid, wid), {})[xid] = (name, model_scale_name)
+            consumed.update((name, model_scale_name))
+
+        def convert_one(name: str, model_scale_name: str, qtype: gguf.GGMLQuantizationType) -> np.ndarray:
+            weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
+            scale = LazyTorchTensor.to_eager(self.model_tensors[model_scale_name]())
+
+            if qtype == gguf.GGMLQuantizationType.MXFP4:
+                data, _ = self._pack_fp4_as_mxfp4(weight, scale)
+                return data
+
+            data = self._dequant_fp4_weight(weight, scale).numpy()
+            return self._quantize_deepseek4_expert(data, qtype)
+
+        def add_merged_tensor(bid: int, wid: str, qtype: gguf.GGMLQuantizationType, experts: dict[int, np.ndarray]) -> None:
+            missing = sorted(set(range(n_experts)).difference(experts))
+            if missing:
+                raise ValueError(f"Missing DeepSeek V4 expert tensors for layer {bid} {wid}: {missing[:8]}")
+
+            merged = np.stack([experts[i] for i in range(n_experts)], axis=0)
+            merged_name = f"layers.{bid}.ffn.experts.{wid}.weight"
+            new_name = self.map_tensor_name(merged_name)
+            shape = gguf.quant_shape_from_byte_shape(merged.shape, qtype) if merged.dtype == np.uint8 else merged.shape
+            shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
+            logger.info(f"{new_name}, DeepSeek FP4 --> {qtype.name}, shape = {shape_str}")
+            self.gguf_writer.add_tensor(new_name, merged, raw_dtype=qtype)
+
+        worker_count = max(1, self.deepseek4_expert_workers)
+        for bid, wid in sorted(groups):
+            qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype)
+            if qtype is None:
+                raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}")
+            group = groups[(bid, wid)]
+            experts: dict[int, np.ndarray] = {}
+            logger.info(
+                "DeepSeek V4: quantizing blk.%d %s experts to %s with %d worker%s",
+                bid,
+                wid,
+                qtype.name,
+                worker_count,
+                "" if worker_count == 1 else "s",
+            )
+
+            if worker_count == 1:
+                for done, xid in enumerate(sorted(group), start=1):
+                    name, model_scale_name = group[xid]
+                    experts[xid] = convert_one(name, model_scale_name, qtype)
+                    if done % 32 == 0 or done == n_experts:
+                        logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts)
+            else:
+                max_pending = worker_count * 2
+                pending: dict[concurrent.futures.Future[np.ndarray], int] = {}
+                xids = iter(sorted(group))
+                done = 0
+
+                with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
+                    def submit_next() -> bool:
+                        try:
+                            xid = next(xids)
+                        except StopIteration:
+                            return False
+                        name, model_scale_name = group[xid]
+                        future = executor.submit(convert_one, name, model_scale_name, qtype)
+                        pending[future] = xid
+                        return True
+
+                    while len(pending) < max_pending and submit_next():
+                        pass
+
+                    while pending:
+                        finished, _ = concurrent.futures.wait(
+                            pending,
+                            return_when=concurrent.futures.FIRST_COMPLETED,
+                        )
+                        for future in finished:
+                            xid = pending.pop(future)
+                            experts[xid] = future.result()
+                            done += 1
+                            if done % 32 == 0 or done == n_experts:
+                                logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts)
+                            submit_next()
+
+            add_merged_tensor(bid, wid, qtype, experts)
+
+        return consumed
+
+    def _prepare_deepseek4_scaled_tensors(self) -> None:
+        block_size = (self.hparams.get("quantization_config") or {}).get("weight_block_size", [128, 128])
+        consumed: set[str] = set()
+
+        for name in list(self.model_tensors.keys()):
+            stripped = self._strip_model_prefix(name)
+            if stripped.startswith("mtp.") or self._skip_layer_tensor(stripped):
+                consumed.add(name)
+
+        consumed.update(self._write_deepseek4_tid2eid_tensors())
+        consumed.update(self._write_deepseek4_expert_tensors())
+
+        for name in list(self.model_tensors.keys()):
+            if name in consumed:
+                continue
+            stripped = self._strip_model_prefix(name)
+            if not stripped.endswith(".scale"):
+                continue
+            if re.match(r"layers\.\d+\.ffn\.experts\.\d+\.w[123]\.scale$", stripped) is not None:
+                continue
+
+            weight_name = f"{stripped.removesuffix('.scale')}.weight"
+            model_weight_name = weight_name if weight_name in self.model_tensors else f"model.{weight_name}"
+            if model_weight_name not in self.model_tensors:
+                raise ValueError(f"Missing DeepSeek V4 FP8 weight tensor for scale {stripped}")
+
+            w = self.model_tensors[model_weight_name]
+            s = self.model_tensors[name]
+            self.model_tensors[model_weight_name] = (
+                lambda w=w, s=s, bs=block_size: self._dequant_fp8_weight(
+                    LazyTorchTensor.to_eager(w()),
+                    LazyTorchTensor.to_eager(s()),
+                    bs,
+                )
+            )
+            consumed.add(name)
+
+        for name in consumed:
+            self.model_tensors.pop(name, None)
+
+    def prepare_tensors(self):
+        self._prepare_deepseek4_scaled_tensors()
+
+        if any(name.endswith(".scale") for name in self.model_tensors):
+            raise NotImplementedError("Unhandled DeepSeek V4 scale tensors remain after conversion preparation")
+
+        super().prepare_tensors()
+
+        if self._experts is not None:
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+        del name
+        del new_name
+        del bid
+
+        if not self._is_low_bit_ftype(self.ftype) or n_dims <= 1:
+            return False
+
+        # DeepSeek V4 routed experts are handled in _write_deepseek4_expert_tensors(),
+        # where each expert is converted directly from FP4 to the requested compact
+        # GGUF type.  Keep the rest of the model in float form so attention,
+        # hyper-connections, indexers, compressors, shared experts and logits do not
+        # inherit the global low-bit file type.
+        return gguf.GGMLQuantizationType.F16
+
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+        mapped = self._map_tensor_name_deepseek4(name)
+        if mapped is not None:
+            return mapped
+        return super().map_tensor_name(name, try_suffixes)
+
+    def _map_tensor_name_deepseek4(self, name: str) -> str | None:
+        if name.startswith("model."):
+            name = name.removeprefix("model.")
+
+        top_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+            "embed.weight":    (gguf.MODEL_TENSOR.TOKEN_EMBD, ".weight"),
+            "norm.weight":     (gguf.MODEL_TENSOR.OUTPUT_NORM, ".weight"),
+            "head.weight":     (gguf.MODEL_TENSOR.OUTPUT, ".weight"),
+            "hc_head_base":    (gguf.MODEL_TENSOR.OUTPUT_HC_BASE, ".weight"),
+            "hc_head_fn":      (gguf.MODEL_TENSOR.OUTPUT_HC_FN, ".weight"),
+            "hc_head_scale":   (gguf.MODEL_TENSOR.OUTPUT_HC_SCALE, ".weight"),
+        }
+        if name in top_level:
+            tensor, suffix = top_level[name]
+            return self.format_tensor_name(tensor, suffix=suffix)
+
+        match = re.match(r"layers\.(\d+)\.(.+)", name)
+        if match is None:
+            return None
+
+        bid = int(match.group(1))
+        rest = match.group(2)
+
+        layer_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+            "hc_attn_base":                  (gguf.MODEL_TENSOR.HC_ATTN_BASE, ".weight"),
+            "hc_attn_fn":                    (gguf.MODEL_TENSOR.HC_ATTN_FN, ".weight"),
+            "hc_attn_scale":                 (gguf.MODEL_TENSOR.HC_ATTN_SCALE, ".weight"),
+            "hc_ffn_base":                   (gguf.MODEL_TENSOR.HC_FFN_BASE, ".weight"),
+            "hc_ffn_fn":                     (gguf.MODEL_TENSOR.HC_FFN_FN, ".weight"),
+            "hc_ffn_scale":                  (gguf.MODEL_TENSOR.HC_FFN_SCALE, ".weight"),
+            "attn.attn_sink":                (gguf.MODEL_TENSOR.ATTN_SINKS, ".weight"),
+            "attn.wq_a.weight":              (gguf.MODEL_TENSOR.ATTN_Q_A, ".weight"),
+            "attn.wq_b.weight":              (gguf.MODEL_TENSOR.ATTN_Q_B, ".weight"),
+            "attn.q_norm.weight":            (gguf.MODEL_TENSOR.ATTN_Q_A_NORM, ".weight"),
+            "attn.wkv.weight":               (gguf.MODEL_TENSOR.ATTN_KV, ".weight"),
+            "attn.kv_norm.weight":           (gguf.MODEL_TENSOR.ATTN_KV_A_NORM, ".weight"),
+            "attn.wo_a.weight":              (gguf.MODEL_TENSOR.ATTN_OUT_A, ".weight"),
+            "attn.wo_b.weight":              (gguf.MODEL_TENSOR.ATTN_OUT_B, ".weight"),
+            "attn.compressor.ape":           (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_APE, ".weight"),
+            "attn.compressor.wkv.weight":    (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_KV, ".weight"),
+            "attn.compressor.wgate.weight":  (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_GATE, ".weight"),
+            "attn.compressor.norm.weight":   (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_NORM, ".weight"),
+            "attn.indexer.wq_b.weight":      (gguf.MODEL_TENSOR.INDEXER_ATTN_Q_B, ".weight"),
+            "attn.indexer.weights_proj.weight": (gguf.MODEL_TENSOR.INDEXER_PROJ, ".weight"),
+            "attn.indexer.compressor.ape":   (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_APE, ".weight"),
+            "attn.indexer.compressor.wkv.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_KV, ".weight"),
+            "attn.indexer.compressor.wgate.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_GATE, ".weight"),
+            "attn.indexer.compressor.norm.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_NORM, ".weight"),
+            "attn_norm.weight":              (gguf.MODEL_TENSOR.ATTN_NORM, ".weight"),
+            "ffn_norm.weight":               (gguf.MODEL_TENSOR.FFN_NORM, ".weight"),
+            "ffn.shared_experts.w1.weight":  (gguf.MODEL_TENSOR.FFN_GATE_SHEXP, ".weight"),
+            "ffn.shared_experts.w3.weight":  (gguf.MODEL_TENSOR.FFN_UP_SHEXP, ".weight"),
+            "ffn.shared_experts.w2.weight":  (gguf.MODEL_TENSOR.FFN_DOWN_SHEXP, ".weight"),
+            "ffn.gate.weight":               (gguf.MODEL_TENSOR.FFN_GATE_INP, ".weight"),
+            "ffn.gate.bias":                 (gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, ".bias"),
+            "ffn.gate.tid2eid":              (gguf.MODEL_TENSOR.FFN_GATE_TID2EID, ".weight"),
+            "ffn.experts.w1.weight":         (gguf.MODEL_TENSOR.FFN_GATE_EXP, ".weight"),
+            "ffn.experts.w3.weight":         (gguf.MODEL_TENSOR.FFN_UP_EXP, ".weight"),
+            "ffn.experts.w2.weight":         (gguf.MODEL_TENSOR.FFN_DOWN_EXP, ".weight"),
+        }
+        if rest in layer_level:
+            tensor, suffix = layer_level[rest]
+            return self.format_tensor_name(tensor, bid, suffix=suffix)
+
+        return None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if name.startswith("model."):
+            name = name.removeprefix("model.")
+
+        # TODO: llama.cpp does not have Multi-Token Prediction for DeepSeek yet.
+        if name.startswith("mtp."):
+            return
+
+        # process the experts separately
+        match = re.match(r"layers\.(\d+)\.ffn\.experts\.(\d+)\.(w[123])\.weight", name)
+        if match is not None:
+            bid = int(match.group(1))
+            xid = int(match.group(2))
+            wid = match.group(3)
+            n_experts = self.hparams["n_routed_experts"]
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                for w_name in ["w1", "w3", "w2"]:
+                    datas: list[Tensor] = []
+
+                    for expert_id in range(n_experts):
+                        ename = f"layers.{bid}.ffn.experts.{expert_id}.{w_name}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+                    merged_name = f"layers.{bid}.ffn.experts.{w_name}.weight"
+                    yield self.map_tensor_name(merged_name), data_torch
+                return
+
+            del xid, wid
+            return
+
+        yield self.map_tensor_name(name), data_torch
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 41566d41aef3..f71943ed33aa 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -567,6 +567,11 @@ extern "C" {
         GGML_OP_RWKV_WKV7,
         GGML_OP_SOLVE_TRI,
         GGML_OP_GATED_DELTA_NET,
+        GGML_OP_DSV4_HC_SPLIT_SINKHORN,
+        GGML_OP_DSV4_HC_WEIGHTED_SUM,
+        GGML_OP_DSV4_HC_EXPAND,
+        GGML_OP_DSV4_FP8_KV_QUANTIZE,
+        GGML_OP_DSV4_ROPE_TAIL,
 
         GGML_OP_UNARY,
 
@@ -2555,6 +2560,61 @@ extern "C" {
             struct ggml_tensor  * beta,
             struct ggml_tensor  * state);
 
+    // DeepSeek V4 hyperconnection helper.
+    // Splits [mix, tokens] into pre/post/comb regions and applies the
+    // Sinkhorn normalization used by the reference implementation.
+    GGML_API struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * mixes,
+            struct ggml_tensor  * scale,
+            struct ggml_tensor  * base,
+            int                   n_hc,
+            int                   sinkhorn_iters,
+            float                 eps);
+
+    // DeepSeek V4 hyperconnection weighted-sum helper.
+    // Computes sum_hc weights[hc, token] * x[embd, hc, token].
+    GGML_API struct ggml_tensor * ggml_dsv4_hc_weighted_sum(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * weights);
+
+    // DeepSeek V4 hyperconnection expand helper.
+    // Computes post * block_out + comb^T @ residual for each token.
+    GGML_API struct ggml_tensor * ggml_dsv4_hc_expand(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * block_out,
+            struct ggml_tensor  * residual,
+            struct ggml_tensor  * post,
+            struct ggml_tensor  * comb);
+
+    // DeepSeek V4 FP8 KV-cache simulation helper.
+    // Quantizes/dequantizes the non-RoPE prefix in E4M3FN blocks and leaves
+    // the RoPE tail unchanged, matching the reference inference path.
+    GGML_API struct ggml_tensor * ggml_dsv4_fp8_kv_quantize(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_rot);
+
+    // DeepSeek V4 partial RoPE helper.
+    // Leaves the non-RoPE prefix unchanged and applies RoPE to the tail,
+    // matching ggml_concat(prefix, ggml_rope_ext(tail)).
+    GGML_API struct ggml_tensor * ggml_dsv4_rope_tail(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pos,
+            struct ggml_tensor  * freq_factors,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow,
+            bool                  inverse);
+
     // custom operators
 
     typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index 4e36909f45e9..d51620c288a6 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -754,7 +754,16 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 30
+// V4 multi-GPU note: V4 (DeepSeek-V4) graphs need a higher value (~80-128)
+// when split across multiple devices, due to dense per-layer inputs
+// (hyperconnection × 4 + indexer/compressor state + multiple KV caches).
+// Single-device runs never trip the cap. The constant gates not just
+// `sched_split.inputs[N]` (small) but also `nodes_size` and
+// `context_buffer_size` allocations that scale as `graph_size × N` —
+// bumping the default adds ~200 MB per scheduler instance for V4-sized
+// graphs, paid even by single-GPU users who don't need it. We bump it
+// anyway for the DSv4-Flash use-case (Strix Halo / unified memory).
+#define GGML_SCHED_MAX_SPLIT_INPUTS 128
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index cd5c61a81879..70f8def3a742 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2047,6 +2047,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_gated_delta_net(params, tensor);
             } break;
+        case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+            {
+                ggml_compute_forward_dsv4_hc_split_sinkhorn(params, tensor);
+            } break;
+        case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+            {
+                ggml_compute_forward_dsv4_hc_weighted_sum(params, tensor);
+            } break;
+        case GGML_OP_DSV4_HC_EXPAND:
+            {
+                ggml_compute_forward_dsv4_hc_expand(params, tensor);
+            } break;
+        case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+            {
+                ggml_compute_forward_dsv4_fp8_kv_quantize(params, tensor);
+            } break;
+        case GGML_OP_DSV4_ROPE_TAIL:
+            {
+                ggml_compute_forward_dsv4_rope_tail(params, tensor);
+            } break;
         case GGML_OP_MAP_CUSTOM1:
             {
                 ggml_compute_forward_map_custom1(params, tensor);
@@ -2227,6 +2247,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_COUNT_EQUAL:
         case GGML_OP_SOLVE_TRI:
         case GGML_OP_GATED_DELTA_NET:
+        case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+        case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+        case GGML_OP_DSV4_HC_EXPAND:
+        case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+        case GGML_OP_DSV4_ROPE_TAIL:
             {
                 n_tasks = n_threads;
             } break;
@@ -2847,6 +2872,7 @@ struct ggml_cplan ggml_graph_plan(
                 case GGML_OP_SOFT_MAX:
                 case GGML_OP_ROPE:
                 case GGML_OP_ROPE_BACK:
+                case GGML_OP_DSV4_ROPE_TAIL:
                     {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     } break;
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7485ba4fc861..f473cb724725 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5970,6 +5970,127 @@ void ggml_compute_forward_rope_back(
     }
 }
 
+// ggml_compute_forward_dsv4_rope_tail
+
+template<typename T>
+static void ggml_compute_forward_dsv4_rope_tail_flt(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * src2 = dst->src[2];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+
+    const int n_dims     = ((int32_t *) dst->op_params)[0];
+    const int mode       = ((int32_t *) dst->op_params)[1];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[2];
+    const bool inverse   = ((int32_t *) dst->op_params)[3] != 0;
+
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base,   (int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 9, sizeof(float));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == nb00);
+    GGML_ASSERT(nb0 == sizeof(T));
+    GGML_ASSERT(n_dims <= ne0);
+    GGML_ASSERT(n_dims % 2 == 0);
+    GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX);
+
+    const int64_t n_nope = ne0 - n_dims;
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(dst);
+    const int dr = (nr + nth - 1)/nth;
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    const float * freq_factors = NULL;
+    if (src2 != NULL) {
+        GGML_ASSERT(src2->type == GGML_TYPE_F32);
+        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+        freq_factors = (const float *) src2->data;
+    }
+
+    const float sin_sign = inverse ? -1.0f : 1.0f;
+    const int32_t * pos = (const int32_t *) src1->data;
+
+    int ir = 0;
+    int64_t last_i2 = -1;
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue;
+                if (ir   > ir1) break;
+
+                float * cache = (float *) params->wdata + (n_dims + CACHE_LINE_SIZE_F32)*ith;
+                if (last_i2 != i2) {
+                    const int64_t p = pos[i2];
+                    ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, n_dims, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+                    last_i2 = i2;
+                }
+
+                const T * src = (const T *)((const char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                T * dst_data  =       (T *)((      char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1);
+
+                for (int64_t i0 = 0; i0 < n_nope; ++i0) {
+                    dst_data[i0] = src[i0];
+                }
+
+                const T * src_tail = src + n_nope;
+                T * dst_tail = dst_data + n_nope;
+
+                switch (mode) {
+                    case GGML_ROPE_TYPE_NORMAL:
+                        rotate_pairs<T>(n_dims, 1, cache, src_tail, dst_tail, 1);
+                        break;
+                    case GGML_ROPE_TYPE_NEOX:
+                        rotate_pairs<T>(n_dims, n_dims/2, cache, src_tail, dst_tail);
+                        break;
+                    default:
+                        GGML_ABORT("rope type not supported");
+                }
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_dsv4_rope_tail(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_dsv4_rope_tail_flt<ggml_fp16_t>(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_dsv4_rope_tail_flt<float>(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
 // ggml_compute_forward_conv_transpose_1d
 
 static void ggml_compute_forward_conv_transpose_1d_f16_f32(
@@ -10903,6 +11024,343 @@ void ggml_compute_forward_rwkv_wkv7(
     }
 }
 
+// ggml_compute_forward_dsv4_hc_split_sinkhorn
+
+void ggml_compute_forward_dsv4_hc_split_sinkhorn(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * mixes = dst->src[0];
+    const ggml_tensor * scale = dst->src[1];
+    const ggml_tensor * base  = dst->src[2];
+
+    GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+    GGML_ASSERT(scale->type == GGML_TYPE_F32);
+    GGML_ASSERT(base->type  == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type   == GGML_TYPE_F32);
+    GGML_ASSERT(mixes->nb[0] == sizeof(float));
+    GGML_ASSERT(scale->nb[0] == sizeof(float));
+    GGML_ASSERT(base->nb[0]  == sizeof(float));
+    GGML_ASSERT(dst->nb[0]   == sizeof(float));
+
+    const int n_hc           = ggml_get_op_params_i32(dst, 0);
+    const int sinkhorn_iters = ggml_get_op_params_i32(dst, 1);
+    const float eps          = ggml_get_op_params_f32(dst, 2);
+    const int64_t mix_hc     = mixes->ne[0];
+    const int64_t n_rows     = ggml_nrows(mixes);
+
+    GGML_ASSERT(n_hc > 0 && n_hc <= 16);
+    GGML_ASSERT(sinkhorn_iters > 0);
+    GGML_ASSERT(mix_hc == (2 + n_hc) * n_hc);
+    GGML_ASSERT(ggml_nrows(dst) == n_rows);
+
+    const float * scale_data = (const float *) scale->data;
+    const float * base_data  = (const float *) base->data;
+
+    const float pre_scale  = scale_data[0];
+    const float post_scale = scale_data[1];
+    const float comb_scale = scale_data[2];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t dr = (n_rows + nth - 1) / nth;
+    const int64_t r0 = dr * ith;
+    const int64_t r1 = std::min(r0 + dr, n_rows);
+
+    for (int64_t r = r0; r < r1; ++r) {
+        const float * mix = (const float *) ((const char *) mixes->data + r*mixes->nb[1]);
+        float * out = (float *) ((char *) dst->data + r*dst->nb[1]);
+
+        for (int i = 0; i < n_hc; ++i) {
+            const float z = mix[i] * pre_scale + base_data[i];
+            out[i] = 1.0f / (1.0f + expf(-z)) + eps;
+        }
+
+        for (int i = 0; i < n_hc; ++i) {
+            const int off = n_hc + i;
+            const float z = mix[off] * post_scale + base_data[off];
+            out[off] = 2.0f / (1.0f + expf(-z));
+        }
+
+        float c[16*16];
+
+        // comb is laid out as a flat [n_hc*n_hc] block per token, written as
+        // c[src_hc + dst_hc*n_hc]; after the graph's reshape_3d this is ggml
+        // tensor comb[ne0=src_hc, ne1=dst_hc, ne2=token]. The Sinkhorn pass
+        // below softmaxes over src_hc, then alternates row/col normalization.
+        // ggml_dsv4_hc_expand intentionally reads ggml-dim0 as dst_hc, which
+        // transposes this matrix on read so it computes comb^T @ residual
+        // (the V4 hyperconnection contract). CPU/Metal/CUDA use the identical
+        // flat write + transposed read; do not "fix" one side in isolation.
+        for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+            float row_max = -INFINITY;
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                const int idx = src_hc + dst_hc*n_hc;
+                const int off = 2*n_hc + idx;
+                const float v = mix[off] * comb_scale + base_data[off];
+                c[idx] = v;
+                row_max = std::max(row_max, v);
+            }
+
+            float row_sum = 0.0f;
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                const int idx = src_hc + dst_hc*n_hc;
+                const float v = expf(c[idx] - row_max);
+                c[idx] = v;
+                row_sum += v;
+            }
+
+            const float inv_sum = 1.0f / row_sum;
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                const int idx = src_hc + dst_hc*n_hc;
+                c[idx] = c[idx] * inv_sum + eps;
+            }
+        }
+
+        for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+            float sum = 0.0f;
+            for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                sum += c[src_hc + dst_hc*n_hc];
+            }
+
+            const float inv_denom = 1.0f / (sum + eps);
+            for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                c[src_hc + dst_hc*n_hc] *= inv_denom;
+            }
+        }
+
+        for (int iter = 1; iter < sinkhorn_iters; ++iter) {
+            for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                float sum = 0.0f;
+                for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                    sum += c[src_hc + dst_hc*n_hc];
+                }
+
+                const float inv_denom = 1.0f / (sum + eps);
+                for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                    c[src_hc + dst_hc*n_hc] *= inv_denom;
+                }
+            }
+
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                float sum = 0.0f;
+                for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                    sum += c[src_hc + dst_hc*n_hc];
+                }
+
+                const float inv_denom = 1.0f / (sum + eps);
+                for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                    c[src_hc + dst_hc*n_hc] *= inv_denom;
+                }
+            }
+        }
+
+        for (int i = 0; i < n_hc*n_hc; ++i) {
+            out[2*n_hc + i] = c[i];
+        }
+    }
+}
+
+// ggml_compute_forward_dsv4_hc_weighted_sum
+
+void ggml_compute_forward_dsv4_hc_weighted_sum(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * x       = dst->src[0];
+    const ggml_tensor * weights = dst->src[1];
+
+    GGML_ASSERT(x->type       == GGML_TYPE_F32);
+    GGML_ASSERT(weights->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type     == GGML_TYPE_F32);
+    GGML_ASSERT(x->ne[0]       == dst->ne[0]);
+    GGML_ASSERT(x->ne[1]       == weights->ne[0]);
+    GGML_ASSERT(x->ne[2]       == dst->ne[1]);
+    GGML_ASSERT(weights->ne[1] == dst->ne[1]);
+    GGML_ASSERT(x->ne[3]       == 1);
+    GGML_ASSERT(weights->ne[2] == 1);
+    GGML_ASSERT(weights->ne[3] == 1);
+    GGML_ASSERT(dst->ne[2]     == 1);
+    GGML_ASSERT(dst->ne[3]     == 1);
+
+    const int64_t n_embd   = dst->ne[0];
+    const int64_t n_hc     = x->ne[1];
+    const int64_t n_tokens = dst->ne[1];
+    const int64_t n_elem   = n_embd * n_tokens;
+
+    const int64_t i0 = (n_elem * params->ith) / params->nth;
+    const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth;
+
+    const char * x_data = (const char *) x->data;
+    const char * w_data = (const char *) weights->data;
+          char * y_data = (      char *) dst->data;
+
+    for (int64_t i = i0; i < i1; ++i) {
+        const int64_t d = i % n_embd;
+        const int64_t t = i / n_embd;
+
+        float acc = 0.0f;
+        for (int64_t h = 0; h < n_hc; ++h) {
+            const float xv = *(const float *) (x_data + d*x->nb[0] + h*x->nb[1] + t*x->nb[2]);
+            const float wv = *(const float *) (w_data + h*weights->nb[0] + t*weights->nb[1]);
+            acc += xv * wv;
+        }
+
+        *(float *) (y_data + d*dst->nb[0] + t*dst->nb[1]) = acc;
+    }
+}
+
+// ggml_compute_forward_dsv4_hc_expand
+
+void ggml_compute_forward_dsv4_hc_expand(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * block_out = dst->src[0];
+    const ggml_tensor * residual  = dst->src[1];
+    const ggml_tensor * post      = dst->src[2];
+    const ggml_tensor * comb      = dst->src[3];
+
+    GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+    GGML_ASSERT(residual->type  == GGML_TYPE_F32);
+    GGML_ASSERT(post->type      == GGML_TYPE_F32);
+    GGML_ASSERT(comb->type      == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type       == GGML_TYPE_F32);
+    GGML_ASSERT(block_out->ne[0] == dst->ne[0]);
+    GGML_ASSERT(block_out->ne[1] == dst->ne[2]);
+    GGML_ASSERT(residual->ne[0]  == dst->ne[0]);
+    GGML_ASSERT(residual->ne[1]  == dst->ne[1]);
+    GGML_ASSERT(residual->ne[2]  == dst->ne[2]);
+    GGML_ASSERT(post->ne[0]      == dst->ne[1]);
+    GGML_ASSERT(post->ne[1]      == dst->ne[2]);
+    GGML_ASSERT(comb->ne[0]      == dst->ne[1]);
+    GGML_ASSERT(comb->ne[1]      == dst->ne[1]);
+    GGML_ASSERT(comb->ne[2]      == dst->ne[2]);
+    GGML_ASSERT(block_out->ne[3] == 1);
+    GGML_ASSERT(residual->ne[3]  == 1);
+    GGML_ASSERT(post->ne[2]      == 1);
+    GGML_ASSERT(post->ne[3]      == 1);
+    GGML_ASSERT(comb->ne[3]      == 1);
+    GGML_ASSERT(dst->ne[3]       == 1);
+
+    const int64_t n_embd   = dst->ne[0];
+    const int64_t n_hc     = dst->ne[1];
+    const int64_t n_tokens = dst->ne[2];
+    const int64_t n_elem   = n_embd * n_hc * n_tokens;
+
+    const int64_t i0 = (n_elem * params->ith) / params->nth;
+    const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth;
+
+    const char * block_data = (const char *) block_out->data;
+    const char * res_data   = (const char *) residual->data;
+    const char * post_data  = (const char *) post->data;
+    const char * comb_data  = (const char *) comb->data;
+          char * dst_data   = (      char *) dst->data;
+
+    for (int64_t i = i0; i < i1; ++i) {
+        const int64_t d      = i % n_embd;
+        const int64_t tmp    = i / n_embd;
+        const int64_t dst_hc = tmp % n_hc;
+        const int64_t t      = tmp / n_hc;
+
+        const float block_v = *(const float *) (block_data + d*block_out->nb[0] + t*block_out->nb[1]);
+        const float post_v  = *(const float *) (post_data  + dst_hc*post->nb[0] + t*post->nb[1]);
+
+        float acc = block_v * post_v;
+        // comb arrives as comb[ne0=src_hc, ne1=dst_hc, ne2=t] from
+        // dsv4_hc_split_sinkhorn (flat write src_hc + dst_hc*n_hc). Reading
+        // ne0 as dst_hc and ne1 as src_hc here transposes it, giving
+        // (comb^T @ residual)[d, dst_hc] = sum_src_hc comb[src_hc,dst_hc,t]
+        // * residual[d, src_hc, t]. This transpose is deliberate and matches
+        // the Metal/CUDA expand kernels (validated 19/19 vs this CPU oracle).
+        for (int64_t src_hc = 0; src_hc < n_hc; ++src_hc) {
+            const float comb_v = *(const float *) (comb_data + dst_hc*comb->nb[0] + src_hc*comb->nb[1] + t*comb->nb[2]);
+            const float res_v  = *(const float *) (res_data  + d*residual->nb[0] + src_hc*residual->nb[1] + t*residual->nb[2]);
+            acc += comb_v * res_v;
+        }
+
+        *(float *) (dst_data + d*dst->nb[0] + dst_hc*dst->nb[1] + t*dst->nb[2]) = acc;
+    }
+}
+
+static float ggml_dsv4_e4m3fn_dequant(float x) {
+    const float sign = x < 0.0f ? -1.0f : 1.0f;
+    const float ax = std::min(std::fabs(x), 448.0f);
+
+    int best = 0;
+    float best_diff = ax;
+
+    for (int i = 1; i < 127; ++i) {
+        const int exp  = (i >> 3) & 0x0f;
+        const int mant = i & 0x07;
+        const float val = exp == 0
+            ? std::ldexp(float(mant), -9)
+            : std::ldexp(1.0f + float(mant) / 8.0f, exp - 7);
+        const float diff = std::fabs(ax - val);
+        if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) {
+            best = i;
+            best_diff = diff;
+        }
+    }
+
+    const int exp  = (best >> 3) & 0x0f;
+    const int mant = best & 0x07;
+    const float val = exp == 0
+        ? std::ldexp(float(mant), -9)
+        : std::ldexp(1.0f + float(mant) / 8.0f, exp - 7);
+
+    return sign * val;
+}
+
+void ggml_compute_forward_dsv4_fp8_kv_quantize(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int64_t n_rot = ggml_get_op_params_i32(dst, 0);
+    const int64_t head_dim = src0->ne[0];
+    const int64_t n_nope = head_dim - n_rot;
+
+    GGML_ASSERT(n_rot >= 0);
+    GGML_ASSERT(n_nope > 0);
+    GGML_ASSERT(n_nope % 64 == 0);
+
+    const int64_t n_rows = src0->ne[1] * src0->ne[2] * src0->ne[3];
+    const int64_t row_start = (n_rows * params->ith) / params->nth;
+    const int64_t row_end   = (n_rows * (params->ith + 1)) / params->nth;
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t i1 = row % src0->ne[1];
+        const int64_t i2 = (row / src0->ne[1]) % src0->ne[2];
+        const int64_t i3 = row / (src0->ne[1] * src0->ne[2]);
+
+        const char * src_base = (const char *) src0->data + i1*src0->nb[1] + i2*src0->nb[2] + i3*src0->nb[3];
+        char       * dst_base = (      char *) dst->data  + i1*dst->nb[1]  + i2*dst->nb[2]  + i3*dst->nb[3];
+
+        for (int64_t off = 0; off < n_nope; off += 64) {
+            float amax = 0.0f;
+            for (int64_t i = 0; i < 64; ++i) {
+                const float v = *(const float *) (src_base + (off + i)*src0->nb[0]);
+                amax = std::max(amax, std::fabs(v));
+            }
+
+            amax = std::max(amax, 1.0e-4f);
+            const float scale = std::ldexp(1.0f, int(std::ceil(std::log2(amax / 448.0f))));
+            for (int64_t i = 0; i < 64; ++i) {
+                const float v = *(const float *) (src_base + (off + i)*src0->nb[0]);
+                *(float *) (dst_base + (off + i)*dst->nb[0]) =
+                    ggml_dsv4_e4m3fn_dequant(std::clamp(v / scale, -448.0f, 448.0f)) * scale;
+            }
+        }
+
+        for (int64_t i = n_nope; i < head_dim; ++i) {
+            *(float *) (dst_base + i*dst->nb[0]) = *(const float *) (src_base + i*src0->nb[0]);
+        }
+    }
+}
+
 // ggml_compute_forward_map_custom1
 
 void ggml_compute_forward_map_custom1(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 7398e5618948..4da4db62aa4e 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -104,6 +104,11 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s
 void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_split_sinkhorn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_weighted_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_expand(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_fp8_kv_quantize(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_rope_tail(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cu b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cu
new file mode 100644
index 000000000000..d32049c97da0
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cu
@@ -0,0 +1,164 @@
+#include "dsv4-fp8-kv-quantize.cuh"
+
+#if !defined(__HIP_PLATFORM_AMD__) && __CUDA_ARCH__ >= 890
+#include <cuda_fp8.h>
+#endif
+
+#include <cstdint>
+
+// E4M3FN code value: 0..127.
+// Format: 1 sign + 4 exponent + 3 mantissa, bias 7, no inf/nan reserved.
+// (i >> 3) & 0xf = exponent, i & 7 = mantissa. Code 0 is +0.
+// Mirrors the CPU helper at ggml-cpu/ops.cpp:11245-11247 and the Metal
+// helper dsv4_e4m3fn_value at ggml-metal.metal:2302-2308.
+static __device__ __forceinline__ float dsv4_e4m3fn_value(int i) {
+    const int e = (i >> 3) & 0x0f;
+    const int m = i & 0x07;
+    return e == 0
+        ? float(m) * 0.001953125f                              // 2^-9 * m  (subnormal)
+        : (1.0f + float(m) * 0.125f) * exp2f(float(e - 7));    // normal
+}
+
+// Round |x| to the nearest E4M3FN positive code value, breaking ties
+// toward the EVEN code (matches CPU reference ops.cpp:11242-11253 exactly).
+// Returns the dequantized F32, sign-preserved.
+static __device__ __forceinline__ float dsv4_e4m3fn_dequant_sw(float x) {
+    const float sign = x < 0.0f ? -1.0f : 1.0f;
+    const float ax   = fminf(fabsf(x), 448.0f);
+
+    int   best      = 0;
+    float best_diff = ax;
+    #pragma unroll
+    for (int i = 1; i < 127; ++i) {
+        const float val  = dsv4_e4m3fn_value(i);
+        const float diff = fabsf(ax - val);
+        if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) {
+            best      = i;
+            best_diff = diff;
+        }
+    }
+    return sign * dsv4_e4m3fn_value(best);
+}
+
+// Dual-path E4M3FN quantize+dequantize round-trip with saturation.
+//
+// Native path uses NVIDIA's documented FP8 class API. The constructor
+// __nv_fp8_e4m3(float) applies round-to-nearest-even and saturates to
+// the finite E4M3 range (+/-448). The explicit float() conversion expands
+// the FP8 storage back to F32. This is the supported public API per
+// NVIDIA's cuda_fp8.h headers (CUDA toolkit >= 11.8).
+//
+// (We intentionally avoid the lower-level __nv_cvt_fp8_to_halfraw +
+// __half2float chain: the class wrapper is clearer and avoids a half
+// hop on F32-only data. There is no __nv_cvt_fp8_to_float intrinsic.)
+static __device__ __forceinline__ float dsv4_e4m3fn_roundtrip(float x) {
+#if !defined(__HIP_PLATFORM_AMD__) && __CUDA_ARCH__ >= 890
+    const __nv_fp8_e4m3 q(x);
+    return float(q);
+#else
+    // Software emulation: matches CPU reference bit-for-bit.
+    return dsv4_e4m3fn_dequant_sw(x);
+#endif
+}
+
+// Warp-level (32 threads) max-reduction via __shfl_xor_sync.
+static __device__ __forceinline__ float warp_reduce_max(float v) {
+    #pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+        v = fmaxf(v, __shfl_xor_sync(0xffffffffu, v, offset, 32));
+    }
+    return v;
+}
+
+// One block per row. blockDim.x == 64 (two warps).
+static __global__ void dsv4_fp8_kv_quantize_f32(
+        const char * __restrict__ src,
+        char       * __restrict__ dst,
+        const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
+        const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03,
+        const int64_t nb0,  const int64_t nb1,  const int64_t nb2,  const int64_t nb3,
+        const int     n_rot) {
+
+    const int64_t n_rows = ne01 * ne02 * ne03;
+    const int64_t row    = blockIdx.x;
+    if (row >= n_rows) return;
+
+    const int     tid     = threadIdx.x;        // 0..63
+    const int     warp_id = tid >> 5;           // 0 or 1
+    const int     lane    = tid & 31;
+
+    const int64_t i1 = row % ne01;
+    const int64_t i2 = (row / ne01) % ne02;
+    const int64_t i3 = row / (ne01 * ne02);
+
+    const char * src_base = src + i1*nb01 + i2*nb02 + i3*nb03;
+    char       * dst_base = dst + i1*nb1  + i2*nb2  + i3*nb3;
+
+    const int64_t n_nope = ne00 - (int64_t) n_rot;
+
+    // Shared-mem slot for the two warps' partial max.
+    __shared__ float warp_max[2];
+
+    // Prefix loop: 64-element blocks.
+    for (int64_t off = 0; off < n_nope; off += 64) {
+        const float v = *(const float *)(src_base + (off + tid) * nb00);
+
+        // Two-stage block-max reduction across 64 threads.
+        // Stage 1: each warp reduces its 32 lanes via shfl_xor; lane 0 stores
+        //          the warp's max to shared memory.
+        // Stage 2: a single thread (warp 0, lane 0) combines the two warp maxes
+        //          and writes the final block max back to warp_max[0].
+        float m = warp_reduce_max(fabsf(v));
+        if (lane == 0) warp_max[warp_id] = m;
+        __syncthreads();
+        if (warp_id == 0 && lane == 0) {
+            warp_max[0] = fmaxf(warp_max[0], warp_max[1]);
+        }
+        __syncthreads();
+
+        const float amax  = fmaxf(warp_max[0], 1.0e-4f);
+        const float scale = exp2f(ceilf(log2f(amax / 448.0f)));
+
+        const float q = dsv4_e4m3fn_roundtrip(fminf(fmaxf(v / scale, -448.0f), 448.0f)) * scale;
+        *(float *)(dst_base + (off + tid) * nb0) = q;
+
+        __syncthreads();  // protect warp_max for the next block
+    }
+
+    // Tail loop: copy n_rot elements per row through unchanged.
+    // 64 threads stride through the tail.
+    for (int64_t i = n_nope + tid; i < ne00; i += 64) {
+        *(float *)(dst_base + i * nb0) = *(const float *)(src_base + i * nb00);
+    }
+}
+
+void ggml_cuda_op_dsv4_fp8_kv_quantize(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src = dst->src[0];
+
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_are_same_shape(src, dst));
+
+    const int n_rot = ggml_get_op_params_i32(dst, 0);
+    const int64_t head_dim = src->ne[0];
+    const int64_t n_nope = head_dim - (int64_t) n_rot;
+
+    GGML_ASSERT(n_rot >= 0);
+    GGML_ASSERT(n_nope > 0);
+    GGML_ASSERT(n_nope % 64 == 0);
+
+    const int64_t n_rows = src->ne[1] * src->ne[2] * src->ne[3];
+
+    const dim3 grid((unsigned) n_rows, 1, 1);
+    const dim3 block(64, 1, 1);
+
+    cudaStream_t stream = ctx.stream();
+    dsv4_fp8_kv_quantize_f32<<<grid, block, 0, stream>>>(
+        (const char *) src->data,
+        (      char *) dst->data,
+        src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+        (int64_t) src->nb[0], (int64_t) src->nb[1], (int64_t) src->nb[2], (int64_t) src->nb[3],
+        (int64_t) dst->nb[0], (int64_t) dst->nb[1], (int64_t) dst->nb[2], (int64_t) dst->nb[3],
+        n_rot);
+    CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cuh b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cuh
new file mode 100644
index 000000000000..8e0fd958674d
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-fp8-kv-quantize.cuh
@@ -0,0 +1,36 @@
+#pragma once
+
+// V4 FP8 KV-cache simulation: quantizes/dequantizes the non-RoPE prefix
+// of each row in 64-element blocks through E4M3FN representation with
+// per-block scaling; leaves the RoPE tail unchanged.
+//
+// Block-scaled algorithm (must match CPU reference for the
+// test-backend-ops NMSE check):
+//   for each row (n_rows = ne01 * ne02 * ne03):
+//     for off in [0, n_nope) step 64:
+//       amax  = max(|src[off..off+64)|, 1e-4)
+//       scale = 2^ceil(log2(amax / 448))
+//       dst[off+i] = dequant_e4m3fn(clamp(src[off+i]/scale, -448, 448)) * scale
+//     copy src[n_nope..ne00) to dst unchanged   // RoPE tail
+//
+// References:
+//   CPU reference:        ggml/src/ggml-cpu/ops.cpp:11235-11313
+//   Metal kernel:         ggml/src/ggml-metal/ggml-metal.metal:2302-2376
+//   Metal dispatch:       ggml/src/ggml-metal/ggml-metal-ops.cpp:1550-1594
+//   Public API:           ggml/include/ggml.h:2591 (ggml_dsv4_fp8_kv_quantize)
+//
+// Dual-path implementation:
+//   - __CUDA_ARCH__ >= 890 (Ada/Hopper/Blackwell): native FP8 via the
+//     __nv_fp8_e4m3 class wrapper from <cuda_fp8.h> (round-to-nearest-even,
+//     saturate-to-finite to +/-448).
+//   - __CUDA_ARCH__ <  890 (Volta/Turing/Ampere): software emulation by
+//     nearest-even E4M3FN code search, mirroring the CPU reference.
+//
+// Both paths produce numerically equivalent output (subject to FP8's
+// inherent lossiness). The four test_dsv4_fp8_kv_quantize cases from
+// Stream A (tests/test-backend-ops.cpp:8868-8871) validate with
+// max_nmse_err = 1e-3.
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_fp8_kv_quantize(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-hc-expand.cu b/ggml/src/ggml-cuda/dsv4-hc-expand.cu
new file mode 100644
index 000000000000..8c4aab67f34d
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-expand.cu
@@ -0,0 +1,92 @@
+#include "dsv4-hc-expand.cuh"
+
+// out[i_embd, i_hc, i_tok] = post[i_hc, i_tok] * block_out[i_embd, i_tok]
+//                          + sum_{hc'} comb[i_hc, hc', i_tok] * residual[i_embd, hc', i_tok]
+//
+// block_out is 2D (no hc axis); the post*block_out term is broadcast across hc.
+// See ggml/src/ggml-cpu/ops.cpp:11218-11231 for the CPU reference loop body.
+static __global__ void dsv4_hc_expand_f32(
+        const float * __restrict__ block_out,
+        const float * __restrict__ residual,
+        const float * __restrict__ post,
+        const float * __restrict__ comb,
+        float       * __restrict__ dst,
+        const int n_embd, const int n_hc, const int n_tokens,
+        // block_out strides (2D -- no hc axis)
+        const int nb_b0, const int nb_b1,
+        // residual strides (3D)
+        const int nb_r0, const int nb_r1, const int nb_r2,
+        // post strides (2D)
+        const int nb_p0, const int nb_p1,
+        // comb strides (3D)
+        const int nb_c0, const int nb_c1, const int nb_c2,
+        // dst strides (3D)
+        const int nb0,   const int nb1,   const int nb2) {
+    const int64_t gid   = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
+    const int64_t total = (int64_t)n_embd * n_hc * n_tokens;
+    if (gid >= total) {
+        return;
+    }
+
+    const int i_embd = gid % n_embd;
+    const int rest   = gid / n_embd;
+    const int i_hc   = rest % n_hc;
+    const int i_tok  = rest / n_hc;
+
+    // post * block_out  (block_out is 2D: indexed by (i_embd, i_tok) only)
+    const float p = *(const float *)((const char *)post
+        + i_hc * nb_p0 + i_tok * nb_p1);
+    const float b = *(const float *)((const char *)block_out
+        + i_embd * nb_b0 + i_tok * nb_b1);
+    float acc = p * b;
+
+    // comb @ residual: sum over hc'
+    for (int hc_p = 0; hc_p < n_hc; ++hc_p) {
+        const float c = *(const float *)((const char *)comb
+            + i_hc * nb_c0 + hc_p * nb_c1 + i_tok * nb_c2);
+        const float r = *(const float *)((const char *)residual
+            + i_embd * nb_r0 + hc_p * nb_r1 + i_tok * nb_r2);
+        acc += c * r;
+    }
+
+    float * d = (float *)((char *)dst
+        + i_embd * nb0 + i_hc * nb1 + i_tok * nb2);
+    *d = acc;
+}
+
+void ggml_cuda_op_dsv4_hc_expand(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * block_out = dst->src[0];
+    const ggml_tensor * residual  = dst->src[1];
+    const ggml_tensor * post      = dst->src[2];
+    const ggml_tensor * comb      = dst->src[3];
+
+    GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+    GGML_ASSERT(residual->type  == GGML_TYPE_F32);
+    GGML_ASSERT(post->type      == GGML_TYPE_F32);
+    GGML_ASSERT(comb->type      == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type       == GGML_TYPE_F32);
+
+    const int n_embd   = (int) dst->ne[0];
+    const int n_hc     = (int) dst->ne[1];
+    const int n_tokens = (int) dst->ne[2];
+
+    const int64_t total = (int64_t)n_embd * n_hc * n_tokens;
+    constexpr int blk   = 256;
+    const dim3 grid((total + blk - 1) / blk);
+    const dim3 block(blk);
+
+    cudaStream_t stream = ctx.stream();
+    dsv4_hc_expand_f32<<<grid, block, 0, stream>>>(
+        (const float *) block_out->data,
+        (const float *) residual->data,
+        (const float *) post->data,
+        (const float *) comb->data,
+        (float *)       dst->data,
+        n_embd, n_hc, n_tokens,
+        (int) block_out->nb[0], (int) block_out->nb[1],
+        (int) residual->nb[0],  (int) residual->nb[1],  (int) residual->nb[2],
+        (int) post->nb[0],      (int) post->nb[1],
+        (int) comb->nb[0],      (int) comb->nb[1],      (int) comb->nb[2],
+        (int) dst->nb[0],       (int) dst->nb[1],       (int) dst->nb[2]);
+    CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-hc-expand.cuh b/ggml/src/ggml-cuda/dsv4-hc-expand.cuh
new file mode 100644
index 000000000000..28f1a0ee8c8c
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-expand.cuh
@@ -0,0 +1,26 @@
+#pragma once
+
+// V4 hyperconnection expand: per-token mix of block_out and residual.
+//
+// out[i, hc, tok] = post[hc, tok] * block_out[i, tok]
+//                 + sum_{hc'} comb[hc, hc', tok] * residual[i, hc', tok]
+//
+// Shapes:
+//   block_out: 2D {n_embd, n_tokens}            -- no hc axis
+//   residual:  3D {n_embd, n_hc,    n_tokens}
+//   post:      2D {n_hc,  n_tokens}
+//   comb:      3D {n_hc,  n_hc,     n_tokens}
+//   dst:       3D {n_embd, n_hc,    n_tokens}
+//
+// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2247-2276
+// CPU reference:          ggml/src/ggml-cpu/ops.cpp:11200+
+// Public API:             ggml/include/ggml.h:2581 (ggml_dsv4_hc_expand)
+// Shape constructor:      ggml/src/ggml.c:6363-6366
+//
+// Embarrassingly parallel: one thread per output element (i_embd, i_hc, i_tok).
+// Each thread does an n_hc-wide accumulation for the comb*residual term plus
+// one fused multiply-add for the post*block_out term.
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_hc_expand(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cu b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cu
new file mode 100644
index 000000000000..39d7833c9b0b
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cu
@@ -0,0 +1,198 @@
+#include "dsv4-hc-split-sinkhorn.cuh"
+
+// Maximum n_hc supported (matches CPU reference assert at ops.cpp:11014 and
+// the dst comb matrix scratch buffer size below).
+#define DSV4_HC_SINKHORN_MAX_N_HC 16
+
+// One block per row. Inside the block:
+//   - threads cooperate (parallel for) on the pre/post slices and the final
+//     copy of the comb matrix back to dst.
+//   - tid == 0 runs the n_hc x n_hc Sinkhorn iterations serially. n_hc <= 16
+//     so this is at most a few thousand FLOPs per row.
+//
+// The comb matrix lives in shared memory (sized for the worst case 16x16
+// = 256 floats = 1 KiB per block, well within any device's shared-mem
+// budget).
+static __global__ void dsv4_hc_split_sinkhorn_f32(
+        const float * __restrict__ mixes,
+        const float * __restrict__ scale,
+        const float * __restrict__ base,
+        float       * __restrict__ dst,
+        const int   n_hc,
+        const int   sinkhorn_iters,
+        const int   n_rows,
+        const int   mix_hc,
+        const int   nb01,    // input  row stride in bytes
+        const int   nb1,     // output row stride in bytes
+        const float eps) {
+    const int row = blockIdx.x;
+    if (row >= n_rows) {
+        return;
+    }
+
+    const int tid   = threadIdx.x;
+    const int blksz = blockDim.x;
+
+    const float pre_scale  = scale[0];
+    const float post_scale = scale[1];
+    const float comb_scale = scale[2];
+
+    const float * row_in  = (const float *) ((const char *) mixes + row * nb01);
+    float       * row_out = (float *)       ((char *)       dst   + row * nb1);
+
+    // ---------------- Section 1: pre slice ----------------
+    // out[i] = sigmoid(mix[i] * pre_scale + base[i]) + eps
+    for (int i = tid; i < n_hc; i += blksz) {
+        const float z = row_in[i] * pre_scale + base[i];
+        row_out[i] = 1.0f / (1.0f + expf(-z)) + eps;
+    }
+
+    // ---------------- Section 2: post slice ----------------
+    // out[n_hc + i] = 2 * sigmoid(mix[n_hc + i] * post_scale + base[n_hc + i])
+    for (int i = tid; i < n_hc; i += blksz) {
+        const int off = n_hc + i;
+        const float z = row_in[off] * post_scale + base[off];
+        row_out[off] = 2.0f / (1.0f + expf(-z));
+    }
+
+    // ---------------- Section 3: comb matrix Sinkhorn ----------------
+    //
+    // c[src_hc + dst_hc * n_hc] layout (matches CPU reference at
+    // ggml-cpu/ops.cpp:11055).
+    extern __shared__ float shmem[];
+    float * c = shmem;            // n_hc * n_hc floats
+
+    // Load the comb logits = mix * comb_scale + base (parallel over the block).
+    for (int i = tid; i < n_hc * n_hc; i += blksz) {
+        const int off = 2 * n_hc + i;
+        c[i] = row_in[off] * comb_scale + base[off];
+    }
+    __syncthreads();
+
+    // Sinkhorn iterations run on thread 0; n_hc <= 16 keeps the inner loops
+    // trivially cheap (~ 1k FLOPs per row total).
+    if (tid == 0) {
+        // First pass: per-dst_hc softmax (max-subtract for numerical stability,
+        // exp, normalize) + eps stabilizer.
+        for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+            float row_max = -INFINITY;
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                row_max = fmaxf(row_max, c[src_hc + dst_hc * n_hc]);
+            }
+
+            float row_sum = 0.0f;
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                const int idx = src_hc + dst_hc * n_hc;
+                const float v = expf(c[idx] - row_max);
+                c[idx] = v;
+                row_sum += v;
+            }
+
+            const float inv_sum = 1.0f / row_sum;
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                const int idx = src_hc + dst_hc * n_hc;
+                c[idx] = c[idx] * inv_sum + eps;
+            }
+        }
+
+        // First column-normalize: per src_hc, divide by (column sum + eps).
+        for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+            float sum = 0.0f;
+            for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                sum += c[src_hc + dst_hc * n_hc];
+            }
+            const float inv_denom = 1.0f / (sum + eps);
+            for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                c[src_hc + dst_hc * n_hc] *= inv_denom;
+            }
+        }
+
+        // Remaining sinkhorn_iters - 1 alternations: row-normalize then column-normalize.
+        for (int it = 1; it < sinkhorn_iters; ++it) {
+            // Row-normalize: per dst_hc, divide by (row sum + eps).
+            for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                float sum = 0.0f;
+                for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                    sum += c[src_hc + dst_hc * n_hc];
+                }
+                const float inv_denom = 1.0f / (sum + eps);
+                for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                    c[src_hc + dst_hc * n_hc] *= inv_denom;
+                }
+            }
+            // Column-normalize: per src_hc, divide by (column sum + eps).
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                float sum = 0.0f;
+                for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                    sum += c[src_hc + dst_hc * n_hc];
+                }
+                const float inv_denom = 1.0f / (sum + eps);
+                for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                    c[src_hc + dst_hc * n_hc] *= inv_denom;
+                }
+            }
+        }
+    }
+    __syncthreads();
+
+    // Copy the comb matrix back to dst (parallel over the block).
+    for (int i = tid; i < n_hc * n_hc; i += blksz) {
+        row_out[2 * n_hc + i] = c[i];
+    }
+
+    // Suppress unused-warning for mix_hc; it's covered by the host-side asserts.
+    (void) mix_hc;
+}
+
+void ggml_cuda_op_dsv4_hc_split_sinkhorn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * mixes = dst->src[0];
+    const ggml_tensor * scale = dst->src[1];
+    const ggml_tensor * base  = dst->src[2];
+
+    GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+    GGML_ASSERT(scale->type == GGML_TYPE_F32);
+    GGML_ASSERT(base->type  == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type   == GGML_TYPE_F32);
+    GGML_ASSERT(mixes->nb[0] == sizeof(float));
+    GGML_ASSERT(scale->nb[0] == sizeof(float));
+    GGML_ASSERT(base->nb[0]  == sizeof(float));
+    GGML_ASSERT(dst->nb[0]   == sizeof(float));
+
+    const int   n_hc           = ggml_get_op_params_i32(dst, 0);
+    const int   sinkhorn_iters = ggml_get_op_params_i32(dst, 1);
+    const float eps            = ggml_get_op_params_f32(dst, 2);
+
+    GGML_ASSERT(n_hc > 0 && n_hc <= DSV4_HC_SINKHORN_MAX_N_HC);
+    GGML_ASSERT(sinkhorn_iters > 0);
+
+    const int n_rows = (int) ggml_nrows(mixes);
+    const int mix_hc = (int) mixes->ne[0];
+    const int nb01   = (int) mixes->nb[1];
+    const int nb1    = (int) dst->nb[1];
+
+    GGML_ASSERT(mix_hc == (2 + n_hc) * n_hc);
+    GGML_ASSERT((int) ggml_nrows(dst) == n_rows);
+
+    // Block size MUST be a warp multiple (>= 32) so that the in-block
+    // __syncthreads() barriers are well-formed and any future warp-wide
+    // shuffle has a complete mask. With mix_hc in {24, 80} the natural
+    // size is rounded up to 32 or 96.
+    constexpr int CUDA_WARP_SIZE = 32;
+    constexpr int CUDA_MAX_BLOCK = 256;
+    const int rounded = ((mix_hc + CUDA_WARP_SIZE - 1) / CUDA_WARP_SIZE) * CUDA_WARP_SIZE;
+    const int threads_per_block = std::min(CUDA_MAX_BLOCK, std::max(CUDA_WARP_SIZE, rounded));
+
+    const dim3 grid(n_rows);
+    const dim3 block(threads_per_block);
+    const size_t shared = (size_t) n_hc * (size_t) n_hc * sizeof(float);
+
+    cudaStream_t stream = ctx.stream();
+    dsv4_hc_split_sinkhorn_f32<<<grid, block, shared, stream>>>(
+        (const float *) mixes->data,
+        (const float *) scale->data,
+        (const float *) base->data,
+        (float *)       dst->data,
+        n_hc, sinkhorn_iters, n_rows, mix_hc,
+        nb01, nb1, eps);
+    CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cuh b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cuh
new file mode 100644
index 000000000000..d77455298468
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-split-sinkhorn.cuh
@@ -0,0 +1,42 @@
+#pragma once
+
+// V4 hyperconnection splitter with Sinkhorn normalization.
+//
+// Splits the mix vector [mix_hc, n_rows] into three sections:
+//   - out[0:n_hc]       = sigmoid(mix[i] * scale[0] + base[i]) + eps     ("pre")
+//   - out[n_hc:2*n_hc]  = 2 * sigmoid(mix[off] * scale[1] + base[off])    ("post")
+//   - out[2*n_hc:]      = Sinkhorn-normalized n_hc x n_hc comb matrix     ("comb")
+//
+// The comb section starts as logits (mix * scale[2] + base), then a
+// per-dst_hc row softmax (max-subtract + exp + normalize) with `eps` added,
+// then alternating column / row normalizations for sinkhorn_iters - 1 more
+// iterations. The result is doubly-stochastic up to `eps`-stabilization.
+//
+// Expected shape:
+//   mixes  : [mix_hc, n_rows]    float32, contiguous along ne[0]
+//   scale  : [3]                 float32 (pre, post, comb scales)
+//   base   : [mix_hc]            float32, matches the mix layout
+//   dst    : [mix_hc, n_rows]    float32, same shape as mixes
+// where  mix_hc == (2 + n_hc) * n_hc  and  n_hc in [1, 16].
+//
+// Op params (i32, i32, f32): n_hc, sinkhorn_iters, eps.
+//
+// CUDA kernel design:
+//   - One CUDA block per output row.
+//   - Block size rounded up to a warp multiple (>= 32) so __syncthreads()
+//     and any future block-wide reductions are well-formed even when the
+//     natural row width (mix_hc = 24 or 80 for n_hc = 4 or 8) is not a
+//     warp multiple. Excess threads do no memory work; loops guard `i < n`.
+//   - Sections 1, 2, and the final copy parallelize across the block.
+//   - Section 3 (Sinkhorn iterations on the n_hc x n_hc comb matrix) is
+//     serialized on `tid == 0`; n_hc <= 16 makes this trivially cheap
+//     (O(n_hc^2 * sinkhorn_iters) per row) and avoids the complexity of
+//     warp-cooperative reductions over a 4-or-8-wide inner dimension.
+//
+// Reference Metal kernel:  ggml/src/ggml-metal/ggml-metal.metal:2076-2245
+// CPU reference:           ggml/src/ggml-cpu/ops.cpp:11037-11117
+// Public API:              ggml/include/ggml.h (ggml_dsv4_hc_split_sinkhorn)
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_hc_split_sinkhorn(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cu b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cu
new file mode 100644
index 000000000000..75f38d2aa7c3
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cu
@@ -0,0 +1,104 @@
+#include "dsv4-hc-weighted-sum.cuh"
+
+// CUDA port of kernel_dsv4_hc_weighted_sum (ggml-metal.metal:2278-2327).
+//
+// Layout (FP32 throughout):
+//   x       : {n_embd, n_hc,    n_tokens}
+//   weights : {n_hc,   n_tokens}
+//   dst     : {n_embd, n_tokens}
+// Output[d, t] = sum_{h=0..n_hc-1} x[d, h, t] * weights[h, t].
+//
+// One thread per output element. Total threads = n_embd * n_tokens.
+// Strides are passed in BYTES (matching ggml's nb[] convention); element
+// access is via `(const char *) base + d*nb0 + h*nb1 + t*nb2` reinterpret
+// as `const float *`, identical to the Metal kernel and CPU reference.
+
+static __global__ void dsv4_hc_weighted_sum_f32(
+        const char * __restrict__ x,
+        const char * __restrict__ weights,
+              char * __restrict__ dst,
+        const int      n_embd,
+        const int      n_hc,
+        const int      n_tokens,
+        const int64_t  nb_x0,
+        const int64_t  nb_x1,
+        const int64_t  nb_x2,
+        const int64_t  nb_w0,
+        const int64_t  nb_w1,
+        const int64_t  nb0,
+        const int64_t  nb1) {
+    const int64_t gid   = (int64_t) blockIdx.x * blockDim.x + threadIdx.x;
+    const int64_t total = (int64_t) n_embd * n_tokens;
+    if (gid >= total) {
+        return;
+    }
+
+    const int64_t d = gid % n_embd;
+    const int64_t t = gid / n_embd;
+
+    float acc = 0.0f;
+    for (int h = 0; h < n_hc; ++h) {
+        const float xv = *((const float *) (x       + d*nb_x0 + h*nb_x1 + t*nb_x2));
+        const float wv = *((const float *) (weights + h*nb_w0 + t*nb_w1));
+        acc += xv * wv;
+    }
+
+    *((float *) (dst + d*nb0 + t*nb1)) = acc;
+}
+
+void ggml_cuda_op_dsv4_hc_weighted_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * x       = dst->src[0];
+    const ggml_tensor * weights = dst->src[1];
+
+    GGML_ASSERT(x->type       == GGML_TYPE_F32);
+    GGML_ASSERT(weights->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type     == GGML_TYPE_F32);
+
+    // Shape contract: see ggml.c:6335-6339 and the CPU reference asserts at
+    // ggml-cpu/ops.cpp:11129-11140.
+    GGML_ASSERT(x->ne[0]       == dst->ne[0]);
+    GGML_ASSERT(x->ne[1]       == weights->ne[0]);
+    GGML_ASSERT(x->ne[2]       == dst->ne[1]);
+    GGML_ASSERT(weights->ne[1] == dst->ne[1]);
+    GGML_ASSERT(x->ne[3]       == 1);
+    GGML_ASSERT(weights->ne[2] == 1);
+    GGML_ASSERT(weights->ne[3] == 1);
+    GGML_ASSERT(dst->ne[2]     == 1);
+    GGML_ASSERT(dst->ne[3]     == 1);
+
+    const int n_embd   = (int) dst->ne[0];
+    const int n_hc     = (int) x->ne[1];
+    const int n_tokens = (int) dst->ne[1];
+
+    const int64_t nb_x0 = (int64_t) x->nb[0];
+    const int64_t nb_x1 = (int64_t) x->nb[1];
+    const int64_t nb_x2 = (int64_t) x->nb[2];
+    const int64_t nb_w0 = (int64_t) weights->nb[0];
+    const int64_t nb_w1 = (int64_t) weights->nb[1];
+    const int64_t nb0   = (int64_t) dst->nb[0];
+    const int64_t nb1   = (int64_t) dst->nb[1];
+
+    const int64_t total = (int64_t) n_embd * n_tokens;
+    if (total == 0) {
+        return;
+    }
+
+    constexpr int CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE = 256;
+    const dim3 block_dims(CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE, 1, 1);
+    const dim3 grid_dims((unsigned) ((total + CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE - 1) /
+                                     CUDA_DSV4_HC_WEIGHTED_SUM_BLOCK_SIZE),
+                         1, 1);
+
+    cudaStream_t stream = ctx.stream();
+
+    dsv4_hc_weighted_sum_f32<<<grid_dims, block_dims, 0, stream>>>(
+        (const char *) x->data,
+        (const char *) weights->data,
+              (char *) dst->data,
+        n_embd, n_hc, n_tokens,
+        nb_x0, nb_x1, nb_x2,
+        nb_w0, nb_w1,
+        nb0, nb1);
+
+    CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cuh b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cuh
new file mode 100644
index 000000000000..e6ee9c19a267
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-hc-weighted-sum.cuh
@@ -0,0 +1,25 @@
+#pragma once
+
+// V4 hyper-connection weighted-sum: collapses the hc dimension.
+//
+//   out[embd, token] = sum over hc of weights[hc, token] * x[embd, hc, token]
+//
+// Inputs (all GGML_TYPE_F32):
+//   dst->src[0] = x       shape {n_embd, n_hc,    n_tokens, 1}
+//   dst->src[1] = weights shape {n_hc,   n_tokens, 1,       1}
+// Output (GGML_TYPE_F32):
+//   dst                   shape {n_embd, n_tokens, 1,       1}
+//
+// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:2278-2327
+// Reference Metal dispatch: ggml/src/ggml-metal/ggml-metal-ops.cpp:1440-1486
+// CPU reference: ggml/src/ggml-cpu/ops.cpp:11121 (ggml_compute_forward_dsv4_hc_weighted_sum)
+// Public API: ggml/include/ggml.h:2574 (ggml_dsv4_hc_weighted_sum)
+//
+// Implementation: embarrassingly parallel; one thread per output element
+// (n_embd * n_tokens total), each thread loops over n_hc to accumulate.
+// Strides are kept in bytes (matching the Metal kernel + the ggml tensor
+// nb[] convention) and applied via (const char *) base + offset casts.
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_hc_weighted_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/dsv4-rope-tail.cu b/ggml/src/ggml-cuda/dsv4-rope-tail.cu
new file mode 100644
index 000000000000..abbf9b2181be
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-rope-tail.cu
@@ -0,0 +1,219 @@
+#include "dsv4-rope-tail.cuh"
+
+#include "ggml.h"      // ggml_rope_yarn_corr_dims, ggml_get_op_params_i32
+
+#include <algorithm>   // std::min / std::max in dispatch
+#include <cstring>     // memcpy
+
+// YaRN helper. Direct port of ggml/src/ggml-cuda/rope.cu:22-41
+// (template<bool forward> rope_yarn). Duplicated here to keep this
+// translation unit self-contained — rope.cuh does not currently expose
+// the function as a reusable device helper. The math is identical.
+
+struct dsv4_rope_corr_dims {
+    float v[2];
+};
+
+static __device__ __forceinline__ float dsv4_rope_yarn_ramp(
+        const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / max(0.001f, high - low);
+    return 1.0f - min(1.0f, max(0.0f, y));
+}
+
+// forward=true: standard rotation; forward=false: inverse (sin flipped).
+template<bool forward>
+static __device__ __forceinline__ void dsv4_rope_yarn(
+        const float theta_extrap, const float freq_scale,
+        const dsv4_rope_corr_dims corr_dims, const int i0,
+        const float ext_factor, float mscale,
+        float & cos_theta, float & sin_theta) {
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        const float ramp_mix = dsv4_rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1.0f - ramp_mix) + theta_extrap * ramp_mix;
+        mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale);
+    }
+    cos_theta = cosf(theta) * mscale;
+    sin_theta = sinf(theta) * mscale;
+    if (!forward) {
+        sin_theta = -sin_theta;
+    }
+}
+
+// Main kernel. Launch shape matches Metal:
+//   grid  = (ne01, ne02, ne03)
+//   block = (min(256, ne00), 1, 1)
+// Each thread walks the ne00 dim with stride ntg (== blockDim.x).
+// Translation of kernel_dsv4_rope_tail_f32 at
+// ggml/src/ggml-metal/ggml-metal.metal:4906-4997.
+static __global__ void dsv4_rope_tail_f32_kernel(
+        const float * __restrict__ src0,
+        const int   * __restrict__ pos,
+        const float * __restrict__ freq_factors,
+        float       * __restrict__ dst,
+        const int    ne00,
+        const int    nb00, const int nb01, const int nb02, const int nb03,
+        const int    nb0,  const int nb1,  const int nb2,  const int nb3,
+        const int    n_dims,
+        const float  freq_base, const float freq_scale,
+        const float  ext_factor, const float attn_factor,
+        const dsv4_rope_corr_dims corr_dims,
+        const bool   is_neox, const bool inverse) {
+    const int i1 = blockIdx.x;
+    const int i2 = blockIdx.y;
+    const int i3 = blockIdx.z;
+    const int tid = threadIdx.x;
+    const int ntg = blockDim.x;
+
+    const int n_nope = ne00 - n_dims;
+    if (n_nope < 0) {
+        return;
+    }
+
+    const float theta_base_pos = (float) pos[i2];
+    const float inv_ndims = -1.0f / (float) n_dims;
+
+    const char * src_base = (const char *) src0 + i3 * nb03 + i2 * nb02 + i1 * nb01;
+    char       * dst_base = (char *)       dst  + i3 * nb3  + i2 * nb2  + i1 * nb1;
+
+    for (int i0 = tid; i0 < ne00; i0 += ntg) {
+        // Pass-through prefix: non-RoPE portion of the row.
+        if (i0 < n_nope) {
+            *((float *) (dst_base + i0 * nb0)) = *((const float *) (src_base + i0 * nb00));
+            continue;
+        }
+
+        const int r = i0 - n_nope;
+
+        if (is_neox) {
+            const int n_half = n_dims / 2;
+            if (r >= n_half) {
+                continue;
+            }
+
+            const int ic = r;
+            const int rel_i0 = 2 * ic;
+            const float theta = theta_base_pos * powf(freq_base, inv_ndims * (float) rel_i0);
+            const float freq_factor = freq_factors ? freq_factors[ic] : 1.0f;
+
+            float cos_theta;
+            float sin_theta;
+            // Use forward=true; inverse handled as a sign flip below to match
+            // Metal's "if (args.inverse) sin_theta = -sin_theta" pattern.
+            dsv4_rope_yarn<true>(theta / freq_factor, freq_scale, corr_dims,
+                                 rel_i0, ext_factor, attn_factor,
+                                 cos_theta, sin_theta);
+            if (inverse) {
+                sin_theta = -sin_theta;
+            }
+
+            const int j0 = n_nope + ic;
+            const int j1 = n_nope + ic + n_half;
+            const float x0 = *((const float *) (src_base + j0 * nb00));
+            const float x1 = *((const float *) (src_base + j1 * nb00));
+            *((float *) (dst_base + j0 * nb0)) = x0 * cos_theta - x1 * sin_theta;
+            *((float *) (dst_base + j1 * nb0)) = x0 * sin_theta + x1 * cos_theta;
+        } else {
+            // NORMAL mode: rotate adjacent pair (j0, j0+1).
+            if ((r & 1) != 0) {
+                continue;
+            }
+
+            const int ic = r / 2;
+            const float theta = theta_base_pos * powf(freq_base, inv_ndims * (float) r);
+            const float freq_factor = freq_factors ? freq_factors[ic] : 1.0f;
+
+            float cos_theta;
+            float sin_theta;
+            dsv4_rope_yarn<true>(theta / freq_factor, freq_scale, corr_dims,
+                                 r, ext_factor, attn_factor,
+                                 cos_theta, sin_theta);
+            if (inverse) {
+                sin_theta = -sin_theta;
+            }
+
+            const int j0 = n_nope + r;
+            const int j1 = j0 + 1;
+            const float x0 = *((const float *) (src_base + j0 * nb00));
+            const float x1 = *((const float *) (src_base + j1 * nb00));
+            *((float *) (dst_base + j0 * nb0)) = x0 * cos_theta - x1 * sin_theta;
+            *((float *) (dst_base + j1 * nb0)) = x0 * sin_theta + x1 * cos_theta;
+        }
+    }
+}
+
+void ggml_cuda_op_dsv4_rope_tail(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * pos  = dst->src[1];
+    const ggml_tensor * ff   = dst->src[2];  // optional; may be NULL
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    GGML_ASSERT(pos->type  == GGML_TYPE_I32);
+
+    // op_params layout — matches Metal dispatch at
+    // ggml/src/ggml-metal/ggml-metal-ops.cpp:1606-1623 verbatim:
+    //   [0] = n_dims      (i32)
+    //   [1] = mode        (i32)
+    //   [2] = n_ctx_orig  (i32)
+    //   [3] = inverse     (i32, treated as bool)
+    //   [4] = freq_base   (f32)
+    //   [5] = freq_scale  (f32)
+    //   [6] = ext_factor  (f32)
+    //   [7] = attn_factor (f32)
+    //   [8] = beta_fast   (f32)
+    //   [9] = beta_slow   (f32)
+    const int32_t n_dims     = ggml_get_op_params_i32(dst, 0);
+    const int32_t mode       = ggml_get_op_params_i32(dst, 1);
+    const int32_t n_ctx_orig = ggml_get_op_params_i32(dst, 2);
+    const int32_t inverse_i  = ggml_get_op_params_i32(dst, 3);
+    const bool    inverse    = inverse_i != 0;
+
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+    memcpy(&freq_base,   (const int32_t *) dst->op_params + 4, sizeof(float));
+    memcpy(&freq_scale,  (const int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&ext_factor,  (const int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&attn_factor, (const int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&beta_fast,   (const int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_slow,   (const int32_t *) dst->op_params + 9, sizeof(float));
+
+    const bool is_neox = (mode == GGML_ROPE_TYPE_NEOX);
+
+    // Precompute YaRN corr_dims host-side (matches Metal call at
+    // ggml/src/ggml-metal/ggml-metal.metal:4927).
+    dsv4_rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v);
+
+    const int ne00 = (int) src0->ne[0];
+    const int ne01 = (int) src0->ne[1];
+    const int ne02 = (int) src0->ne[2];
+    const int ne03 = (int) src0->ne[3];
+
+    GGML_ASSERT(ne01 > 0 && ne02 > 0 && ne03 > 0);
+
+    const int nth = std::min(256, std::max(1, ne00));
+    const dim3 grid(ne01, ne02, ne03);
+    const dim3 block(nth, 1, 1);
+
+    cudaStream_t stream = ctx.stream();
+    dsv4_rope_tail_f32_kernel<<<grid, block, 0, stream>>>(
+        (const float *) src0->data,
+        (const int *)   pos->data,
+        ff ? (const float *) ff->data : nullptr,
+        (float *)       dst->data,
+        ne00,
+        (int) src0->nb[0], (int) src0->nb[1], (int) src0->nb[2], (int) src0->nb[3],
+        (int) dst->nb[0],  (int) dst->nb[1],  (int) dst->nb[2],  (int) dst->nb[3],
+        n_dims,
+        freq_base, freq_scale, ext_factor, attn_factor,
+        corr_dims,
+        is_neox, inverse);
+
+    CUDA_CHECK(cudaGetLastError());
+}
diff --git a/ggml/src/ggml-cuda/dsv4-rope-tail.cuh b/ggml/src/ggml-cuda/dsv4-rope-tail.cuh
new file mode 100644
index 000000000000..24c1e5dac4fd
--- /dev/null
+++ b/ggml/src/ggml-cuda/dsv4-rope-tail.cuh
@@ -0,0 +1,26 @@
+#pragma once
+
+// V4 partial-RoPE: applies RoPE rotation to the last n_dims elements of each
+// row, leaving the non-RoPE prefix (i.e. the first ne00 - n_dims elements)
+// unchanged. The rotation math is the same as ggml_rope_ext (with YaRN
+// extrapolation when ext_factor != 0), restricted to the tail.
+//
+// Reference Metal kernel: ggml/src/ggml-metal/ggml-metal.metal:4906-4997
+// CPU reference:          ggml/src/ggml-cpu/ops.cpp:5961
+// Public API:             ggml/include/ggml.h:2599 (ggml_dsv4_rope_tail)
+//
+// The dispatch function extracts op_params (i32 slots 0..3:
+// n_dims, mode, n_ctx_orig, inverse; f32 slots 4..9: freq_base, freq_scale,
+// ext_factor, attn_factor, beta_fast, beta_slow) from the destination
+// tensor, precomputes YaRN corr_dims host-side, and launches the kernel
+// with grid = (ne01, ne02, ne03), block.x = min(256, ne00), matching the
+// Metal dispatch at ggml/src/ggml-metal/ggml-metal-ops.cpp:1670.
+//
+// Supports the two RoPE modes the public V4 API allows
+// (ggml/src/ggml.c:6426 ASSERT mode == NORMAL || mode == NEOX). All
+// other modes are rejected via ggml_backend_cuda_device_supports_op so
+// the framework falls back to CPU rather than producing wrong output.
+
+#include "common.cuh"
+
+void ggml_cuda_op_dsv4_rope_tail(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index beeb52389464..a0feb032e215 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -391,6 +391,18 @@ static __device__ __forceinline__ void dequantize_V_q4_0(const void * __restrict
             ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
         }
     } else
+#else // FP16_AVAILABLE
+    // Software-FP16 fallback: compute in float, pack via __floats2half2_rn (RTNE).
+    // Required because the half-precision V-dequant template is instantiated <half, 4>
+    // for the full CC matrix (50/61/70/...), but FP16_AVAILABLE is undefined for CC < 600.
+    if constexpr (std::is_same_v<T, half>) {
+        const float d = __half2float(x[ib].d);
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = __floats2half2_rn(d * q8[l0 + 0], d * q8[l0 + 1]);
+        }
+    } else
 #endif // FP16_AVAILABLE
     if constexpr (std::is_same_v<T, float>) {
         const float d = x[ib].d;
@@ -431,6 +443,16 @@ static __device__ __forceinline__ void dequantize_V_q4_1(const void * __restrict
             ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
         }
     } else
+#else // FP16_AVAILABLE
+    // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale.
+    if constexpr (std::is_same_v<T, half>) {
+        const float2 dm = __half22float2(x[ib].dm);
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = __floats2half2_rn(dm.x * q8[l0 + 0] + dm.y, dm.x * q8[l0 + 1] + dm.y);
+        }
+    } else
 #endif // FP16_AVAILABLE
     if constexpr (std::is_same_v<T, float>) {
         const float2 dm = __half22float2(x[ib].dm);
@@ -481,6 +503,16 @@ static __device__ __forceinline__ void dequantize_V_q5_0(const void * __restrict
             ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]);
         }
     } else
+#else // FP16_AVAILABLE
+    // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale.
+    if constexpr (std::is_same_v<T, half>) {
+        const float d = __half2float(x[ib].d);
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = __floats2half2_rn(d * q8[l0 + 0], d * q8[l0 + 1]);
+        }
+    } else
 #endif // FP16_AVAILABLE
     if constexpr (std::is_same_v<T, float>) {
         const float d = x[ib].d;
@@ -531,6 +563,16 @@ static __device__ __forceinline__ void dequantize_V_q5_1(const void * __restrict
             ((half2 *) dst)[l0/2] = d * make_half2(q8[l0 + 0], q8[l0 + 1]) + m;
         }
     } else
+#else // FP16_AVAILABLE
+    // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale.
+    if constexpr (std::is_same_v<T, half>) {
+        const float2 dm = __half22float2(x[ib].dm);
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = __floats2half2_rn(dm.x * q8[l0 + 0] + dm.y, dm.x * q8[l0 + 1] + dm.y);
+        }
+    } else
 #endif // FP16_AVAILABLE
     if constexpr (std::is_same_v<T, float>) {
         const float2 dm = __half22float2(x[ib].dm);
@@ -564,6 +606,16 @@ static __device__ __forceinline__ void dequantize_V_q8_0(const void * __restrict
             ((half2 *) dst)[l0/2] = d * make_half2(qs[l0 + 0], qs[l0 + 1]);
         }
     } else
+#else // FP16_AVAILABLE
+    // Software-FP16 fallback: see dequantize_V_q4_0 for the rationale.
+    if constexpr (std::is_same<T, half>::value) {
+        const float d = __half2float(x[ib].d);
+
+#pragma unroll
+        for (int l0 = 0; l0 < ne; l0 += 2) {
+            ((half2 *) dst)[l0/2] = __floats2half2_rn(d * qs[l0 + 0], d * qs[l0 + 1]);
+        }
+    } else
 #endif // FP16_AVAILABLE
     if constexpr (std::is_same<T, float>::value) {
         const float d = x[ib].d;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index e25be3592fd4..e0fbceec7a0e 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -23,6 +23,11 @@
 #include "ggml-cuda/cumsum.cuh"
 #include "ggml-cuda/diagmask.cuh"
 #include "ggml-cuda/diag.cuh"
+#include "ggml-cuda/dsv4-fp8-kv-quantize.cuh"
+#include "ggml-cuda/dsv4-hc-expand.cuh"
+#include "ggml-cuda/dsv4-hc-split-sinkhorn.cuh"
+#include "ggml-cuda/dsv4-hc-weighted-sum.cuh"
+#include "ggml-cuda/dsv4-rope-tail.cuh"
 #include "ggml-cuda/fattn.cuh"
 #include "ggml-cuda/getrows.cuh"
 #include "ggml-cuda/im2col.cuh"
@@ -2777,7 +2782,52 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         nb1, nb2, nb3, stream);
 }
 
+// ---------- DSV4 debug logging (env-gated, set GGML_DSV4_DEBUG=1 to enable) ----------
+static bool dsv4_debug_enabled() {
+    static const bool enabled = (getenv("GGML_DSV4_DEBUG") != nullptr);
+    return enabled;
+}
+
+static const char * dsv4_op_short(enum ggml_op op) {
+    switch (op) {
+        case GGML_OP_DSV4_ROPE_TAIL:         return "DSV4_ROPE_TAIL";
+        case GGML_OP_DSV4_HC_SPLIT_SINKHORN: return "DSV4_HC_SPLIT_SINKHORN";
+        case GGML_OP_DSV4_HC_WEIGHTED_SUM:   return "DSV4_HC_WEIGHTED_SUM";
+        case GGML_OP_DSV4_HC_EXPAND:         return "DSV4_HC_EXPAND";
+        case GGML_OP_DSV4_FP8_KV_QUANTIZE:   return "DSV4_FP8_KV_QUANTIZE";
+        default:                              return nullptr;
+    }
+}
+
+static bool dsv4_op_is_v4(enum ggml_op op) {
+    return dsv4_op_short(op) != nullptr;
+}
+
+static void dsv4_log_op_entry(int device, const struct ggml_tensor * dst) {
+    if (!dsv4_debug_enabled() || !dsv4_op_is_v4(dst->op)) return;
+    fprintf(stderr, "[DSV4_DEBUG] dev=%d op=%s dst=%s(%s) shape=[%lld,%lld,%lld,%lld]\n",
+            device, dsv4_op_short(dst->op),
+            dst->name, ggml_type_name(dst->type),
+            (long long) dst->ne[0], (long long) dst->ne[1],
+            (long long) dst->ne[2], (long long) dst->ne[3]);
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
+        if (!dst->src[i]) continue;
+        const char * buft_name = "(null-buf)";
+        int is_split = 0;
+        if (dst->src[i]->buffer) {
+            buft_name = ggml_backend_buft_name(dst->src[i]->buffer->buft);
+            is_split  = ggml_backend_buft_is_cuda_split(dst->src[i]->buffer->buft) ? 1 : 0;
+        }
+        fprintf(stderr, "[DSV4_DEBUG]   src[%d]=%s(%s) buft=%s split=%d data=%p extra=%p\n",
+                i, dst->src[i]->name, ggml_type_name(dst->src[i]->type),
+                buft_name, is_split, dst->src[i]->data, (void *) dst->src[i]->extra);
+    }
+    fflush(stderr);
+}
+// ---------- end DSV4 debug ----------
+
 static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
+    dsv4_log_op_entry(ctx.device, dst);
     switch (dst->op) {
         case GGML_OP_ARGMAX:
             ggml_cuda_argmax(ctx, dst);
@@ -3020,6 +3070,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ROPE_BACK:
             ggml_cuda_op_rope_back(ctx, dst);
             break;
+        case GGML_OP_DSV4_ROPE_TAIL:
+            ggml_cuda_op_dsv4_rope_tail(ctx, dst);
+            break;
         case GGML_OP_ROLL:
             ggml_cuda_op_roll(ctx, dst);
             break;
@@ -3089,6 +3142,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_RWKV_WKV7:
             ggml_cuda_op_rwkv_wkv7(ctx, dst);
             break;
+        case GGML_OP_DSV4_HC_EXPAND:
+            ggml_cuda_op_dsv4_hc_expand(ctx, dst);
+            break;
+        case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+            ggml_cuda_op_dsv4_fp8_kv_quantize(ctx, dst);
+            break;
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
             ggml_cuda_cross_entropy_loss_back(ctx, dst);
             break;
@@ -3104,6 +3163,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_FILL:
             ggml_cuda_op_fill(ctx, dst);
             break;
+        case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+            ggml_cuda_op_dsv4_hc_split_sinkhorn(ctx, dst);
+            break;
+        case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+            ggml_cuda_op_dsv4_hc_weighted_sum(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -3208,7 +3273,30 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_
 #ifdef GGML_CUDA_NO_PEER_COPY
             return false;
 #else
+            if (dsv4_debug_enabled()) {
+                fprintf(stderr, "[DSV4_DEBUG] peer-copy: src_dev=%d dst_dev=%d bytes=%zu "
+                                "src=%s(%s,op=%s,buft=%s) dst=%s(%s,op=%s,buft=%s) src_ptr=%p dst_ptr=%p\n",
+                        cuda_ctx_src->device, cuda_ctx_dst->device, ggml_nbytes(dst),
+                        src->name, ggml_type_name(src->type), ggml_op_name(src->op),
+                        src->buffer ? ggml_backend_buft_name(src->buffer->buft) : "?",
+                        dst->name, ggml_type_name(dst->type), ggml_op_name(dst->op),
+                        dst->buffer ? ggml_backend_buft_name(dst->buffer->buft) : "?",
+                        src->data, dst->data);
+                fflush(stderr);
+                // Force any deferred CUDA error to surface BEFORE the next op, so the log line
+                // immediately above truly identifies the failing copy (codex review nit #1).
+                cudaError_t pre_err = cudaGetLastError();
+                if (pre_err != cudaSuccess) {
+                    fprintf(stderr, "[DSV4_DEBUG] pre-copy stale error: %s\n", cudaGetErrorString(pre_err));
+                    fflush(stderr);
+                }
+            }
             CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream()));
+            if (dsv4_debug_enabled()) {
+                // Synchronous wait to force the async error (if any) to surface at the offending copy,
+                // not at some later API call. Heavy perturbation — only with GGML_DSV4_DEBUG=1.
+                CUDA_CHECK(cudaStreamSynchronize(cuda_ctx_src->stream()));
+            }
 #endif // GGML_CUDA_NO_PEER_COPY
         }
 
@@ -5034,8 +5122,17 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
 static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
     ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
 
-    // split buffers can only be used with GGML_OP_MUL_MAT
-    if (op->op != GGML_OP_MUL_MAT) {
+    // split buffers can only be used with GGML_OP_MUL_MAT and DeepSeek V4 custom ops.
+    // Without the DSV4 exception, multi-GPU scheduler rejects the V4 ops once their
+    // weight tensors land in cuda_split buffers and falls back to CPU — which then
+    // corrupts data via host<->device transfer mismatches and crashes during decode.
+    // Reported and root-caused by @DenisVASI9 on an 8x A100 40GB rig.
+    if (op->op != GGML_OP_MUL_MAT &&
+        op->op != GGML_OP_DSV4_HC_SPLIT_SINKHORN &&
+        op->op != GGML_OP_DSV4_HC_WEIGHTED_SUM &&
+        op->op != GGML_OP_DSV4_HC_EXPAND &&
+        op->op != GGML_OP_DSV4_FP8_KV_QUANTIZE &&
+        op->op != GGML_OP_DSV4_ROPE_TAIL) {
         for (int i = 0; i < GGML_MAX_SRC; i++) {
             if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
                 return false;
@@ -5053,6 +5150,30 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         }
     }
 
+    // Some ops write through a pre-allocated destination buffer (e.g. SET_ROWS
+    // into a KV cache). For those, the dst lives on a specific device — dispatching
+    // the op on a different device causes the CUDA kernel to write through a
+    // foreign-device pointer (dst->data), surfacing as cudaErrorIllegalAddress.
+    //
+    // SET_ROWS returns a view tensor (ggml_view_tensor(ctx, a)) so op->buffer is
+    // nullptr. We must walk the view chain to find the real buffer.
+    // Diagnosed via CUDA_LAUNCH_BLOCKING=1 + GGML_DSV4_DEBUG=1 on @DenisVASI9's
+    // 8x A100 rig: V4's dsv4_store_cache_rows emits SET_ROWS at layer-7 K-cache
+    // (on CUDA1) while sched dispatched on CUDA0 → illegal access.
+    {
+        const ggml_tensor * t = op;
+        while (t->view_src) {
+            t = t->view_src;
+        }
+        if (t->buffer && ggml_backend_buft_is_cuda(t->buffer->buft)) {
+            ggml_backend_cuda_buffer_type_context * buft_ctx =
+                (ggml_backend_cuda_buffer_type_context *) t->buffer->buft->context;
+            if (buft_ctx->device != dev_ctx->device) {
+                return false;
+            }
+        }
+    }
+
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -5358,6 +5479,23 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_ROPE_BACK: {
             return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
         }
+        case GGML_OP_DSV4_ROPE_TAIL: {
+            // Only F32 in/out is supported on this kernel (matches Metal kargs).
+            if (op->src[0]->type != GGML_TYPE_F32 || op->type != GGML_TYPE_F32) {
+                return false;
+            }
+            // Kernel implements mode == NORMAL (0) and mode == NEOX (2);
+            // any other mode is rejected so the framework falls back to CPU
+            // rather than producing wrong output. ggml/src/ggml.c:6426 ASSERTs
+            // this constraint at op-construction time, but we re-check here
+            // for defense-in-depth.
+            const int32_t mode = ggml_get_op_params_i32(op, 1);
+            if (mode != GGML_ROPE_TYPE_NORMAL && mode != GGML_ROPE_TYPE_NEOX) {
+                return false;
+            }
+            // Same contiguity requirement as GGML_OP_ROPE.
+            return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
+        }
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_3D:
         case GGML_OP_CONV_2D:
@@ -5393,6 +5531,12 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_GATED_LINEAR_ATTN:
         case GGML_OP_RWKV_WKV7:
             return true;
+        case GGML_OP_DSV4_HC_EXPAND:
+            return op->type == GGML_TYPE_F32
+                && op->src[0]->type == GGML_TYPE_F32
+                && op->src[1]->type == GGML_TYPE_F32
+                && op->src[2]->type == GGML_TYPE_F32
+                && op->src[3]->type == GGML_TYPE_F32;
         case GGML_OP_GATED_DELTA_NET:
             //TODO: enable once MUSA compiler is solved https://github.com/ggml-org/llama.cpp/pull/19504#issuecomment-4018634327
 #ifdef GGML_USE_MUSA
@@ -5400,6 +5544,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
 #else
             return true;
 #endif // GGML_USE_MUSA
+        case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+            return op->type == GGML_TYPE_F32
+                && op->src[0]->type == GGML_TYPE_F32;
         case GGML_OP_FLASH_ATTN_EXT:
             return ggml_cuda_flash_attn_ext_supported(dev_ctx->device, op);
         case GGML_OP_CROSS_ENTROPY_LOSS:
@@ -5412,6 +5559,15 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_DIAG:
         case GGML_OP_SOLVE_TRI:
             return true;
+        case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                   op->src[1]->type == GGML_TYPE_F32 &&
+                   op->src[2]->type == GGML_TYPE_F32 &&
+                   op->type         == GGML_TYPE_F32;
+        case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+            return op->type         == GGML_TYPE_F32
+                && op->src[0]->type == GGML_TYPE_F32
+                && op->src[1]->type == GGML_TYPE_F32;
 
         default:
             return false;
diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp
index e288a27f992a..d6e5bf98e04f 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-device.cpp
@@ -459,6 +459,54 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max(ggml_me
     return res;
 }
 
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_split_sinkhorn(ggml_metal_library_t lib, const ggml_tensor * op) {
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+    const char * name = "kernel_dsv4_hc_split_sinkhorn";
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+    }
+
+    return res;
+}
+
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_weighted_sum(ggml_metal_library_t lib, const ggml_tensor * op) {
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+    const char * name = "kernel_dsv4_hc_weighted_sum";
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+    }
+
+    return res;
+}
+
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_expand(ggml_metal_library_t lib, const ggml_tensor * op) {
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[3]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+    const char * name = "kernel_dsv4_hc_expand";
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+    }
+
+    return res;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv(ggml_metal_library_t lib, const ggml_tensor * op) {
     GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
@@ -1429,6 +1477,36 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext(
     return res;
 }
 
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_fp8_kv_quantize(ggml_metal_library_t lib, const ggml_tensor * op) {
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+    const char * name = "kernel_dsv4_fp8_kv_quantize_f32";
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+    }
+
+    res.smem = 64*sizeof(float);
+
+    return res;
+}
+
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_rope_tail(ggml_metal_library_t lib, const ggml_tensor * op) {
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+    const char * name = "kernel_dsv4_rope_tail_f32";
+
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, name, name, nullptr);
+    }
+
+    return res;
+}
+
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_vec(
         ggml_metal_library_t lib,
         const ggml_tensor * op,
diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h
index 1f212a92f985..1e3a8485f922 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.h
+++ b/ggml/src/ggml-metal/ggml-metal-device.h
@@ -123,6 +123,11 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_bl
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_cumsum_add        (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_tri               (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_soft_max          (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_split_sinkhorn(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_weighted_sum(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_hc_expand    (ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_fp8_kv_quantize(ggml_metal_library_t lib, const struct ggml_tensor * op);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_dsv4_rope_tail    (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv          (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_conv_batched  (ggml_metal_library_t lib, const struct ggml_tensor * op, int ssm_conv_bs);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_ssm_scan          (ggml_metal_library_t lib, const struct ggml_tensor * op);
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m
index 780dfe81bb3c..f8b2e65c2a46 100644
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -1187,6 +1187,53 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
         case GGML_OP_SSM_CONV:
         case GGML_OP_SSM_SCAN:
             return has_simdgroup_reduction;
+        case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+            return ggml_is_contiguous_rows(op->src[0]) &&
+                ggml_is_contiguous(op->src[1]) &&
+                ggml_is_contiguous(op->src[2]) &&
+                op->src[0]->type == GGML_TYPE_F32 &&
+                op->src[1]->type == GGML_TYPE_F32 &&
+                op->src[2]->type == GGML_TYPE_F32 &&
+                op->type == GGML_TYPE_F32;
+        case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                op->src[1]->type == GGML_TYPE_F32 &&
+                op->type == GGML_TYPE_F32 &&
+                op->src[0]->ne[0] == op->ne[0] &&
+                op->src[0]->ne[1] == op->src[1]->ne[0] &&
+                op->src[0]->ne[2] == op->ne[1] &&
+                op->src[1]->ne[1] == op->ne[1];
+        case GGML_OP_DSV4_HC_EXPAND:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                op->src[1]->type == GGML_TYPE_F32 &&
+                op->src[2]->type == GGML_TYPE_F32 &&
+                op->src[3]->type == GGML_TYPE_F32 &&
+                op->type == GGML_TYPE_F32 &&
+                op->src[0]->ne[0] == op->ne[0] &&
+                op->src[0]->ne[1] == op->ne[2] &&
+                op->src[1]->ne[0] == op->ne[0] &&
+                op->src[1]->ne[1] == op->ne[1] &&
+                op->src[1]->ne[2] == op->ne[2] &&
+                op->src[2]->ne[0] == op->ne[1] &&
+                op->src[2]->ne[1] == op->ne[2] &&
+                op->src[3]->ne[0] == op->ne[1] &&
+                op->src[3]->ne[1] == op->ne[1] &&
+                op->src[3]->ne[2] == op->ne[2];
+        case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                op->type == GGML_TYPE_F32 &&
+                op->src[0]->ne[0] > ggml_get_op_params_i32(op, 0) &&
+                (op->src[0]->ne[0] - ggml_get_op_params_i32(op, 0)) % 64 == 0;
+        case GGML_OP_DSV4_ROPE_TAIL:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                op->src[1]->type == GGML_TYPE_I32 &&
+                op->type == GGML_TYPE_F32 &&
+                op->src[0]->ne[2] == op->src[1]->ne[0] &&
+                ggml_get_op_params_i32(op, 0) > 0 &&
+                ggml_get_op_params_i32(op, 0) <= op->src[0]->ne[0] &&
+                ggml_get_op_params_i32(op, 0) % 2 == 0 &&
+                (ggml_get_op_params_i32(op, 1) == GGML_ROPE_TYPE_NORMAL ||
+                 ggml_get_op_params_i32(op, 1) == GGML_ROPE_TYPE_NEOX);
         case GGML_OP_RWKV_WKV6:
         case GGML_OP_RWKV_WKV7:
             return true;
diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h
index ff74cafb5b79..48301eff4709 100644
--- a/ggml/src/ggml-metal/ggml-metal-impl.h
+++ b/ggml/src/ggml-metal/ggml-metal-impl.h
@@ -798,6 +798,90 @@ typedef struct {
     int32_t  n_head_log2;
 } ggml_metal_kargs_soft_max;
 
+typedef struct {
+    int32_t  n_hc;
+    int32_t  sinkhorn_iters;
+    int64_t  n_rows;
+    int64_t  mix_hc;
+    uint64_t nb01;
+    uint64_t nb1;
+    float    eps;
+} ggml_metal_kargs_dsv4_hc_split_sinkhorn;
+
+typedef struct {
+    int64_t  n_embd;
+    int64_t  n_hc;
+    int64_t  n_tokens;
+    uint64_t nb_x0;
+    uint64_t nb_x1;
+    uint64_t nb_x2;
+    uint64_t nb_w0;
+    uint64_t nb_w1;
+    uint64_t nb0;
+    uint64_t nb1;
+} ggml_metal_kargs_dsv4_hc_weighted_sum;
+
+typedef struct {
+    int64_t  n_embd;
+    int64_t  n_hc;
+    int64_t  n_tokens;
+    uint64_t nb_block0;
+    uint64_t nb_block1;
+    uint64_t nb_res0;
+    uint64_t nb_res1;
+    uint64_t nb_res2;
+    uint64_t nb_post0;
+    uint64_t nb_post1;
+    uint64_t nb_comb0;
+    uint64_t nb_comb1;
+    uint64_t nb_comb2;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+} ggml_metal_kargs_dsv4_hc_expand;
+
+typedef struct {
+    int64_t  ne00;
+    int64_t  ne01;
+    int64_t  ne02;
+    int64_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    int32_t  n_rot;
+} ggml_metal_kargs_dsv4_fp8_kv_quantize;
+
+typedef struct {
+    int64_t  ne00;
+    int64_t  ne01;
+    int64_t  ne02;
+    int64_t  ne03;
+    uint64_t nb00;
+    uint64_t nb01;
+    uint64_t nb02;
+    uint64_t nb03;
+    uint64_t nb0;
+    uint64_t nb1;
+    uint64_t nb2;
+    uint64_t nb3;
+    int32_t  n_dims;
+    int32_t  mode;
+    int32_t  n_ctx_orig;
+    int32_t  inverse;
+    float    freq_base;
+    float    freq_scale;
+    float    ext_factor;
+    float    attn_factor;
+    float    beta_fast;
+    float    beta_slow;
+    bool     src2;
+} ggml_metal_kargs_dsv4_rope_tail;
+
 typedef struct {
     int64_t  ne00;
     int64_t  ne01;
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index a114391c2e8c..b2f6ed37847d 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -320,6 +320,26 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
             {
                 n_fuse = ggml_metal_op_soft_max(ctx, idx);
             } break;
+        case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+            {
+                n_fuse = ggml_metal_op_dsv4_hc_split_sinkhorn(ctx, idx);
+            } break;
+        case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+            {
+                n_fuse = ggml_metal_op_dsv4_hc_weighted_sum(ctx, idx);
+            } break;
+        case GGML_OP_DSV4_HC_EXPAND:
+            {
+                n_fuse = ggml_metal_op_dsv4_hc_expand(ctx, idx);
+            } break;
+        case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+            {
+                n_fuse = ggml_metal_op_dsv4_fp8_kv_quantize(ctx, idx);
+            } break;
+        case GGML_OP_DSV4_ROPE_TAIL:
+            {
+                n_fuse = ggml_metal_op_dsv4_rope_tail(ctx, idx);
+            } break;
         case GGML_OP_SSM_CONV:
             {
                 n_fuse = ggml_metal_op_ssm_conv(ctx, idx);
@@ -1369,6 +1389,289 @@ int ggml_metal_op_soft_max(ggml_metal_op_t ctx, int idx) {
     return 1;
 }
 
+int ggml_metal_op_dsv4_hc_split_sinkhorn(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[0]->ne[2] == 1);
+    GGML_ASSERT(op->src[0]->ne[3] == 1);
+
+    const int32_t n_hc           = ggml_get_op_params_i32(op, 0);
+    const int32_t sinkhorn_iters = ggml_get_op_params_i32(op, 1);
+    const float eps              = ggml_get_op_params_f32(op, 2);
+
+    GGML_TENSOR_LOCALS(int64_t,  ne0, op->src[0], ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+
+    ggml_metal_kargs_dsv4_hc_split_sinkhorn args = {
+        /*.n_hc            =*/ n_hc,
+        /*.sinkhorn_iters  =*/ sinkhorn_iters,
+        /*.n_rows          =*/ ne01*ne02*ne03,
+        /*.mix_hc          =*/ ne00,
+        /*.nb01            =*/ nb01,
+        /*.nb1             =*/ nb1,
+        /*.eps             =*/ eps,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_dsv4_hc_split_sinkhorn(lib, op);
+
+    const int nth = std::min<int64_t>(256, std::max<int64_t>(1, args.n_rows));
+    const int n_tg = (args.n_rows + nth - 1) / nth;
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[2]), 3);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         4);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, nth, 1, 1);
+
+    return 1;
+}
+
+int ggml_metal_op_dsv4_hc_weighted_sum(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+    ggml_tensor * x       = op->src[0];
+    ggml_tensor * weights = op->src[1];
+
+    GGML_TENSOR_LOCALS(int64_t,  ne,   op,      ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb,   op,      nb);
+    GGML_TENSOR_LOCALS(uint64_t, nb_x, x,       nb);
+    GGML_TENSOR_LOCALS(uint64_t, nb_w, weights, nb);
+
+    ggml_metal_kargs_dsv4_hc_weighted_sum args = {
+        /*.n_embd   =*/ ne0,
+        /*.n_hc     =*/ x->ne[1],
+        /*.n_tokens =*/ ne1,
+        /*.nb_x0    =*/ nb_x0,
+        /*.nb_x1    =*/ nb_x1,
+        /*.nb_x2    =*/ nb_x2,
+        /*.nb_w0    =*/ nb_w0,
+        /*.nb_w1    =*/ nb_w1,
+        /*.nb0      =*/ nb0,
+        /*.nb1      =*/ nb1,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_dsv4_hc_weighted_sum(lib, op);
+
+    const int64_t n_elem = ne0*ne1;
+    const int nth = std::min<int64_t>(256, std::max<int64_t>(1, n_elem));
+    const int n_tg = (n_elem + nth - 1) / nth;
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(x),       1);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(weights), 2);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),      3);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, nth, 1, 1);
+
+    return 1;
+}
+
+int ggml_metal_op_dsv4_hc_expand(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[2]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[3]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+    ggml_tensor * block_out = op->src[0];
+    ggml_tensor * residual  = op->src[1];
+    ggml_tensor * post      = op->src[2];
+    ggml_tensor * comb      = op->src[3];
+
+    GGML_TENSOR_LOCALS(int64_t,  ne,       op,        ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb,       op,        nb);
+    GGML_TENSOR_LOCALS(uint64_t, nb_block, block_out, nb);
+    GGML_TENSOR_LOCALS(uint64_t, nb_res,   residual,  nb);
+    GGML_TENSOR_LOCALS(uint64_t, nb_post,  post,      nb);
+    GGML_TENSOR_LOCALS(uint64_t, nb_comb,  comb,      nb);
+
+    ggml_metal_kargs_dsv4_hc_expand args = {
+        /*.n_embd    =*/ ne0,
+        /*.n_hc      =*/ ne1,
+        /*.n_tokens  =*/ ne2,
+        /*.nb_block0 =*/ nb_block0,
+        /*.nb_block1 =*/ nb_block1,
+        /*.nb_res0   =*/ nb_res0,
+        /*.nb_res1   =*/ nb_res1,
+        /*.nb_res2   =*/ nb_res2,
+        /*.nb_post0  =*/ nb_post0,
+        /*.nb_post1  =*/ nb_post1,
+        /*.nb_comb0  =*/ nb_comb0,
+        /*.nb_comb1  =*/ nb_comb1,
+        /*.nb_comb2  =*/ nb_comb2,
+        /*.nb0       =*/ nb0,
+        /*.nb1       =*/ nb1,
+        /*.nb2       =*/ nb2,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_dsv4_hc_expand(lib, op);
+
+    const int64_t n_elem = ne0*ne1*ne2;
+    const int nth = std::min<int64_t>(256, std::max<int64_t>(1, n_elem));
+    const int n_tg = (n_elem + nth - 1) / nth;
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(block_out), 1);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(residual),  2);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(post),      3);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(comb),      4);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),        5);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, n_tg, 1, 1, nth, 1, 1);
+
+    return 1;
+}
+
+int ggml_metal_op_dsv4_fp8_kv_quantize(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+    const int32_t n_rot = ggml_get_op_params_i32(op, 0);
+
+    GGML_TENSOR_LOCALS(int64_t,  ne0, op->src[0], ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+
+    ggml_metal_kargs_dsv4_fp8_kv_quantize args = {
+        /*.ne00 =*/ ne00,
+        /*.ne01 =*/ ne01,
+        /*.ne02 =*/ ne02,
+        /*.ne03 =*/ ne03,
+        /*.nb00 =*/ nb00,
+        /*.nb01 =*/ nb01,
+        /*.nb02 =*/ nb02,
+        /*.nb03 =*/ nb03,
+        /*.nb0  =*/ nb0,
+        /*.nb1  =*/ nb1,
+        /*.nb2  =*/ nb2,
+        /*.nb3  =*/ nb3,
+        /*.n_rot =*/ n_rot,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_dsv4_fp8_kv_quantize(lib, op);
+
+    const int64_t n_rows = ne01*ne02*ne03;
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
+    ggml_metal_encoder_set_threadgroup_memory_size(enc, pipeline.smem, 0);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, n_rows, 1, 1, 64, 1, 1);
+
+    return 1;
+}
+
+int ggml_metal_op_dsv4_rope_tail(ggml_metal_op_t ctx, int idx) {
+    ggml_tensor * op = ctx->node(idx);
+
+    ggml_metal_library_t lib = ctx->lib;
+    ggml_metal_encoder_t enc = ctx->enc;
+
+    GGML_ASSERT(op->src[0]->type == GGML_TYPE_F32);
+    GGML_ASSERT(op->src[1]->type == GGML_TYPE_I32);
+    GGML_ASSERT(op->type == GGML_TYPE_F32);
+
+    const int32_t n_dims     = ggml_get_op_params_i32(op, 0);
+    const int32_t mode       = ggml_get_op_params_i32(op, 1);
+    const int32_t n_ctx_orig = ggml_get_op_params_i32(op, 2);
+    const int32_t inverse    = ggml_get_op_params_i32(op, 3);
+
+    float freq_base;
+    float freq_scale;
+    float ext_factor;
+    float attn_factor;
+    float beta_fast;
+    float beta_slow;
+
+    memcpy(&freq_base,   (const int32_t *) op->op_params + 4, sizeof(float));
+    memcpy(&freq_scale,  (const int32_t *) op->op_params + 5, sizeof(float));
+    memcpy(&ext_factor,  (const int32_t *) op->op_params + 6, sizeof(float));
+    memcpy(&attn_factor, (const int32_t *) op->op_params + 7, sizeof(float));
+    memcpy(&beta_fast,   (const int32_t *) op->op_params + 8, sizeof(float));
+    memcpy(&beta_slow,   (const int32_t *) op->op_params + 9, sizeof(float));
+
+    GGML_TENSOR_LOCALS(int64_t,  ne0, op->src[0], ne);
+    GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb);
+    GGML_TENSOR_LOCALS(uint64_t, nb,  op,         nb);
+
+    ggml_metal_kargs_dsv4_rope_tail args = {
+        /*.ne00        =*/ ne00,
+        /*.ne01        =*/ ne01,
+        /*.ne02        =*/ ne02,
+        /*.ne03        =*/ ne03,
+        /*.nb00        =*/ nb00,
+        /*.nb01        =*/ nb01,
+        /*.nb02        =*/ nb02,
+        /*.nb03        =*/ nb03,
+        /*.nb0         =*/ nb0,
+        /*.nb1         =*/ nb1,
+        /*.nb2         =*/ nb2,
+        /*.nb3         =*/ nb3,
+        /*.n_dims      =*/ n_dims,
+        /*.mode        =*/ mode,
+        /*.n_ctx_orig  =*/ n_ctx_orig,
+        /*.inverse     =*/ inverse,
+        /*.freq_base   =*/ freq_base,
+        /*.freq_scale  =*/ freq_scale,
+        /*.ext_factor  =*/ ext_factor,
+        /*.attn_factor =*/ attn_factor,
+        /*.beta_fast   =*/ beta_fast,
+        /*.beta_slow   =*/ beta_slow,
+        /*.src2        =*/ op->src[2] != nullptr,
+    };
+
+    auto pipeline = ggml_metal_library_get_pipeline_dsv4_rope_tail(lib, op);
+
+    const int nth = std::min<int64_t>(256, std::max<int64_t>(1, ne00));
+
+    ggml_metal_encoder_set_pipeline(enc, pipeline);
+    ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[1]), 2);
+    if (op->src[2]) {
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[2]), 3);
+    } else {
+        ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 3);
+    }
+    ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         4);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1);
+
+    return 1;
+}
+
 int ggml_metal_op_ssm_conv(ggml_metal_op_t ctx, int idx) {
     ggml_tensor * op = ctx->node(idx);
 
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h
index 36c61071b4fa..e402a450e619 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.h
+++ b/ggml/src/ggml-metal/ggml-metal-ops.h
@@ -55,6 +55,11 @@ int ggml_metal_op_get_rows          (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_set_rows          (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_diag              (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_soft_max          (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_hc_split_sinkhorn(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_hc_weighted_sum(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_hc_expand    (ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_fp8_kv_quantize(ggml_metal_op_t ctx, int idx);
+int ggml_metal_op_dsv4_rope_tail    (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_ssm_conv          (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_ssm_scan          (ggml_metal_op_t ctx, int idx);
 int ggml_metal_op_rwkv              (ggml_metal_op_t ctx, int idx);
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal
index f6ffb2b3a1c6..37cabfcdb2bb 100644
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -2073,6 +2073,308 @@ template [[host_name("kernel_soft_max_f32")]]   kernel kernel_soft_max_t   kerne
 template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<half4>;
 template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4<float4>;
 
+kernel void kernel_dsv4_hc_split_sinkhorn(
+        constant ggml_metal_kargs_dsv4_hc_split_sinkhorn & args,
+        device  const float * mixes,
+        device  const float * scale,
+        device  const float * base,
+        device        float * dst,
+        uint tid [[thread_position_in_grid]]) {
+    if ((int64_t) tid >= args.n_rows) {
+        return;
+    }
+
+    constexpr int HC_MAX = 16;
+    const int HC = args.n_hc;
+    if (HC <= 0 || HC > HC_MAX) {
+        return;
+    }
+
+    device const float * mix = mixes + ((int64_t) tid)*args.mix_hc;
+    device       float * out = dst    + ((int64_t) tid)*args.mix_hc;
+
+    const float epsv       = args.eps;
+    const float pre_scale  = scale[0];
+    const float post_scale = scale[1];
+    const float comb_scale = scale[2];
+
+    if (HC == 4) {
+        const float4 pre_z =
+            *((device const float4 *) mix) * pre_scale +
+            *((device const float4 *) base);
+        *((device float4 *) out) = 1.0f / (1.0f + exp(-pre_z)) + epsv;
+
+        const float4 post_z =
+            *((device const float4 *) (mix  + 4)) * post_scale +
+            *((device const float4 *) (base + 4));
+        *((device float4 *) (out + 4)) = 2.0f / (1.0f + exp(-post_z));
+
+        float4 r0 =
+            *((device const float4 *) (mix  +  8)) * comb_scale +
+            *((device const float4 *) (base +  8));
+        float4 r1 =
+            *((device const float4 *) (mix  + 12)) * comb_scale +
+            *((device const float4 *) (base + 12));
+        float4 r2 =
+            *((device const float4 *) (mix  + 16)) * comb_scale +
+            *((device const float4 *) (base + 16));
+        float4 r3 =
+            *((device const float4 *) (mix  + 20)) * comb_scale +
+            *((device const float4 *) (base + 20));
+
+        const float m0 = max(max(r0.x, r0.y), max(r0.z, r0.w));
+        const float m1 = max(max(r1.x, r1.y), max(r1.z, r1.w));
+        const float m2 = max(max(r2.x, r2.y), max(r2.z, r2.w));
+        const float m3 = max(max(r3.x, r3.y), max(r3.z, r3.w));
+
+        r0 = exp(r0 - m0);
+        r1 = exp(r1 - m1);
+        r2 = exp(r2 - m2);
+        r3 = exp(r3 - m3);
+
+        r0 = r0 * (1.0f / (r0.x + r0.y + r0.z + r0.w)) + epsv;
+        r1 = r1 * (1.0f / (r1.x + r1.y + r1.z + r1.w)) + epsv;
+        r2 = r2 * (1.0f / (r2.x + r2.y + r2.z + r2.w)) + epsv;
+        r3 = r3 * (1.0f / (r3.x + r3.y + r3.z + r3.w)) + epsv;
+
+        float4 col_inv = 1.0f / (r0 + r1 + r2 + r3 + epsv);
+        r0 *= col_inv;
+        r1 *= col_inv;
+        r2 *= col_inv;
+        r3 *= col_inv;
+
+        for (int iter = 1; iter < args.sinkhorn_iters; ++iter) {
+            r0 *= 1.0f / (r0.x + r0.y + r0.z + r0.w + epsv);
+            r1 *= 1.0f / (r1.x + r1.y + r1.z + r1.w + epsv);
+            r2 *= 1.0f / (r2.x + r2.y + r2.z + r2.w + epsv);
+            r3 *= 1.0f / (r3.x + r3.y + r3.z + r3.w + epsv);
+
+            col_inv = 1.0f / (r0 + r1 + r2 + r3 + epsv);
+            r0 *= col_inv;
+            r1 *= col_inv;
+            r2 *= col_inv;
+            r3 *= col_inv;
+        }
+
+        *((device float4 *) (out +  8)) = r0;
+        *((device float4 *) (out + 12)) = r1;
+        *((device float4 *) (out + 16)) = r2;
+        *((device float4 *) (out + 20)) = r3;
+        return;
+    }
+
+    for (int i = 0; i < HC; ++i) {
+        const float z = mix[i] * pre_scale + base[i];
+        out[i] = 1.0f / (1.0f + exp(-z)) + epsv;
+    }
+
+    for (int i = 0; i < HC; ++i) {
+        const int off = HC + i;
+        const float z = mix[off] * post_scale + base[off];
+        out[off] = 2.0f / (1.0f + exp(-z));
+    }
+
+    float c[HC_MAX*HC_MAX];
+
+    for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+        float row_max = -INFINITY;
+        for (int src_hc = 0; src_hc < HC; ++src_hc) {
+            const int idx = src_hc + dst_hc*HC;
+            const int off = 2*HC + idx;
+            const float v = mix[off] * comb_scale + base[off];
+            c[idx] = v;
+            row_max = max(row_max, v);
+        }
+
+        float row_sum = 0.0f;
+        for (int src_hc = 0; src_hc < HC; ++src_hc) {
+            const int idx = src_hc + dst_hc*HC;
+            const float v = exp(c[idx] - row_max);
+            c[idx] = v;
+            row_sum += v;
+        }
+
+        const float inv_sum = 1.0f / row_sum;
+        for (int src_hc = 0; src_hc < HC; ++src_hc) {
+            const int idx = src_hc + dst_hc*HC;
+            c[idx] = c[idx] * inv_sum + epsv;
+        }
+    }
+
+    for (int src_hc = 0; src_hc < HC; ++src_hc) {
+        float sum = 0.0f;
+        for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+            sum += c[src_hc + dst_hc*HC];
+        }
+
+        const float inv_denom = 1.0f / (sum + epsv);
+        for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+            c[src_hc + dst_hc*HC] *= inv_denom;
+        }
+    }
+
+    for (int iter = 1; iter < args.sinkhorn_iters; ++iter) {
+        for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+            float sum = 0.0f;
+            for (int src_hc = 0; src_hc < HC; ++src_hc) {
+                sum += c[src_hc + dst_hc*HC];
+            }
+
+            const float inv_denom = 1.0f / (sum + epsv);
+            for (int src_hc = 0; src_hc < HC; ++src_hc) {
+                c[src_hc + dst_hc*HC] *= inv_denom;
+            }
+        }
+
+        for (int src_hc = 0; src_hc < HC; ++src_hc) {
+            float sum = 0.0f;
+            for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+                sum += c[src_hc + dst_hc*HC];
+            }
+
+            const float inv_denom = 1.0f / (sum + epsv);
+            for (int dst_hc = 0; dst_hc < HC; ++dst_hc) {
+                c[src_hc + dst_hc*HC] *= inv_denom;
+            }
+        }
+    }
+
+    for (int i = 0; i < HC*HC; ++i) {
+        out[2*HC + i] = c[i];
+    }
+}
+
+kernel void kernel_dsv4_hc_expand(
+        constant ggml_metal_kargs_dsv4_hc_expand & args,
+        device  const char * block_out,
+        device  const char * residual,
+        device  const char * post,
+        device  const char * comb,
+        device        char * dst,
+        uint gid [[thread_position_in_grid]]) {
+    const int64_t n_elem = args.n_embd * args.n_hc * args.n_tokens;
+    if ((int64_t) gid >= n_elem) {
+        return;
+    }
+
+    const int64_t d      = ((int64_t) gid) % args.n_embd;
+    const int64_t tmp    = ((int64_t) gid) / args.n_embd;
+    const int64_t dst_hc = tmp % args.n_hc;
+    const int64_t t      = tmp / args.n_hc;
+
+    const float block_v = *((device const float *) (block_out + d*args.nb_block0 + t*args.nb_block1));
+    const float post_v  = *((device const float *) (post      + dst_hc*args.nb_post0 + t*args.nb_post1));
+
+    float acc = block_v * post_v;
+    for (int64_t src_hc = 0; src_hc < args.n_hc; ++src_hc) {
+        const float comb_v = *((device const float *) (comb     + dst_hc*args.nb_comb0 + src_hc*args.nb_comb1 + t*args.nb_comb2));
+        const float res_v  = *((device const float *) (residual + d*args.nb_res0 + src_hc*args.nb_res1 + t*args.nb_res2));
+        acc += comb_v * res_v;
+    }
+
+    *((device float *) (dst + d*args.nb0 + dst_hc*args.nb1 + t*args.nb2)) = acc;
+}
+
+kernel void kernel_dsv4_hc_weighted_sum(
+        constant ggml_metal_kargs_dsv4_hc_weighted_sum & args,
+        device  const char * x,
+        device  const char * weights,
+        device        char * dst,
+        uint gid [[thread_position_in_grid]]) {
+    const int64_t n_elem = args.n_embd * args.n_tokens;
+    if ((int64_t) gid >= n_elem) {
+        return;
+    }
+
+    const int64_t d = ((int64_t) gid) % args.n_embd;
+    const int64_t t = ((int64_t) gid) / args.n_embd;
+
+    float acc = 0.0f;
+    for (int64_t h = 0; h < args.n_hc; ++h) {
+        const float xv = *((device const float *) (x       + d*args.nb_x0 + h*args.nb_x1 + t*args.nb_x2));
+        const float wv = *((device const float *) (weights + h*args.nb_w0 + t*args.nb_w1));
+        acc += xv * wv;
+    }
+
+    *((device float *) (dst + d*args.nb0 + t*args.nb1)) = acc;
+}
+
+static inline float dsv4_e4m3fn_value(int i) {
+    const int exp  = (i >> 3) & 0x0f;
+    const int mant = i & 0x07;
+    return exp == 0
+        ? float(mant) * 0.001953125f
+        : (1.0f + float(mant) * 0.125f) * exp2(float(exp - 7));
+}
+
+static inline float dsv4_e4m3fn_dequant(float x) {
+    const float sign = x < 0.0f ? -1.0f : 1.0f;
+    const float ax = min(abs(x), 448.0f);
+
+    int best = 0;
+    float best_diff = ax;
+    for (int i = 1; i < 127; ++i) {
+        const float val = dsv4_e4m3fn_value(i);
+        const float diff = abs(ax - val);
+        if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) {
+            best = i;
+            best_diff = diff;
+        }
+    }
+
+    return sign * dsv4_e4m3fn_value(best);
+}
+
+kernel void kernel_dsv4_fp8_kv_quantize_f32(
+        constant ggml_metal_kargs_dsv4_fp8_kv_quantize & args,
+        device  const char * src0,
+        device        char * dst,
+        threadgroup  float * scratch [[threadgroup(0)]],
+        uint row [[threadgroup_position_in_grid]],
+        uint tid [[thread_position_in_threadgroup]]) {
+    const int64_t n_rows = args.ne01 * args.ne02 * args.ne03;
+    if ((int64_t) row >= n_rows) {
+        return;
+    }
+
+    const int64_t i1 = row % args.ne01;
+    const int64_t i2 = (row / args.ne01) % args.ne02;
+    const int64_t i3 = row / (args.ne01 * args.ne02);
+
+    device const char * src_base = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03;
+    device       char * dst_base = dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3;
+
+    const int64_t n_nope = args.ne00 - args.n_rot;
+
+    for (int64_t off = 0; off < n_nope; off += 64) {
+        float v = 0.0f;
+        if (tid < 64) {
+            v = *((device const float *) (src_base + (off + tid)*args.nb00));
+            scratch[tid] = abs(v);
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+
+        for (uint stride = 32; stride > 0; stride >>= 1) {
+            if (tid < stride) {
+                scratch[tid] = max(scratch[tid], scratch[tid + stride]);
+            }
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+        }
+
+        const float amax = max(scratch[0], 1.0e-4f);
+        const float scale = exp2(ceil(log2(amax / 448.0f)));
+        if (tid < 64) {
+            const float q = dsv4_e4m3fn_dequant(clamp(v / scale, -448.0f, 448.0f)) * scale;
+            *((device float *) (dst_base + (off + tid)*args.nb0)) = q;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+
+    for (int64_t i = n_nope + tid; i < args.ne00; i += 64) {
+        *((device float *) (dst_base + i*args.nb0)) = *((device const float *) (src_base + i*args.nb00));
+    }
+}
+
 // ref: ggml.c:ggml_compute_forward_ssm_conv_f32
 kernel void kernel_ssm_conv_f32_f32(
         constant ggml_metal_kargs_ssm_conv & args,
@@ -4632,6 +4934,95 @@ template [[host_name("kernel_rope_multi_f16")]] kernel kernel_rope_multi_t kerne
 template [[host_name("kernel_rope_vision_f32")]] kernel kernel_rope_vision_t kernel_rope_vision<float>;
 template [[host_name("kernel_rope_vision_f16")]] kernel kernel_rope_vision_t kernel_rope_vision<half>;
 
+kernel void kernel_dsv4_rope_tail_f32(
+        constant ggml_metal_kargs_dsv4_rope_tail & args,
+        device const char * src0,
+        device const char * src1,
+        device const char * src2,
+        device       char * dst,
+        uint  tid   [[thread_index_in_threadgroup]],
+        ushort3 ntg [[threads_per_threadgroup]],
+        uint3 tgpig [[threadgroup_position_in_grid]]) {
+    const int i1 = tgpig[0];
+    const int i2 = tgpig[1];
+    const int i3 = tgpig[2];
+
+    const int n_nope = args.ne00 - args.n_dims;
+    if (n_nope < 0) {
+        return;
+    }
+
+    device const int32_t * pos = (device const int32_t *) src1;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(args.n_dims, args.n_ctx_orig, args.freq_base, args.beta_fast, args.beta_slow, corr_dims);
+
+    const float theta_base = (float) pos[i2];
+    const float inv_ndims = -1.f/args.n_dims;
+    const bool is_neox = args.mode == 2;
+
+    for (int i0 = tid; i0 < args.ne00; i0 += ntg.x) {
+        device const char * src_base = src0 + i3*args.nb03 + i2*args.nb02 + i1*args.nb01;
+        device       char * dst_base = dst  + i3*args.nb3  + i2*args.nb2  + i1*args.nb1;
+
+        if (i0 < n_nope) {
+            *((device float *) (dst_base + i0*args.nb0)) = *((device const float *) (src_base + i0*args.nb00));
+            continue;
+        }
+
+        const int r = i0 - n_nope;
+        if (is_neox) {
+            const int n_half = args.n_dims/2;
+            if (r >= n_half) {
+                continue;
+            }
+
+            const int ic = r;
+            const int rel_i0 = 2*ic;
+            const float theta = theta_base * pow(args.freq_base, inv_ndims*rel_i0);
+            const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
+
+            float cos_theta;
+            float sin_theta;
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, rel_i0, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+            if (args.inverse) {
+                sin_theta = -sin_theta;
+            }
+
+            const int j0 = n_nope + ic;
+            const int j1 = n_nope + ic + n_half;
+            const float x0 = *((device const float *) (src_base + j0*args.nb00));
+            const float x1 = *((device const float *) (src_base + j1*args.nb00));
+
+            *((device float *) (dst_base + j0*args.nb0)) = x0*cos_theta - x1*sin_theta;
+            *((device float *) (dst_base + j1*args.nb0)) = x0*sin_theta + x1*cos_theta;
+        } else {
+            if ((r & 1) != 0) {
+                continue;
+            }
+
+            const int ic = r/2;
+            const float theta = theta_base * pow(args.freq_base, inv_ndims*r);
+            const float freq_factor = args.src2 ? ((device const float *) src2)[ic] : 1.0f;
+
+            float cos_theta;
+            float sin_theta;
+            rope_yarn(theta/freq_factor, args.freq_scale, corr_dims, r, args.ext_factor, args.attn_factor, &cos_theta, &sin_theta);
+            if (args.inverse) {
+                sin_theta = -sin_theta;
+            }
+
+            const int j0 = n_nope + r;
+            const int j1 = j0 + 1;
+            const float x0 = *((device const float *) (src_base + j0*args.nb00));
+            const float x1 = *((device const float *) (src_base + j1*args.nb00));
+
+            *((device float *) (dst_base + j0*args.nb0)) = x0*cos_theta - x1*sin_theta;
+            *((device float *) (dst_base + j1*args.nb0)) = x0*sin_theta + x1*cos_theta;
+        }
+    }
+}
+
 typedef void (im2col_t)(
         constant ggml_metal_kargs_im2col & args,
         device const float * x,
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 476c30797956..8b06c0bd5a49 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1063,6 +1063,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "RWKV_WKV7",
     "SOLVE_TRI",
     "GATED_DELTA_NET",
+    "DSV4_HC_SPLIT_SINKHORN",
+    "DSV4_HC_WEIGHTED_SUM",
+    "DSV4_HC_EXPAND",
+    "DSV4_FP8_KV_QUANTIZE",
+    "DSV4_ROPE_TAIL",
 
     "UNARY",
 
@@ -1080,7 +1085,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1173,6 +1178,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rwkv_wkv7(r, w, k, v, a, b, s)",
     "A X = B, A triangular, solve X",
     "gated_delta_net(q, k, v, g, beta, s)",
+    "dsv4_hc_split_sinkhorn(x)",
+    "dsv4_hc_weighted_sum(x)",
+    "dsv4_hc_expand(x)",
+    "dsv4_fp8_kv_quantize(x)",
+    "dsv4_rope_tail(x)",
 
     "unary(x)",
 
@@ -1190,7 +1200,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -6230,6 +6240,180 @@ struct ggml_tensor * ggml_gated_delta_net(
     return result;
 }
 
+// ggml_dsv4_hc_split_sinkhorn
+
+struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * mixes,
+        struct ggml_tensor  * scale,
+        struct ggml_tensor  * base,
+        int                   n_hc,
+        int                   sinkhorn_iters,
+        float                 eps) {
+    GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+    GGML_ASSERT(scale->type == GGML_TYPE_F32);
+    GGML_ASSERT(base->type  == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous_rows(mixes));
+    GGML_ASSERT(ggml_is_contiguous(scale));
+    GGML_ASSERT(ggml_is_contiguous(base));
+
+    GGML_ASSERT(n_hc > 0);
+    GGML_ASSERT(n_hc <= 16);  // CPU forward uses a fixed float c[16*16] scratch
+    GGML_ASSERT(sinkhorn_iters > 0);
+    GGML_ASSERT(mixes->ne[0] == (2 + n_hc) * n_hc);
+    GGML_ASSERT(mixes->ne[2] == 1);
+    GGML_ASSERT(mixes->ne[3] == 1);
+    GGML_ASSERT(ggml_nelements(scale) >= 3);
+    GGML_ASSERT(ggml_nelements(base)  >= mixes->ne[0]);
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, mixes);
+
+    ggml_set_op_params_i32(result, 0, n_hc);
+    ggml_set_op_params_i32(result, 1, sinkhorn_iters);
+    ggml_set_op_params_f32(result, 2, eps);
+
+    result->op     = GGML_OP_DSV4_HC_SPLIT_SINKHORN;
+    result->src[0] = mixes;
+    result->src[1] = scale;
+    result->src[2] = base;
+
+    return result;
+}
+
+// ggml_dsv4_hc_weighted_sum
+
+struct ggml_tensor * ggml_dsv4_hc_weighted_sum(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * weights) {
+    GGML_ASSERT(x->type       == GGML_TYPE_F32);
+    GGML_ASSERT(weights->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(x->ne[1] == weights->ne[0]);
+    GGML_ASSERT(x->ne[2] == weights->ne[1]);
+    GGML_ASSERT(x->ne[3] == 1);
+    GGML_ASSERT(weights->ne[2] == 1);
+    GGML_ASSERT(weights->ne[3] == 1);
+
+    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, x->ne[0], x->ne[2]);
+
+    result->op     = GGML_OP_DSV4_HC_WEIGHTED_SUM;
+    result->src[0] = x;
+    result->src[1] = weights;
+
+    return result;
+}
+
+// ggml_dsv4_hc_expand
+
+struct ggml_tensor * ggml_dsv4_hc_expand(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * block_out,
+        struct ggml_tensor  * residual,
+        struct ggml_tensor  * post,
+        struct ggml_tensor  * comb) {
+    GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+    GGML_ASSERT(residual->type  == GGML_TYPE_F32);
+    GGML_ASSERT(post->type      == GGML_TYPE_F32);
+    GGML_ASSERT(comb->type      == GGML_TYPE_F32);
+
+    GGML_ASSERT(block_out->ne[0] == residual->ne[0]);
+    GGML_ASSERT(block_out->ne[1] == residual->ne[2]);
+    GGML_ASSERT(block_out->ne[2] == 1);
+    GGML_ASSERT(block_out->ne[3] == 1);
+    GGML_ASSERT(post->ne[0] == residual->ne[1]);
+    GGML_ASSERT(post->ne[1] == residual->ne[2]);
+    GGML_ASSERT(post->ne[2] == 1);
+    GGML_ASSERT(post->ne[3] == 1);
+    GGML_ASSERT(comb->ne[0] == residual->ne[1]);
+    GGML_ASSERT(comb->ne[1] == residual->ne[1]);
+    GGML_ASSERT(comb->ne[2] == residual->ne[2]);
+    GGML_ASSERT(comb->ne[3] == 1);
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, residual);
+
+    result->op     = GGML_OP_DSV4_HC_EXPAND;
+    result->src[0] = block_out;
+    result->src[1] = residual;
+    result->src[2] = post;
+    result->src[3] = comb;
+
+    return result;
+}
+
+// ggml_dsv4_fp8_kv_quantize
+
+struct ggml_tensor * ggml_dsv4_fp8_kv_quantize(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_rot) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(n_rot >= 0);
+    GGML_ASSERT(a->ne[0] > n_rot);
+    GGML_ASSERT((a->ne[0] - n_rot) % 64 == 0);
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_i32(result, 0, n_rot);
+
+    result->op     = GGML_OP_DSV4_FP8_KV_QUANTIZE;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_dsv4_rope_tail
+
+struct ggml_tensor * ggml_dsv4_rope_tail(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pos,
+        struct ggml_tensor  * freq_factors,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
+        bool                  inverse) {
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+    GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX);
+    GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
+    GGML_ASSERT(pos->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_vector(pos));
+    GGML_ASSERT(a->ne[2] == pos->ne[0]);
+    GGML_ASSERT(n_dims > 0);
+    GGML_ASSERT(n_dims <= a->ne[0]);
+    GGML_ASSERT(n_dims % 2 == 0);
+
+    if (freq_factors) {
+        GGML_ASSERT(freq_factors->type == GGML_TYPE_F32);
+        GGML_ASSERT(freq_factors->ne[0] >= n_dims / 2);
+    }
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    int32_t params[16] = { n_dims, mode, n_ctx_orig, inverse ? 1 : 0 };
+    memcpy(params +  4, &freq_base,   sizeof(float));
+    memcpy(params +  5, &freq_scale,  sizeof(float));
+    memcpy(params +  6, &ext_factor,  sizeof(float));
+    memcpy(params +  7, &attn_factor, sizeof(float));
+    memcpy(params +  8, &beta_fast,   sizeof(float));
+    memcpy(params +  9, &beta_slow,   sizeof(float));
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_DSV4_ROPE_TAIL;
+    result->src[0] = a;
+    result->src[1] = pos;
+    result->src[2] = freq_factors;
+
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_hash_set ggml_hash_set_new(size_t size) {
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c25f217f990e..8f44c7965e87 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -145,6 +145,10 @@ class LLM:
         INTERLEAVE_MOE_LAYER_STEP         = "{arch}.interleave_moe_layer_step"
         FULL_ATTENTION_INTERVAL           = "{arch}.full_attention_interval"
         ACTIVATION_SPARSITY_SCALE         = "{arch}.activation_sparsity_scale"
+        HASH_LAYER_COUNT                  = "{arch}.hash_layer_count"
+        HYPER_CONNECTION_COUNT            = "{arch}.hyper_connection.count"
+        HYPER_CONNECTION_SINKHORN_ITERS   = "{arch}.hyper_connection.sinkhorn_iterations"
+        HYPER_CONNECTION_EPS              = "{arch}.hyper_connection.epsilon"
         ALTUP_ACTIVE_IDX                  = "{arch}.altup.active_idx"
         ALTUP_NUM_INPUTS                  = "{arch}.altup.num_inputs"
         EMBD_LENGTH_PER_LAYER_INP         = "{arch}.embedding_length_per_layer_input"
@@ -184,6 +188,10 @@ class Attention:
         SHARED_KV_LAYERS             = "{arch}.attention.shared_kv_layers"
         SLIDING_WINDOW_PATTERN       = "{arch}.attention.sliding_window_pattern"
         TEMPERATURE_SCALE            = "{arch}.attention.temperature_scale"
+        COMPRESS_RATIOS              = "{arch}.attention.compress_ratios"
+        COMPRESS_ROPE_FREQ_BASE      = "{arch}.attention.compress_rope_freq_base"
+        OUTPUT_LORA_RANK             = "{arch}.attention.output_lora_rank"
+        OUTPUT_GROUP_COUNT           = "{arch}.attention.output_group_count"
 
         class Indexer:
             HEAD_COUNT = "{arch}.attention.indexer.head_count"
@@ -451,6 +459,7 @@ class MODEL_ARCH(IntEnum):
     DEEPSEEK         = auto()
     DEEPSEEK2        = auto()
     DEEPSEEK2OCR     = auto()
+    DEEPSEEK4        = auto()
     CHATGLM          = auto()
     GLM4             = auto()
     GLM4_MOE         = auto()
@@ -527,6 +536,9 @@ class MODEL_TENSOR(IntEnum):
     TOKEN_TYPES          = auto()
     POS_EMBD             = auto()
     OUTPUT               = auto()
+    OUTPUT_HC_BASE       = auto() # deepseek4 hyper-connection output
+    OUTPUT_HC_FN         = auto() # deepseek4 hyper-connection output
+    OUTPUT_HC_SCALE      = auto() # deepseek4 hyper-connection output
     DENSE_2_OUT          = auto() # embeddinggemma 2_Dense
     DENSE_3_OUT          = auto() # embeddinggemma 3_Dense
     OUTPUT_NORM          = auto()
@@ -650,12 +662,19 @@ class MODEL_TENSOR(IntEnum):
     CHANNEL_MIX_VALUE    = auto()
     ATTN_Q_A             = auto()
     ATTN_Q_B             = auto()
+    ATTN_KV              = auto() # deepseek4 single-tensor combined KV projection
     ATTN_KV_A_MQA        = auto()
     ATTN_KV_B            = auto()
     ATTN_K_B             = auto()
     ATTN_V_B             = auto()
+    ATTN_OUT_A           = auto() # deepseek4 attention output LoRA
+    ATTN_OUT_B           = auto() # deepseek4 attention output LoRA
     ATTN_Q_A_NORM        = auto()
     ATTN_KV_A_NORM       = auto()
+    ATTN_COMPRESSOR_APE  = auto() # deepseek4 attention compressor
+    ATTN_COMPRESSOR_KV   = auto() # deepseek4 attention compressor
+    ATTN_COMPRESSOR_GATE = auto() # deepseek4 attention compressor
+    ATTN_COMPRESSOR_NORM = auto() # deepseek4 attention compressor
     FFN_SUB_NORM         = auto()
     ATTN_SUB_NORM        = auto()
     DEC_ATTN_NORM        = auto()
@@ -717,6 +736,17 @@ class MODEL_TENSOR(IntEnum):
     INDEXER_PROJ         = auto()
     INDEXER_ATTN_K       = auto()
     INDEXER_ATTN_Q_B     = auto()
+    INDEXER_COMPRESSOR_APE  = auto() # deepseek4 indexer compressor
+    INDEXER_COMPRESSOR_KV   = auto() # deepseek4 indexer compressor
+    INDEXER_COMPRESSOR_GATE = auto() # deepseek4 indexer compressor
+    INDEXER_COMPRESSOR_NORM = auto() # deepseek4 indexer compressor
+    HC_ATTN_BASE         = auto() # deepseek4 hyper-connection attention
+    HC_ATTN_FN           = auto() # deepseek4 hyper-connection attention
+    HC_ATTN_SCALE        = auto() # deepseek4 hyper-connection attention
+    HC_FFN_BASE          = auto() # deepseek4 hyper-connection ffn
+    HC_FFN_FN            = auto() # deepseek4 hyper-connection ffn
+    HC_FFN_SCALE         = auto() # deepseek4 hyper-connection ffn
+    FFN_GATE_TID2EID     = auto() # deepseek4 token-id-to-expert-id gating
     # vision
     V_MMPROJ             = auto()
     V_MMPROJ_FC          = auto()
@@ -966,6 +996,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.DEEPSEEK:         "deepseek",
     MODEL_ARCH.DEEPSEEK2:        "deepseek2",
     MODEL_ARCH.DEEPSEEK2OCR:     "deepseek2-ocr",
+    MODEL_ARCH.DEEPSEEK4:        "deepseek4",
     MODEL_ARCH.CHATGLM:          "chatglm",
     MODEL_ARCH.GLM4:             "glm4",
     MODEL_ARCH.GLM4_MOE:         "glm4moe",
@@ -1042,6 +1073,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.POS_EMBD:                  "position_embd",
     MODEL_TENSOR.OUTPUT_NORM:               "output_norm",
     MODEL_TENSOR.OUTPUT:                    "output",
+    MODEL_TENSOR.OUTPUT_HC_BASE:            "output_hc_base",
+    MODEL_TENSOR.OUTPUT_HC_FN:              "output_hc_fn",
+    MODEL_TENSOR.OUTPUT_HC_SCALE:           "output_hc_scale",
     MODEL_TENSOR.DENSE_2_OUT:                "dense_2", # embeddinggemma 2_Dense
     MODEL_TENSOR.DENSE_3_OUT:                "dense_3", # embeddinggemma 2_Dense
     MODEL_TENSOR.ROPE_FREQS:                "rope_freqs",
@@ -1164,12 +1198,19 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.CHANNEL_MIX_VALUE:         "blk.{bid}.channel_mix_value",
     MODEL_TENSOR.ATTN_Q_A:                  "blk.{bid}.attn_q_a",
     MODEL_TENSOR.ATTN_Q_B:                  "blk.{bid}.attn_q_b",
+    MODEL_TENSOR.ATTN_KV:                   "blk.{bid}.attn_kv",
     MODEL_TENSOR.ATTN_KV_A_MQA:             "blk.{bid}.attn_kv_a_mqa",
     MODEL_TENSOR.ATTN_KV_B:                 "blk.{bid}.attn_kv_b",
     MODEL_TENSOR.ATTN_K_B:                  "blk.{bid}.attn_k_b",
     MODEL_TENSOR.ATTN_V_B:                  "blk.{bid}.attn_v_b",
+    MODEL_TENSOR.ATTN_OUT_A:                "blk.{bid}.attn_output_a",
+    MODEL_TENSOR.ATTN_OUT_B:                "blk.{bid}.attn_output_b",
     MODEL_TENSOR.ATTN_Q_A_NORM:             "blk.{bid}.attn_q_a_norm",
     MODEL_TENSOR.ATTN_KV_A_NORM:            "blk.{bid}.attn_kv_a_norm",
+    MODEL_TENSOR.ATTN_COMPRESSOR_APE:       "blk.{bid}.attn_compressor_ape",
+    MODEL_TENSOR.ATTN_COMPRESSOR_KV:        "blk.{bid}.attn_compressor_kv",
+    MODEL_TENSOR.ATTN_COMPRESSOR_GATE:      "blk.{bid}.attn_compressor_gate",
+    MODEL_TENSOR.ATTN_COMPRESSOR_NORM:      "blk.{bid}.attn_compressor_norm",
     MODEL_TENSOR.ATTN_SUB_NORM:             "blk.{bid}.attn_sub_norm",
     MODEL_TENSOR.FFN_SUB_NORM:              "blk.{bid}.ffn_sub_norm",
     MODEL_TENSOR.DEC_ATTN_NORM:             "dec.blk.{bid}.attn_norm",
@@ -1231,6 +1272,17 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.INDEXER_PROJ:              "blk.{bid}.indexer.proj",
     MODEL_TENSOR.INDEXER_ATTN_K:            "blk.{bid}.indexer.attn_k",
     MODEL_TENSOR.INDEXER_ATTN_Q_B:          "blk.{bid}.indexer.attn_q_b",
+    MODEL_TENSOR.INDEXER_COMPRESSOR_APE:    "blk.{bid}.indexer_compressor_ape",
+    MODEL_TENSOR.INDEXER_COMPRESSOR_KV:     "blk.{bid}.indexer_compressor_kv",
+    MODEL_TENSOR.INDEXER_COMPRESSOR_GATE:   "blk.{bid}.indexer_compressor_gate",
+    MODEL_TENSOR.INDEXER_COMPRESSOR_NORM:   "blk.{bid}.indexer_compressor_norm",
+    MODEL_TENSOR.HC_ATTN_BASE:              "blk.{bid}.hc_attn_base",
+    MODEL_TENSOR.HC_ATTN_FN:                "blk.{bid}.hc_attn_fn",
+    MODEL_TENSOR.HC_ATTN_SCALE:             "blk.{bid}.hc_attn_scale",
+    MODEL_TENSOR.HC_FFN_BASE:               "blk.{bid}.hc_ffn_base",
+    MODEL_TENSOR.HC_FFN_FN:                 "blk.{bid}.hc_ffn_fn",
+    MODEL_TENSOR.HC_FFN_SCALE:              "blk.{bid}.hc_ffn_scale",
+    MODEL_TENSOR.FFN_GATE_TID2EID:          "blk.{bid}.ffn_gate_tid2eid",
     # vision
     MODEL_TENSOR.V_MMPROJ:                  "mm.{bid}",
     MODEL_TENSOR.V_MMPROJ_FC:               "mm.model.fc",
@@ -2928,6 +2980,49 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_SHEXP,
         MODEL_TENSOR.FFN_EXP_PROBS_B,
     ],
+    MODEL_ARCH.DEEPSEEK4: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_HC_BASE,
+        MODEL_TENSOR.OUTPUT_HC_FN,
+        MODEL_TENSOR.OUTPUT_HC_SCALE,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_SINKS,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT_A,
+        MODEL_TENSOR.ATTN_OUT_B,
+        MODEL_TENSOR.ATTN_COMPRESSOR_APE,
+        MODEL_TENSOR.ATTN_COMPRESSOR_KV,
+        MODEL_TENSOR.ATTN_COMPRESSOR_GATE,
+        MODEL_TENSOR.ATTN_COMPRESSOR_NORM,
+        MODEL_TENSOR.INDEXER_PROJ,
+        MODEL_TENSOR.INDEXER_ATTN_Q_B,
+        MODEL_TENSOR.INDEXER_COMPRESSOR_APE,
+        MODEL_TENSOR.INDEXER_COMPRESSOR_KV,
+        MODEL_TENSOR.INDEXER_COMPRESSOR_GATE,
+        MODEL_TENSOR.INDEXER_COMPRESSOR_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.FFN_GATE_TID2EID,
+        MODEL_TENSOR.HC_ATTN_BASE,
+        MODEL_TENSOR.HC_ATTN_FN,
+        MODEL_TENSOR.HC_ATTN_SCALE,
+        MODEL_TENSOR.HC_FFN_BASE,
+        MODEL_TENSOR.HC_FFN_FN,
+        MODEL_TENSOR.HC_FFN_SCALE,
+    ],
     MODEL_ARCH.ERNIE4_5_MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -4147,6 +4242,8 @@ class GGMLQuantizationType(IntEnum):
 class ExpertGatingFuncType(IntEnum):
     SOFTMAX  = 1
     SIGMOID  = 2
+    SOFTMAX_WEIGHT = 3
+    SQRTSOFTPLUS   = 4
 
 
 # TODO: add GGMLFileType from ggml_ftype in ggml.h
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index a101382719d0..4e3c94de12b9 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -868,6 +868,18 @@ def add_moe_latent_size(self, value: int) -> None:
     def add_nextn_predict_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
 
+    def add_hash_layer_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.HASH_LAYER_COUNT.format(arch=self.arch), count)
+
+    def add_hyper_connection_count(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.HYPER_CONNECTION_COUNT.format(arch=self.arch), count)
+
+    def add_hyper_connection_sinkhorn_iters(self, count: int) -> None:
+        self.add_uint32(Keys.LLM.HYPER_CONNECTION_SINKHORN_ITERS.format(arch=self.arch), count)
+
+    def add_hyper_connection_eps(self, value: float) -> None:
+        self.add_float32(Keys.LLM.HYPER_CONNECTION_EPS.format(arch=self.arch), value)
+
     def add_swin_norm(self, value: bool) -> None:
         self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
 
@@ -952,6 +964,18 @@ def add_attn_temperature_length(self, value: int) -> None:
     def add_attn_temperature_scale(self, value: float) -> None:
         self.add_float32(Keys.Attention.TEMPERATURE_SCALE.format(arch=self.arch), value)
 
+    def add_attention_compress_ratios(self, values: Sequence[int]) -> None:
+        self.add_array(Keys.Attention.COMPRESS_RATIOS.format(arch=self.arch), values)
+
+    def add_attention_compress_rope_freq_base(self, value: float) -> None:
+        self.add_float32(Keys.Attention.COMPRESS_ROPE_FREQ_BASE.format(arch=self.arch), value)
+
+    def add_attention_output_lora_rank(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.OUTPUT_LORA_RANK.format(arch=self.arch), value)
+
+    def add_attention_output_group_count(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.OUTPUT_GROUP_COUNT.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
 
diff --git a/models/templates/deepseek-ai-DeepSeek-V4.jinja b/models/templates/deepseek-ai-DeepSeek-V4.jinja
new file mode 100644
index 000000000000..44d5b785ec04
--- /dev/null
+++ b/models/templates/deepseek-ai-DeepSeek-V4.jinja
@@ -0,0 +1,96 @@
+{%- if not add_generation_prompt is defined -%}
+  {%- set add_generation_prompt = false -%}
+{%- endif -%}
+{%- if not thinking is defined -%}
+  {%- if enable_thinking is defined -%}
+    {%- set thinking = enable_thinking -%}
+  {%- else -%}
+    {%- set thinking = false -%}
+  {%- endif -%}
+{%- endif -%}
+{%- set dsml_token = '｜DSML｜' -%}
+{%- set thinking_start_token = '<think>' -%}
+{%- set thinking_end_token = '</think>' -%}
+{%- set tools_header = '## Tools\n\nYou have access to a set of tools to help answer the user question. You can invoke tools by writing a "<' + dsml_token + 'tool_calls>" block like the following:\n\n<' + dsml_token + 'tool_calls>\n<' + dsml_token + 'invoke name="$TOOL_NAME">\n<' + dsml_token + 'parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</' + dsml_token + 'parameter>\n...\n</' + dsml_token + 'invoke>\n<' + dsml_token + 'invoke name="$TOOL_NAME2">\n...\n</' + dsml_token + 'invoke>\n</' + dsml_token + 'tool_calls>\n\nString parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string="false"`.\n\nIf thinking_mode is enabled (triggered by ' + thinking_start_token + '), you MUST output your complete reasoning inside ' + thinking_start_token + '...' + thinking_end_token + ' BEFORE any tool calls or final response.\n\nOtherwise, output directly after ' + thinking_end_token + ' with tool calls or final response.\n\n### Available Tool Schemas\n\n' -%}
+{%- set tools_footer = '\n\nYou MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.' -%}
+{%- set ns = namespace(system_prompt='', is_first_system=true, pending_assistant=false, pending_tool_result=false) -%}
+{%- for message in messages -%}
+  {%- if message['role'] == 'system' -%}
+    {%- if ns.is_first_system -%}
+      {%- set ns.system_prompt = ns.system_prompt + (message['content'] or '') -%}
+      {%- set ns.is_first_system = false -%}
+    {%- else -%}
+      {%- set ns.system_prompt = ns.system_prompt + '\n\n' + (message['content'] or '') -%}
+    {%- endif -%}
+  {%- endif -%}
+{%- endfor -%}
+{%- if tools is defined and tools -%}
+  {%- set ts = namespace(schemas='') -%}
+  {%- for tool in tools -%}
+    {%- if tool['type'] == 'function' -%}
+      {%- set ts.schemas = ts.schemas + (tool['function'] | tojson) + '\n' -%}
+    {%- endif -%}
+  {%- endfor -%}
+  {%- if ns.system_prompt -%}
+    {%- set ns.system_prompt = ns.system_prompt + '\n\n' + tools_header + ts.schemas + tools_footer -%}
+  {%- else -%}
+    {%- set ns.system_prompt = tools_header + ts.schemas + tools_footer -%}
+  {%- endif -%}
+{%- endif -%}
+{{- bos_token -}}
+{{- ns.system_prompt -}}
+{%- for message in messages -%}
+  {%- if message['role'] == 'user' -%}
+    {{- '<｜User｜>' + (message['content'] or '') -}}
+    {%- set ns.pending_assistant = true -%}
+    {%- set ns.pending_tool_result = true -%}
+  {%- elif message['role'] == 'tool' -%}
+    {%- if not ns.pending_tool_result -%}
+      {{- '<｜User｜>' -}}
+    {%- endif -%}
+    {{- '<tool_result>' + (message['content'] or '') + '</tool_result>' -}}
+    {%- set ns.pending_assistant = true -%}
+    {%- set ns.pending_tool_result = true -%}
+  {%- elif message['role'] == 'assistant' -%}
+    {%- if ns.pending_assistant -%}
+      {{- '<｜Assistant｜>' -}}
+      {%- if thinking and message['reasoning_content'] is defined and message['reasoning_content'] -%}
+        {{- thinking_start_token + message['reasoning_content'] + thinking_end_token -}}
+      {%- else -%}
+        {{- thinking_end_token -}}
+      {%- endif -%}
+    {%- endif -%}
+    {{- (message['content'] or '') -}}
+    {%- if message['tool_calls'] -%}
+      {{- '\n\n<' + dsml_token + 'tool_calls>\n' -}}
+      {%- for tool in message['tool_calls'] -%}
+        {%- set func = tool['function'] -%}
+        {{- '<' + dsml_token + 'invoke name="' + func['name'] + '">\n' -}}
+        {%- set args = func['arguments'] -%}
+        {%- if args is string -%}
+          {%- set args = args | from_json -%}
+        {%- endif -%}
+        {%- for key, val in args.items() -%}
+          {%- if val is string -%}
+            {{- '<' + dsml_token + 'parameter name="' + key + '" string="true">' + val + '</' + dsml_token + 'parameter>\n' -}}
+          {%- else -%}
+            {{- '<' + dsml_token + 'parameter name="' + key + '" string="false">' + (val | tojson) + '</' + dsml_token + 'parameter>\n' -}}
+          {%- endif -%}
+        {%- endfor -%}
+        {{- '</' + dsml_token + 'invoke>\n' -}}
+      {%- endfor -%}
+      {{- '</' + dsml_token + 'tool_calls>' -}}
+    {%- endif -%}
+    {{- '<｜end▁of▁sentence｜>' -}}
+    {%- set ns.pending_assistant = false -%}
+    {%- set ns.pending_tool_result = false -%}
+  {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt and ns.pending_assistant -%}
+  {{- '<｜Assistant｜>' -}}
+  {%- if thinking -%}
+    {{- thinking_start_token -}}
+  {%- else -%}
+    {{- thinking_end_token -}}
+  {%- endif -%}
+{%- endif -%}
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c9eead18aa39..e789e5a681ae 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK,         "deepseek"         },
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
     { LLM_ARCH_DEEPSEEK2OCR,     "deepseek2-ocr"    },
+    { LLM_ARCH_DEEPSEEK4,        "deepseek4"        },
     { LLM_ARCH_CHATGLM,          "chatglm"          },
     { LLM_ARCH_GLM4,             "glm4"             },
     { LLM_ARCH_GLM4_MOE,         "glm4moe"          },
@@ -209,6 +210,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKEN_SHIFT_COUNT,                 "%s.token_shift_count"                 },
     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         "%s.interleave_moe_layer_step"         },
     { LLM_KV_FULL_ATTENTION_INTERVAL,           "%s.full_attention_interval"           },
+    { LLM_KV_HASH_LAYER_COUNT,                  "%s.hash_layer_count"                  },
+    { LLM_KV_HYPER_CONNECTION_COUNT,            "%s.hyper_connection.count"            },
+    { LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS,   "%s.hyper_connection.sinkhorn_iterations" },
+    { LLM_KV_HYPER_CONNECTION_EPS,              "%s.hyper_connection.epsilon"          },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,                   "%s.attention.head_count"                   },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,                "%s.attention.head_count_kv"                },
@@ -243,6 +248,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,           "%s.attention.indexer.key_length"           },
     { LLM_KV_ATTENTION_INDEXER_TOP_K,                "%s.attention.indexer.top_k"                },
     { LLM_KV_ATTENTION_SHARED_KV_LAYERS,             "%s.attention.shared_kv_layers"             },
+    { LLM_KV_ATTENTION_COMPRESS_RATIOS,              "%s.attention.compress_ratios"              },
+    { LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE,      "%s.attention.compress_rope_freq_base"      },
+    { LLM_KV_ATTENTION_OUTPUT_LORA_RANK,             "%s.attention.output_lora_rank"             },
+    { LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT,           "%s.attention.output_group_count"           },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,           "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_DIMENSION_COUNT_SWA,       "%s.rope.dimension_count_swa"             },
@@ -346,6 +355,9 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_OUTPUT_NORM,                            "output_norm" },
     { LLM_TENSOR_OUTPUT_NORM_LFM2,                       "token_embd_norm" }, // fix for wrong tensor name
     { LLM_TENSOR_OUTPUT,                                 "output" },
+    { LLM_TENSOR_OUTPUT_HC_BASE,                         "output_hc_base" },
+    { LLM_TENSOR_OUTPUT_HC_FN,                           "output_hc_fn" },
+    { LLM_TENSOR_OUTPUT_HC_SCALE,                        "output_hc_scale" },
     { LLM_TENSOR_ROPE_FREQS,                             "rope_freqs" },
     { LLM_TENSOR_ATTN_NORM,                              "blk.%d.attn_norm" },
     { LLM_TENSOR_ATTN_Q,                                 "blk.%d.attn_q" },
@@ -422,8 +434,15 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_ATTN_KV_A_NORM,                         "blk.%d.attn_kv_a_norm" },
     { LLM_TENSOR_ATTN_Q_A,                               "blk.%d.attn_q_a" },
     { LLM_TENSOR_ATTN_Q_B,                               "blk.%d.attn_q_b" },
+    { LLM_TENSOR_ATTN_KV,                                "blk.%d.attn_kv" },
     { LLM_TENSOR_ATTN_KV_A_MQA,                          "blk.%d.attn_kv_a_mqa" },
     { LLM_TENSOR_ATTN_KV_B,                              "blk.%d.attn_kv_b" },
+    { LLM_TENSOR_ATTN_OUT_A,                             "blk.%d.attn_output_a" },
+    { LLM_TENSOR_ATTN_OUT_B,                             "blk.%d.attn_output_b" },
+    { LLM_TENSOR_ATTN_COMPRESSOR_APE,                    "blk.%d.attn_compressor_ape" },
+    { LLM_TENSOR_ATTN_COMPRESSOR_KV,                     "blk.%d.attn_compressor_kv" },
+    { LLM_TENSOR_ATTN_COMPRESSOR_GATE,                   "blk.%d.attn_compressor_gate" },
+    { LLM_TENSOR_ATTN_COMPRESSOR_NORM,                   "blk.%d.attn_compressor_norm" },
     { LLM_TENSOR_PER_LAYER_TOKEN_EMBD,                   "per_layer_token_embd" },
     { LLM_TENSOR_PER_LAYER_MODEL_PROJ,                   "per_layer_model_proj" },
     { LLM_TENSOR_PER_LAYER_PROJ_NORM,                    "per_layer_proj_norm" },
@@ -548,6 +567,17 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_INDEXER_PROJ,                           "blk.%d.indexer.proj" },
     { LLM_TENSOR_INDEXER_ATTN_K,                         "blk.%d.indexer.attn_k" },
     { LLM_TENSOR_INDEXER_ATTN_Q_B,                       "blk.%d.indexer.attn_q_b" },
+    { LLM_TENSOR_INDEXER_COMPRESSOR_APE,                 "blk.%d.indexer_compressor_ape" },
+    { LLM_TENSOR_INDEXER_COMPRESSOR_KV,                  "blk.%d.indexer_compressor_kv" },
+    { LLM_TENSOR_INDEXER_COMPRESSOR_GATE,                "blk.%d.indexer_compressor_gate" },
+    { LLM_TENSOR_INDEXER_COMPRESSOR_NORM,                "blk.%d.indexer_compressor_norm" },
+    { LLM_TENSOR_HC_ATTN_BASE,                           "blk.%d.hc_attn_base" },
+    { LLM_TENSOR_HC_ATTN_FN,                             "blk.%d.hc_attn_fn" },
+    { LLM_TENSOR_HC_ATTN_SCALE,                          "blk.%d.hc_attn_scale" },
+    { LLM_TENSOR_HC_FFN_BASE,                            "blk.%d.hc_ffn_base" },
+    { LLM_TENSOR_HC_FFN_FN,                              "blk.%d.hc_ffn_fn" },
+    { LLM_TENSOR_HC_FFN_SCALE,                           "blk.%d.hc_ffn_scale" },
+    { LLM_TENSOR_FFN_GATE_TID2EID,                       "blk.%d.ffn_gate_tid2eid" },
 };
 
 // declare information about the model weight tensors:
@@ -566,6 +596,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TOKEN_TYPES,                {LLM_TENSOR_LAYER_INPUT,     GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_EMBD_NORM,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},  // do the norms on the first layer (not the input layer)
     {LLM_TENSOR_OUTPUT,                     {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_OUTPUT_HC_BASE,             {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
+    {LLM_TENSOR_OUTPUT_HC_FN,               {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_OUTPUT_HC_SCALE,            {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_SCALE}},
     {LLM_TENSOR_CLS,                        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT,                    {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_NORM,                   {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
@@ -592,10 +625,15 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_UP_SHEXP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_Q_A,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_OUT_A,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_OUT_B,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_COMPRESSOR_KV,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_COMPRESSOR_GATE,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_SINKS,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -757,6 +795,19 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_INDEXER_PROJ,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_INDEXER_ATTN_K,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_INDEXER_ATTN_Q_B,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_INDEXER_COMPRESSOR_KV,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_INDEXER_COMPRESSOR_GATE,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_COMPRESSOR_NORM,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_INDEXER_COMPRESSOR_NORM,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_HC_ATTN_BASE,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_HC_ATTN_FN,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_HC_ATTN_SCALE,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
+    {LLM_TENSOR_HC_FFN_BASE,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_HC_FFN_FN,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_HC_FFN_SCALE,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
+    {LLM_TENSOR_FFN_GATE_TID2EID,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_ATTN_COMPRESSOR_APE,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_INDEXER_COMPRESSOR_APE,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
     // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
     // the model loader doesn't fault on the block index.
@@ -902,6 +953,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK4:
         case LLM_ARCH_GLM_DSA:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_T5:
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 89cf16cc37cf..a1dcb037c7a2 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -79,6 +79,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_DEEPSEEK2OCR,
+    LLM_ARCH_DEEPSEEK4,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
     LLM_ARCH_GLM4_MOE,
@@ -213,6 +214,10 @@ enum llm_kv {
     LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
     LLM_KV_FULL_ATTENTION_INTERVAL,
+    LLM_KV_HASH_LAYER_COUNT,
+    LLM_KV_HYPER_CONNECTION_COUNT,
+    LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS,
+    LLM_KV_HYPER_CONNECTION_EPS,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -247,6 +252,10 @@ enum llm_kv {
     LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
     LLM_KV_ATTENTION_INDEXER_TOP_K,
     LLM_KV_ATTENTION_SHARED_KV_LAYERS,
+    LLM_KV_ATTENTION_COMPRESS_RATIOS,
+    LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE,
+    LLM_KV_ATTENTION_OUTPUT_LORA_RANK,
+    LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_COUNT_SWA,
@@ -354,6 +363,9 @@ enum llm_tensor {
     LLM_TENSOR_DENSE_2_OUT,
     LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
+    LLM_TENSOR_OUTPUT_HC_BASE,
+    LLM_TENSOR_OUTPUT_HC_FN,
+    LLM_TENSOR_OUTPUT_HC_SCALE,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
     LLM_TENSOR_ROPE_FREQS,
@@ -482,12 +494,19 @@ enum llm_tensor {
     LLM_TENSOR_CHANNEL_MIX_VALUE,
     LLM_TENSOR_ATTN_Q_A,
     LLM_TENSOR_ATTN_Q_B,
+    LLM_TENSOR_ATTN_KV,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
     LLM_TENSOR_ATTN_K_B,
     LLM_TENSOR_ATTN_V_B,
+    LLM_TENSOR_ATTN_OUT_A,
+    LLM_TENSOR_ATTN_OUT_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
+    LLM_TENSOR_ATTN_COMPRESSOR_APE,
+    LLM_TENSOR_ATTN_COMPRESSOR_KV,
+    LLM_TENSOR_ATTN_COMPRESSOR_GATE,
+    LLM_TENSOR_ATTN_COMPRESSOR_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
     LLM_TENSOR_FFN_SUB_NORM,
     LLM_TENSOR_DEC_ATTN_NORM,
@@ -549,6 +568,17 @@ enum llm_tensor {
     LLM_TENSOR_INDEXER_PROJ,
     LLM_TENSOR_INDEXER_ATTN_K,
     LLM_TENSOR_INDEXER_ATTN_Q_B,
+    LLM_TENSOR_INDEXER_COMPRESSOR_APE,
+    LLM_TENSOR_INDEXER_COMPRESSOR_KV,
+    LLM_TENSOR_INDEXER_COMPRESSOR_GATE,
+    LLM_TENSOR_INDEXER_COMPRESSOR_NORM,
+    LLM_TENSOR_HC_ATTN_BASE,
+    LLM_TENSOR_HC_ATTN_FN,
+    LLM_TENSOR_HC_ATTN_SCALE,
+    LLM_TENSOR_HC_FFN_BASE,
+    LLM_TENSOR_HC_FFN_FN,
+    LLM_TENSOR_HC_FFN_SCALE,
+    LLM_TENSOR_FFN_GATE_TID2EID,
     LLM_TENSOR_NEXTN_EH_PROJ,
     LLM_TENSOR_NEXTN_EMBED_TOKENS,
     LLM_TENSOR_NEXTN_ENORM,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index d62abc4009b8..b1b6aa350735 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -420,7 +420,7 @@ void llama_context::sched_reserve() {
 
     const int64_t t_start_us = ggml_time_us();
 
-    const uint32_t n_seqs = cparams.n_seq_max;
+    const uint32_t n_seqs = model.arch == LLM_ARCH_DEEPSEEK4 ? 1 : cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
     const size_t max_nodes = this->graph_max_nodes(n_tokens);
@@ -596,6 +596,22 @@ void llama_context::sched_reserve() {
         n_nodes_pp  = ggml_graph_n_nodes(gf);
     }
 
+    // DeepSeek V4 resumed-prompt chunks use the compressed-attention decode
+    // graph, which is larger than the position-zero prefill graph.
+    if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) {
+        const llama_pos reserve_pos0 = std::min<llama_pos>(
+                cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens,
+                std::max<uint32_t>(cparams.n_batch, 8u*n_tokens));
+        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+                model.hparams.no_alloc, nullptr, reserve_pos0);
+        if (!gf) {
+            throw std::runtime_error("failed to allocate DeepSeek V4 resumed pp buffers");
+        }
+
+        n_splits_pp = std::max(n_splits_pp, ggml_backend_sched_get_n_splits(sched.get()));
+        n_nodes_pp  = std::max(n_nodes_pp,  ggml_graph_n_nodes(gf));
+    }
+
     // reserve with tg (token generation) graph to get the number of splits and nodes
     {
         auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
@@ -2171,6 +2187,15 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
     if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
         return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
+    if (model.arch == LLM_ARCH_DEEPSEEK4) {
+        // DeepSeek V4 has a position-dependent compressed-attention decode path
+        // that creates many temporary tensor objects, especially when a long
+        // prompt is split into non-prefill ubatches. The visible graph node
+        // count is much smaller than the number of GGML objects allocated while
+        // building those graphs, so reserve a larger metadata arena than the
+        // generic tensor-count heuristic would provide.
+        return std::max<uint32_t>(524288u, n_tokens * 192 + 64u * model.n_tensors());
+    }
     uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
     for (const auto & lora : model.loras) {
         res += lora->get_n_nodes();
@@ -2183,7 +2208,7 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
 }
 
 ggml_cgraph * llama_context::graph_reserve(
-        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
+        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes, llama_pos pos0) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
     GGML_ASSERT(n_outputs >= 1);
 
@@ -2207,6 +2232,14 @@ ggml_cgraph * llama_context::graph_reserve(
 
     llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
     llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
+    if (pos0 != 0 && ubatch.pos != nullptr) {
+        for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+            ubatch.pos[i*ubatch.n_pos] = pos0 + i;
+            for (uint32_t j = 1; j < ubatch.n_pos; ++j) {
+                ubatch.pos[i*ubatch.n_pos + j] = 0;
+            }
+        }
+    }
 
     // set one output token per sequence in order to activate all backend samplers
     std::vector<llama_seq_id> seq_ids(n_seqs);
@@ -3357,6 +3390,29 @@ llama_context * llama_init_from_model(
         params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
     }
 
+    // V4 (DeepSeek4) requires fp16 KV cache: V4's standard SWA K cache,
+    // compressed-attention K cache (cache.attn_k), and indexer K cache
+    // (cache.index_k) all share the same `type_k` and must agree in dtype
+    // because src/models/deepseek4.cpp concatenates the SWA K view with the
+    // compressed K view via ggml_concat (which asserts a->type == b->type).
+    // Furthermore, V4's K activations are post-fp8-quantized
+    // (ggml_dsv4_fp8_kv_quantize), and q8_0's single fp16 scale per 32-element
+    // block cannot faithfully reproduce fp8-quantized value distributions --
+    // pinning to q8_0 corrupts decode silently ("=" loops, "Mirror ..."
+    // garbage). Coerce here, before the SPLIT_MODE_TENSOR / FA / V-quant
+    // shared validations below and before the constructor's flash_attn check,
+    // so those validations see the effective fp16 types and won't reject V4
+    // requests with --cache-type-k|v q8_0. See
+    // docs/plans/v4-port-kv-q8-completion.md.
+    if (model->arch == LLM_ARCH_DEEPSEEK4) {
+        if (params.type_k != GGML_TYPE_F16 || params.type_v != GGML_TYPE_F16) {
+            LLAMA_LOG_WARN("DeepSeek4: forcing fp16 KV cache (--cache-type-k|v are ignored for V4 because compressed/indexer K caches require fp16; "
+                           "see docs/plans/v4-port-kv-q8-completion.md)\n");
+            params.type_k = GGML_TYPE_F16;
+            params.type_v = GGML_TYPE_F16;
+        }
+    }
+
     if (model->split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
         if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
             LLAMA_LOG_INFO("%s: enabling flash_attn since it is required for SPLIT_MODE_TENSOR\n", __func__);
diff --git a/src/llama-context.h b/src/llama-context.h
index e16ac4c618ba..999ba5a800c5 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -240,7 +240,8 @@ struct llama_context {
 
     // reserve a graph with a dummy ubatch of the specified size
     ggml_cgraph * graph_reserve(
-        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
+        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr,
+        llama_pos pos0 = 0);
 
     bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
 
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 858c297dd762..bbb74a0661b4 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -500,29 +500,41 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
 }
 
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
-    mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
-    mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+    if (self_k_idxs && self_k_idxs->buffer) {
+        mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
+    }
+    if (self_v_idxs && self_v_idxs->buffer) {
+        mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+    }
 
-    mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    if (self_kq_mask && self_kq_mask->buffer) {
+        mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
 
-    mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
-    mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+    if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+        mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
+    }
+    if (self_v_idxs_swa && self_v_idxs_swa->buffer) {
+        mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+    }
 
-    mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+    if (self_kq_mask_swa && self_kq_mask_swa->buffer) {
+        mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+    }
 
-    if (self_k_rot) {
+    if (self_k_rot && self_k_rot->buffer) {
         mctx->get_base()->set_input_k_rot(self_k_rot);
     }
 
-    if (self_v_rot) {
+    if (self_v_rot && self_v_rot->buffer) {
         mctx->get_base()->set_input_v_rot(self_v_rot);
     }
 
-    if (self_k_rot_swa) {
+    if (self_k_rot_swa && self_k_rot_swa->buffer) {
         mctx->get_swa()->set_input_k_rot(self_k_rot_swa);
     }
 
-    if (self_v_rot_swa) {
+    if (self_v_rot_swa && self_v_rot_swa->buffer) {
         mctx->get_swa()->set_input_v_rot(self_v_rot_swa);
     }
 }
@@ -534,14 +546,19 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
 
     bool res = true;
 
-    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
-  //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+    if (self_k_idxs && self_k_idxs->buffer) {
+        res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+      //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+        res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
+    }
 
-    res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
-  //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+    if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+        res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+      //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
-    res &= can_reuse_kq_mask(self_kq_mask,     mctx->get_base(), params.ubatch, params.cparams);
-    res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(),  params.ubatch, params.cparams);
+        res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
+    }
 
     return res;
 }
@@ -591,7 +608,7 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
 
     const int64_t n_rs = mctx->get_recr()->get_n_rs();
 
-    if (inp_rs->s_copy) {
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
         GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
         int32_t * data = (int32_t *) inp_rs->s_copy->data;
 
@@ -614,10 +631,12 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
 
     res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
 
-    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+        res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
 
-    res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
-    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+    }
 
     res &= inp_rs->head == mctx->get_recr()->get_head();
     res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -635,7 +654,7 @@ void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) {
 
     const int64_t n_rs = mctx->get_recr()->get_n_rs();
 
-    if (inp_rs->s_copy) {
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
         GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
         int32_t * data = (int32_t *) inp_rs->s_copy->data;
 
@@ -657,10 +676,12 @@ bool llm_graph_input_mem_hybrid_k::can_reuse(const llm_graph_params & params) {
 
     res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
 
-    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+        res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
 
-    res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
-    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+    }
 
     res &= inp_rs->head == mctx->get_recr()->get_head();
     res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -674,38 +695,46 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
     // base tensors may not be allocated if there are no non-SWA attention layers
     if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
         attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
-        attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+        if (inp_attn->self_v_idxs && inp_attn->self_v_idxs->buffer) {
+            attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+        }
 
-        attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+        if (inp_attn->self_kq_mask && inp_attn->self_kq_mask->buffer) {
+            attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+        }
     }
 
     // swa tensors may not be allocated if there are no SWA attention layers
     if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
         attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
-        attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+        if (inp_attn->self_v_idxs_swa && inp_attn->self_v_idxs_swa->buffer) {
+            attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+        }
 
-        attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+        if (inp_attn->self_kq_mask_swa && inp_attn->self_kq_mask_swa->buffer) {
+            attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+        }
     }
 
-    if (inp_attn->self_k_rot) {
+    if (inp_attn->self_k_rot && inp_attn->self_k_rot->buffer) {
         attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot);
     }
 
-    if (inp_attn->self_v_rot) {
+    if (inp_attn->self_v_rot && inp_attn->self_v_rot->buffer) {
         attn_ctx->get_base()->set_input_v_rot(inp_attn->self_v_rot);
     }
 
-    if (inp_attn->self_k_rot_swa) {
+    if (inp_attn->self_k_rot_swa && inp_attn->self_k_rot_swa->buffer) {
         attn_ctx->get_swa()->set_input_k_rot(inp_attn->self_k_rot_swa);
     }
 
-    if (inp_attn->self_v_rot_swa) {
+    if (inp_attn->self_v_rot_swa && inp_attn->self_v_rot_swa->buffer) {
         attn_ctx->get_swa()->set_input_v_rot(inp_attn->self_v_rot_swa);
     }
 
     const int64_t n_rs = mctx->get_recr()->get_n_rs();
 
-    if (inp_rs->s_copy) {
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
         GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
         int32_t * data = (int32_t *) inp_rs->s_copy->data;
 
@@ -741,10 +770,12 @@ bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params)
         res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams);
     }
 
-    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+        res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
 
-    res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
-    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+    }
 
     res &= inp_rs->head == mctx->get_recr()->get_head();
     res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -1325,7 +1356,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
          ggml_tensor * gate_up_exps,
          ggml_tensor * up_exps_s,
          ggml_tensor * gate_exps_s,
-         ggml_tensor * down_exps_s) const {
+         ggml_tensor * down_exps_s,
+         ggml_tensor * selected_experts_in) const {
     return build_moe_ffn(
         cur,
         gate_inp,  /* gate_inp_b  */ nullptr,
@@ -1345,7 +1377,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         /* gate_up_exps_b */ nullptr,
         up_exps_s,
         gate_exps_s,
-        down_exps_s
+        down_exps_s,
+        selected_experts_in
     );
 }
 
@@ -1372,10 +1405,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
          ggml_tensor * gate_up_exps_b,
          ggml_tensor * up_exps_s,
          ggml_tensor * gate_exps_s,
-         ggml_tensor * down_exps_s) const {
+         ggml_tensor * down_exps_s,
+         ggml_tensor * selected_experts_in) const {
     const int64_t n_embd   = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
     const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
+    const bool weight_before_down = arch == LLM_ARCH_DEEPSEEK4; // DeepSeek V4 applies routed weights after SwiGLU and before w2
 
     ggml_tensor * logits = nullptr;
 
@@ -1401,6 +1436,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             {
                 probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
             } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS:
+            {
+                probs = ggml_sqrt(ctx0, ggml_softplus(ctx0, logits)); // [n_expert, n_tokens]
+            } break;
         case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT:
             {
                 probs = logits; // [n_expert, n_tokens]
@@ -1455,8 +1494,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // select experts
-    ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
-    cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    ggml_tensor * selected_experts = selected_experts_in;
+    if (selected_experts == nullptr) {
+        selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+        cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    }
     cb(selected_experts, "ffn_moe_topk", il);
 
     if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
@@ -1584,6 +1626,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     switch (type_op) {
         case LLM_FFN_SILU:
             if (gate_exps) {
+                if (arch == LLM_ARCH_DEEPSEEK4 && il >= 0) {
+                    const float limit = hparams.swiglu_clamp_exp[il];
+                    constexpr float eps = 1e-6f;
+                    if (limit > eps) {
+                        cur = ggml_clamp(ctx0, cur, -INFINITY, limit);
+                        cb(cur, "ffn_moe_gate_clamped", il);
+
+                        ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+                        cb(gate_act, "ffn_moe_silu", il);
+
+                        up = ggml_clamp(ctx0, up, -limit, limit);
+                        cb(up, "ffn_moe_up_clamped", il);
+
+                        cur = ggml_mul(ctx0, gate_act, up);
+                        cb(cur, "ffn_moe_swiglu_limited", il);
+                        break;
+                    }
+                }
+
                 // Step35: per-layer clamp for routed experts
                 if (arch == LLM_ARCH_STEP35 && il >= 0) {
                     const float limit = hparams.swiglu_clamp_exp[il];
@@ -1648,6 +1709,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
+    if (weight_before_down) {
+        cur = ggml_mul(ctx0, cur, weights);
+        cb(cur, "ffn_moe_weighted_swiglu", il);
+    }
+
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
@@ -1665,7 +1731,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(experts, "ffn_moe_down_scaled", il);
     }
 
-    if (!weight_before_ffn) {
+    if (!weight_before_ffn && !weight_before_down) {
         experts = ggml_mul(ctx0, experts, weights);
         cb(experts, "ffn_moe_weighted", il);
     }
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 9e55d0a675e0..260334f7302f 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -849,7 +849,8 @@ struct llm_graph_context {
              ggml_tensor * gate_up_exps = nullptr,
              ggml_tensor * up_exps_s = nullptr,
              ggml_tensor * gate_exps_s = nullptr,
-             ggml_tensor * down_exps_s = nullptr) const;
+             ggml_tensor * down_exps_s = nullptr,
+             ggml_tensor * selected_experts_in = nullptr) const;
 
     ggml_tensor * build_moe_ffn(
              ggml_tensor * cur,
@@ -874,7 +875,8 @@ struct llm_graph_context {
              ggml_tensor * gate_up_exps_b = nullptr,
              ggml_tensor * up_exps_s = nullptr,
              ggml_tensor * gate_exps_s = nullptr,
-             ggml_tensor * down_exps_s = nullptr) const;
+             ggml_tensor * down_exps_s = nullptr,
+             ggml_tensor * selected_experts_in = nullptr) const;
 
     //
     // inputs
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 2239309c8fb4..44eaf501f7dc 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -153,6 +153,10 @@ uint32_t llama_hparams::n_embd_v_gqa_max() const {
 }
 
 uint32_t llama_hparams::n_embd_r() const {
+    if (dsv4_state_size != 0) {
+        return dsv4_state_size;
+    }
+
     if (wkv_head_size != 0) {
         // for RWKV models
         return token_shift_count * n_embd;
@@ -177,6 +181,10 @@ uint32_t llama_hparams::n_embd_r() const {
 }
 
 uint32_t llama_hparams::n_embd_s() const {
+    if (dsv4_state_size != 0) {
+        return dsv4_state_size;
+    }
+
     if (wkv_head_size != 0) {
         // corresponds to RWKV's wkv_states size
         return n_embd * wkv_head_size;
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index e2d051edc6cd..3a0438283e77 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -14,6 +14,7 @@ enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX        = 1,
     LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID        = 2,
     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS   = 4,
 };
 
 enum llama_swa_type {
@@ -75,6 +76,8 @@ struct llama_hparams {
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q           = 0;
     uint32_t n_lora_kv          = 0;
+    uint32_t n_lora_o           = 0;
+    uint32_t n_attn_out_groups  = 0;
     uint32_t n_ff_exp           = 0;
     uint32_t n_ff_shexp         = 0;
     uint32_t n_ff_chexp         = 0;
@@ -91,6 +94,7 @@ struct llama_hparams {
     uint32_t moe_every_n_layers   = 0;
     uint32_t moe_latent_size      = 0;
     uint32_t nextn_predict_layers = 0;
+    uint32_t n_hash_layers        = 0;
 
     bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)
 
@@ -211,6 +215,14 @@ struct llama_hparams {
     uint32_t indexer_head_size = 0;
     uint32_t indexer_top_k     = 0;
 
+    // DeepSeek V4 hyper-connections and sparse KV compression
+    uint32_t n_hc                    = 1;
+    uint32_t hc_sinkhorn_iters       = 0;
+    float    hc_eps                  = 0.0f;
+    float    compress_rope_freq_base = 0.0f;
+    uint32_t dsv4_state_size         = 0;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> attn_compress_ratio;
+
     // qwen3vl deepstack
     uint32_t n_deepstack_layers = 0;
 
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
index 26e2cb4270b0..9b9f17903637 100644
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
     LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
 
     kv_base = std::make_unique<llama_kv_cache>(
-            model, type_k, type_v,
+            model, hparams, type_k, type_v,
             v_trans, offload, unified, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
 
     LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache>(
-            model, type_k, type_v,
+            model, hparams, type_k, type_v,
             v_trans, offload, unified, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type, filter_swa, reuse);
 }
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a49a055a6304..92585b671b55 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -79,6 +79,7 @@ static ggml_tensor * ggml_mul_mat_aux(
 
 llama_kv_cache::llama_kv_cache(
         const llama_model & model,
+        const llama_hparams & hparams,
                 ggml_type   type_k,
                 ggml_type   type_v,
                      bool   v_trans,
@@ -91,7 +92,7 @@ llama_kv_cache::llama_kv_cache(
            llama_swa_type   swa_type,
     const layer_filter_cb & filter,
     const  layer_reuse_cb & reuse) :
-    model(model), hparams(model.hparams), v_trans(v_trans),
+    model(model), hparams(hparams), v_trans(v_trans),
     n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
 
     GGML_ASSERT(kv_size % n_pad == 0);
@@ -205,7 +206,7 @@ llama_kv_cache::llama_kv_cache(
         }
 
         const bool has_k = true;
-        const bool has_v = !is_mla;
+        const bool has_v = !is_mla && model.arch != LLM_ARCH_DEEPSEEK4;
 
         ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
         ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
@@ -253,7 +254,7 @@ llama_kv_cache::llama_kv_cache(
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
         ggml_backend_buffer_t buf;
-        if (model.hparams.no_alloc) {
+        if (hparams.no_alloc) {
             buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
             for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
                 t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 0b62dc7b2320..0b0a56ce92f4 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -95,6 +95,7 @@ class llama_kv_cache : public llama_memory_i {
 
     llama_kv_cache(
             const llama_model & model,
+            const llama_hparams & hparams,
                     ggml_type   type_k,
                     ggml_type   type_v,
                          bool   v_trans,
diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp
index a59561ea54dd..58dadabc9f62 100644
--- a/src/llama-memory-hybrid-iswa.cpp
+++ b/src/llama-memory-hybrid-iswa.cpp
@@ -1,9 +1,113 @@
 #include "llama-memory-hybrid-iswa.h"
 
+#include "ggml-backend.h"
+
 #include "llama-impl.h"
+#include "llama-io.h"
 #include "llama-model.h"
 #include "llama-context.h"
 
+#include <algorithm>
+#include <cstring>
+#include <limits>
+#include <map>
+#include <stdexcept>
+#include <vector>
+
+namespace {
+
+constexpr uint32_t DSV4_COMPRESSED_KV_STATE_MAGIC   = 0x44535634; // "DSV4"
+constexpr uint32_t DSV4_COMPRESSED_KV_STATE_VERSION = 1;
+constexpr uint32_t DSV4_COMPRESSED_DECODE_UBATCH_MAX = 512;
+
+struct dsv4_row_range {
+    uint32_t begin = 0;
+    uint32_t end   = 0;
+
+    uint32_t size() const {
+        GGML_ASSERT(end >= begin);
+        return end - begin;
+    }
+};
+
+static dsv4_row_range dsv4_make_row_range(uint32_t n_comp, uint32_t ratio, llama_pos p0, llama_pos p1) {
+    GGML_ASSERT(ratio > 0);
+
+    if (n_comp == 0) {
+        return {};
+    }
+
+    if (p0 < 0) {
+        p0 = 0;
+    }
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max();
+    }
+    if (p0 >= p1) {
+        return {};
+    }
+
+    const uint64_t row_begin = (uint64_t) p0 / ratio;
+    uint64_t row_end;
+    if (p1 == std::numeric_limits<llama_pos>::max()) {
+        row_end = n_comp;
+    } else {
+        row_end = ((uint64_t) p1 + ratio - 1) / ratio;
+    }
+
+    return {
+        (uint32_t) std::min<uint64_t>(row_begin, n_comp),
+        (uint32_t) std::min<uint64_t>(row_end,   n_comp),
+    };
+}
+
+static size_t dsv4_cache_row_size(const ggml_tensor * t) {
+    GGML_ASSERT(t != nullptr);
+
+    const size_t row_size = ggml_row_size(t->type, t->ne[0]);
+    GGML_ASSERT((size_t) t->nb[1] == row_size);
+    GGML_ASSERT((size_t) t->nb[2] == row_size*(size_t) t->ne[1]);
+
+    return row_size;
+}
+
+static size_t dsv4_cache_offset(const ggml_tensor * t, llama_seq_id seq_id, uint32_t row) {
+    GGML_ASSERT(seq_id >= 0);
+    GGML_ASSERT(row <= (uint32_t) t->ne[1]);
+
+    return (size_t) seq_id*(size_t) t->nb[2] + (size_t) row*(size_t) t->nb[1];
+}
+
+static void dsv4_zero_cache_rows(ggml_tensor * t, llama_seq_id seq_id, uint32_t row_start, uint32_t n_rows) {
+    if (t == nullptr || n_rows == 0) {
+        return;
+    }
+
+    const size_t row_size = dsv4_cache_row_size(t);
+    const size_t n_bytes  = (size_t) n_rows*row_size;
+    const size_t offset   = dsv4_cache_offset(t, seq_id, row_start);
+
+    std::vector<uint8_t> zeros(n_bytes, 0);
+    ggml_backend_tensor_set(t, zeros.data(), offset, n_bytes);
+}
+
+static void dsv4_copy_cache_rows(ggml_tensor * t, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, uint32_t row_start, uint32_t n_rows) {
+    if (t == nullptr || n_rows == 0 || seq_id_src == seq_id_dst) {
+        return;
+    }
+
+    const size_t row_size   = dsv4_cache_row_size(t);
+    const size_t n_bytes    = (size_t) n_rows*row_size;
+    const size_t src_offset = dsv4_cache_offset(t, seq_id_src, row_start);
+    const size_t dst_offset = dsv4_cache_offset(t, seq_id_dst, row_start);
+
+    std::vector<uint8_t> tmp(n_bytes);
+    ggml_backend_tensor_get(t, tmp.data(), src_offset, n_bytes);
+    ggml_backend_tensor_set(t, tmp.data(), dst_offset, n_bytes);
+}
+
+} // namespace
+
 //
 // llama_memory_hybrid_iswa
 //
@@ -59,9 +163,103 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
         filter_recr == nullptr ?
             [&](int32_t il) { return hparams.is_recurrent(il); }
             : filter_recr
-    )) {}
+    )) {
+    if (model.arch != LLM_ARCH_DEEPSEEK4) {
+        return;
+    }
+
+    dsv4_n_seq_max = n_seq_max;
+    dsv4_cache_layers.resize(hparams.n_layer);
+
+    struct ggml_backend_buft_comparator {
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map;
+
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            ggml_init_params params = {
+                /*.mem_size   =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()),
+                /*.mem_buffer =*/ nullptr,
+                /*.no_alloc   =*/ true,
+            };
+
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+
+            ctx_map.emplace(buft, ctx);
+            return ctx;
+        }
+
+        return it->second.get();
+    };
+
+    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+        const uint32_t ratio = hparams.attn_compress_ratio[il];
+        if (ratio == 0) {
+            continue;
+        }
+
+        const uint32_t n_comp = std::max<uint32_t>(1, (kv_size + ratio - 1) / ratio);
+
+        const char * dev_name = "CPU";
+        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
+
+        if (offload) {
+            auto * dev = model.dev_layer(il);
+            buft = ggml_backend_dev_buffer_type(dev);
+            dev_name = ggml_backend_dev_name(dev);
+        }
+
+        LLAMA_LOG_DEBUG("%s: DeepSeek4 compressed KV layer %3d: dev = %s, ratio = %u, rows = %u\n",
+                __func__, il, dev_name, ratio, n_comp);
+
+        ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            throw std::runtime_error("failed to create ggml context for DeepSeek4 compressed KV cache");
+        }
+
+        auto & cache = dsv4_cache_layers[il];
+        cache.n_comp = n_comp;
+        cache.attn_k = ggml_new_tensor_3d(ctx, type_k, hparams.n_embd_head_k(il), n_comp, dsv4_n_seq_max);
+        ggml_format_name(cache.attn_k, "cache_dsv4_attn_k_l%d", il);
+
+        if (ratio == 4) {
+            cache.index_k = ggml_new_tensor_3d(ctx, type_k, hparams.indexer_head_size, n_comp, dsv4_n_seq_max);
+            ggml_format_name(cache.index_k, "cache_dsv4_index_k_l%d", il);
+        }
+    }
+
+    for (auto & [buft, ctx] : ctx_map) {
+        ggml_backend_buffer_t buf;
+        if (model.hparams.no_alloc) {
+            buf = ggml_backend_buft_alloc_buffer(buft, 0);
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+                t->buffer = buf;
+            }
+        } else {
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+        }
+        if (!buf) {
+            throw std::runtime_error("failed to allocate DeepSeek4 compressed KV cache buffer");
+        }
+
+        LLAMA_LOG_INFO("%s: %10s DeepSeek4 compressed KV buffer size = %8.2f MiB\n", __func__,
+                ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+
+        ggml_backend_buffer_clear(buf, 0);
+        dsv4_ctxs_bufs.emplace_back(std::move(ctx), buf);
+    }
+}
 
 llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+    const bool dsv4_compressed = has_dsv4_compressed_kv();
+
     do {
         balloc.split_reset();
 
@@ -71,7 +269,23 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr
         while (true) {
             llama_ubatch ubatch;
 
-            if (embd_all) {
+            if (dsv4_compressed) {
+                // DeepSeek V4 compressed attention keeps sequence-local compressor
+                // state and compressed cache rows. Process one sequence set per
+                // ubatch while still allowing multi-sequence batches at the API
+                // level.
+                uint32_t n_ubatch_dsv4 = n_ubatch;
+                const auto & batch = balloc.get_batch();
+                const bool first_split = balloc.get_n_used() == 0;
+                const bool starts_at_zero = batch.pos == nullptr || batch.pos[0] == 0;
+                if (!first_split || !starts_at_zero) {
+                    // Non-prefill compressed-attention chunks build one
+                    // compressor update per token and can otherwise exhaust the
+                    // graph metadata arena on long contexts.
+                    n_ubatch_dsv4 = std::min<uint32_t>(n_ubatch_dsv4, DSV4_COMPRESSED_DECODE_UBATCH_MAX);
+                }
+                ubatch = balloc.split_seq(n_ubatch_dsv4);
+            } else if (embd_all) {
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
@@ -128,6 +342,10 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * l
 }
 
 bool llama_memory_hybrid_iswa::get_can_shift() const {
+    if (has_dsv4_compressed_kv()) {
+        return false;
+    }
+
     // Shifting is trivially supported for recurrent
     return mem_attn->get_can_shift();
 }
@@ -135,6 +353,12 @@ bool llama_memory_hybrid_iswa::get_can_shift() const {
 void llama_memory_hybrid_iswa::clear(bool data) {
     mem_attn->clear(data);
     mem_recr->clear(data);
+
+    if (data) {
+        for (auto & [_, buf] : dsv4_ctxs_bufs) {
+            ggml_backend_buffer_clear(buf.get(), 0);
+        }
+    }
 }
 
 bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
@@ -143,25 +367,39 @@ bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_p
     if (!mem_recr->seq_rm(seq_id, p0, p1)) {
         return false;
     }
-    return mem_attn->seq_rm(seq_id, p0, p1);
+    if (!mem_attn->seq_rm(seq_id, p0, p1)) {
+        return false;
+    }
+    dsv4_seq_rm(seq_id, p0, p1);
+    return true;
 }
 
 void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
     mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
     mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+    dsv4_seq_cp(seq_id_src, seq_id_dst, p0, p1);
 }
 
 void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) {
     mem_attn->seq_keep(seq_id);
     mem_recr->seq_keep(seq_id);
+    dsv4_seq_keep(seq_id);
 }
 
 void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+    if (has_dsv4_compressed_kv() && shift != 0) {
+        GGML_ABORT("DeepSeek V4 compressed KV cache does not support K-shift");
+    }
+
     mem_attn->seq_add(seq_id, p0, p1, shift);
     mem_recr->seq_add(seq_id, p0, p1, shift);
 }
 
 void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (has_dsv4_compressed_kv() && d != 1) {
+        GGML_ABORT("DeepSeek V4 compressed KV cache does not support position division");
+    }
+
     mem_attn->seq_div(seq_id, p0, p1, d);
     mem_recr->seq_div(seq_id, p0, p1, d);
 }
@@ -181,17 +419,383 @@ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid_iswa::memory_br
     for (const auto & buft_size : mem_recr->memory_breakdown()) {
         mb[buft_size.first] += buft_size.second;
     }
+    for (const auto & [_, buf] : dsv4_ctxs_bufs) {
+        mb[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    }
     return mb;
 }
 
 void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     mem_attn->state_write(io, seq_id, flags);
     mem_recr->state_write(io, seq_id, flags);
+    dsv4_state_write(io, seq_id);
 }
 
 void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     mem_attn->state_read(io, seq_id, flags);
     mem_recr->state_read(io, seq_id, flags);
+    dsv4_state_read(io, seq_id);
+}
+
+void llama_memory_hybrid_iswa::dsv4_seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    if (!has_dsv4_compressed_kv()) {
+        return;
+    }
+
+    if (seq_id >= 0) {
+        GGML_ASSERT((uint32_t) seq_id < dsv4_n_seq_max);
+        for (int32_t il = 0; il < (int32_t) dsv4_cache_layers.size(); ++il) {
+            dsv4_clear_rows(seq_id, il, p0, p1);
+        }
+        return;
+    }
+
+    for (uint32_t seq = 0; seq < dsv4_n_seq_max; ++seq) {
+        for (int32_t il = 0; il < (int32_t) dsv4_cache_layers.size(); ++il) {
+            dsv4_clear_rows(seq, il, p0, p1);
+        }
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (!has_dsv4_compressed_kv() || seq_id_src == seq_id_dst) {
+        return;
+    }
+
+    GGML_ASSERT(seq_id_src >= 0 && (uint32_t) seq_id_src < dsv4_n_seq_max);
+    GGML_ASSERT(seq_id_dst >= 0 && (uint32_t) seq_id_dst < dsv4_n_seq_max);
+
+    for (int32_t il = 0; il < (int32_t) dsv4_cache_layers.size(); ++il) {
+        dsv4_copy_rows(seq_id_src, seq_id_dst, il, p0, p1);
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_seq_keep(llama_seq_id seq_id) {
+    if (!has_dsv4_compressed_kv()) {
+        return;
+    }
+
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+
+    for (uint32_t seq = 0; seq < dsv4_n_seq_max; ++seq) {
+        if ((llama_seq_id) seq == seq_id) {
+            continue;
+        }
+
+        dsv4_clear_seq(seq);
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_clear_seq(llama_seq_id seq_id) {
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+
+    for (const auto & layer : dsv4_cache_layers) {
+        dsv4_zero_cache_rows(layer.attn_k,  seq_id, 0, layer.n_comp);
+        dsv4_zero_cache_rows(layer.index_k, seq_id, 0, layer.n_comp);
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_clear_rows(llama_seq_id seq_id, int32_t il, llama_pos p0, llama_pos p1) {
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+
+    const uint32_t ratio = hparams.attn_compress_ratio[il];
+    if (ratio == 0) {
+        return;
+    }
+
+    const auto & layer = dsv4_cache_layers[il];
+    const auto range = dsv4_make_row_range(layer.n_comp, ratio, p0, p1);
+
+    dsv4_zero_cache_rows(layer.attn_k,  seq_id, range.begin, range.size());
+    dsv4_zero_cache_rows(layer.index_k, seq_id, range.begin, range.size());
+}
+
+void llama_memory_hybrid_iswa::dsv4_copy_rows(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, int32_t il, llama_pos p0, llama_pos p1) {
+    GGML_ASSERT(seq_id_src >= 0 && (uint32_t) seq_id_src < dsv4_n_seq_max);
+    GGML_ASSERT(seq_id_dst >= 0 && (uint32_t) seq_id_dst < dsv4_n_seq_max);
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+
+    const uint32_t ratio = hparams.attn_compress_ratio[il];
+    if (ratio == 0) {
+        return;
+    }
+
+    const auto & layer = dsv4_cache_layers[il];
+    const auto range = dsv4_make_row_range(layer.n_comp, ratio, p0, p1);
+
+    dsv4_copy_cache_rows(layer.attn_k,  seq_id_src, seq_id_dst, range.begin, range.size());
+    dsv4_copy_cache_rows(layer.index_k, seq_id_src, seq_id_dst, range.begin, range.size());
+}
+
+uint32_t llama_memory_hybrid_iswa::dsv4_n_state_rows(int32_t il, llama_seq_id seq_id) const {
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+
+    const uint32_t ratio = hparams.attn_compress_ratio[il];
+    if (ratio == 0) {
+        return 0;
+    }
+
+    const llama_pos pos_max = mem_attn->seq_pos_max(seq_id);
+    if (pos_max < 0) {
+        return 0;
+    }
+
+    const uint64_t n_rows = ((uint64_t) pos_max + 1) / ratio;
+    return (uint32_t) std::min<uint64_t>(n_rows, dsv4_cache_layers[il].n_comp);
+}
+
+void llama_memory_hybrid_iswa::dsv4_state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+    if (!has_dsv4_compressed_kv()) {
+        return;
+    }
+
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max));
+
+    std::vector<llama_seq_id> seq_ids;
+    auto seq_has_rows = [&](llama_seq_id seq) {
+        for (int32_t il = 0; il < (int32_t) dsv4_cache_layers.size(); ++il) {
+            if (dsv4_n_state_rows(il, seq) > 0) {
+                return true;
+            }
+        }
+        return false;
+    };
+
+    if (seq_id >= 0) {
+        if (seq_has_rows(seq_id)) {
+            seq_ids.push_back(seq_id);
+        }
+    } else {
+        for (uint32_t seq = 0; seq < dsv4_n_seq_max; ++seq) {
+            if (seq_has_rows(seq)) {
+                seq_ids.push_back(seq);
+            }
+        }
+    }
+
+    const uint32_t magic   = DSV4_COMPRESSED_KV_STATE_MAGIC;
+    const uint32_t version = DSV4_COMPRESSED_KV_STATE_VERSION;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_seq   = seq_ids.size();
+
+    io.write(&magic,   sizeof(magic));
+    io.write(&version, sizeof(version));
+    io.write(&n_layer, sizeof(n_layer));
+    io.write(&n_seq,   sizeof(n_seq));
+
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const auto & layer = dsv4_cache_layers[il];
+
+        const uint32_t n_comp = layer.n_comp;
+        io.write(&n_comp, sizeof(n_comp));
+
+        const uint32_t has_attn = layer.attn_k != nullptr;
+        io.write(&has_attn, sizeof(has_attn));
+        if (has_attn) {
+            const int32_t  type_i   = (int32_t) layer.attn_k->type;
+            const uint64_t row_size = dsv4_cache_row_size(layer.attn_k);
+            io.write(&type_i,   sizeof(type_i));
+            io.write(&row_size, sizeof(row_size));
+        }
+
+        const uint32_t has_index = layer.index_k != nullptr;
+        io.write(&has_index, sizeof(has_index));
+        if (has_index) {
+            const int32_t  type_i   = (int32_t) layer.index_k->type;
+            const uint64_t row_size = dsv4_cache_row_size(layer.index_k);
+            io.write(&type_i,   sizeof(type_i));
+            io.write(&row_size, sizeof(row_size));
+        }
+    }
+
+    for (llama_seq_id seq : seq_ids) {
+        io.write(&seq, sizeof(seq));
+
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const auto & layer = dsv4_cache_layers[il];
+            const uint32_t n_rows = dsv4_n_state_rows(il, seq);
+
+            if (layer.attn_k != nullptr) {
+                const uint64_t row_size = dsv4_cache_row_size(layer.attn_k);
+                io.write(&n_rows, sizeof(n_rows));
+                if (n_rows > 0) {
+                    io.write_tensor(layer.attn_k, dsv4_cache_offset(layer.attn_k, seq, 0), (size_t) n_rows*row_size);
+                }
+            }
+
+            if (layer.index_k != nullptr) {
+                const uint64_t row_size = dsv4_cache_row_size(layer.index_k);
+                io.write(&n_rows, sizeof(n_rows));
+                if (n_rows > 0) {
+                    io.write_tensor(layer.index_k, dsv4_cache_offset(layer.index_k, seq, 0), (size_t) n_rows*row_size);
+                }
+            }
+        }
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+    if (!has_dsv4_compressed_kv()) {
+        return;
+    }
+
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max));
+
+    uint32_t magic;
+    uint32_t version;
+    uint32_t n_layer;
+    uint32_t n_seq;
+
+    io.read(&magic,   sizeof(magic));
+    io.read(&version, sizeof(version));
+    io.read(&n_layer, sizeof(n_layer));
+    io.read(&n_seq,   sizeof(n_seq));
+
+    if (magic != DSV4_COMPRESSED_KV_STATE_MAGIC) {
+        throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: bad magic");
+    }
+    if (version != DSV4_COMPRESSED_KV_STATE_VERSION) {
+        throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: bad version");
+    }
+    if (n_layer != hparams.n_layer || n_layer != dsv4_cache_layers.size()) {
+        throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched layer count");
+    }
+
+    struct layer_meta {
+        uint32_t n_comp = 0;
+        bool has_attn = false;
+        int32_t attn_type = -1;
+        uint64_t attn_row_size = 0;
+        bool has_index = false;
+        int32_t index_type = -1;
+        uint64_t index_row_size = 0;
+    };
+
+    std::vector<layer_meta> meta(n_layer);
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        auto & m = meta[il];
+        const auto & layer = dsv4_cache_layers[il];
+
+        io.read(&m.n_comp, sizeof(m.n_comp));
+
+        uint32_t has_attn;
+        io.read(&has_attn, sizeof(has_attn));
+        m.has_attn = has_attn != 0;
+        if (m.has_attn) {
+            io.read(&m.attn_type,     sizeof(m.attn_type));
+            io.read(&m.attn_row_size, sizeof(m.attn_row_size));
+        }
+
+        uint32_t has_index;
+        io.read(&has_index, sizeof(has_index));
+        m.has_index = has_index != 0;
+        if (m.has_index) {
+            io.read(&m.index_type,     sizeof(m.index_type));
+            io.read(&m.index_row_size, sizeof(m.index_row_size));
+        }
+
+        const bool local_has_attn  = layer.attn_k  != nullptr;
+        const bool local_has_index = layer.index_k != nullptr;
+
+        if (m.n_comp != layer.n_comp || m.has_attn != local_has_attn || m.has_index != local_has_index) {
+            throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched layer layout");
+        }
+        if (local_has_attn) {
+            const int32_t  type_i   = (int32_t) layer.attn_k->type;
+            const uint64_t row_size = dsv4_cache_row_size(layer.attn_k);
+            if (m.attn_type != type_i || m.attn_row_size != row_size) {
+                throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched attention cache type");
+            }
+        }
+        if (local_has_index) {
+            const int32_t  type_i   = (int32_t) layer.index_k->type;
+            const uint64_t row_size = dsv4_cache_row_size(layer.index_k);
+            if (m.index_type != type_i || m.index_row_size != row_size) {
+                throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched index cache type");
+            }
+        }
+    }
+
+    if (seq_id == -1) {
+        for (auto & [_, buf] : dsv4_ctxs_bufs) {
+            ggml_backend_buffer_clear(buf.get(), 0);
+        }
+    } else {
+        dsv4_clear_seq(seq_id);
+    }
+
+    // scratch buffer for skipping additional blocks in targeted-restore mode
+    std::vector<uint8_t> skip_buf;
+
+    // For targeted restore (seq_id != -1) we follow the public API contract
+    // documented at llama.h:836 (llama_state_seq_set_data) and exercised by
+    // examples/save-load-state/save-load-state.cpp: the first encountered
+    // serialized block is REMAPPED into the requested destination seq_id.
+    // Subsequent blocks (atypical multi-seq payloads) are skipped to avoid
+    // silently merging multiple source sequences into one destination.
+    bool restored_one = false;
+
+    for (uint32_t is = 0; is < n_seq; ++is) {
+        llama_seq_id src_seq_id;
+        io.read(&src_seq_id, sizeof(src_seq_id));
+
+        const bool skip_block = (seq_id != -1 && restored_one);
+
+        const llama_seq_id dst_seq_id = (seq_id == -1) ? src_seq_id : seq_id;
+        if (!skip_block && (dst_seq_id < 0 || (uint32_t) dst_seq_id >= dsv4_n_seq_max)) {
+            throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: invalid sequence id");
+        }
+
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const auto & layer = dsv4_cache_layers[il];
+
+            if (layer.attn_k != nullptr) {
+                uint32_t n_rows;
+                io.read(&n_rows, sizeof(n_rows));
+                if (n_rows > layer.n_comp) {
+                    throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: too many attention rows");
+                }
+                if (n_rows > 0) {
+                    const size_t row_size = dsv4_cache_row_size(layer.attn_k);
+                    const size_t nbytes   = (size_t) n_rows * row_size;
+                    if (skip_block) {
+                        // advance io past this block's bytes without restoring
+                        skip_buf.resize(nbytes);
+                        io.read(skip_buf.data(), nbytes);
+                    } else {
+                        io.read_tensor(layer.attn_k,
+                                dsv4_cache_offset(layer.attn_k, dst_seq_id, 0), nbytes);
+                    }
+                }
+            }
+
+            if (layer.index_k != nullptr) {
+                uint32_t n_rows;
+                io.read(&n_rows, sizeof(n_rows));
+                if (n_rows > layer.n_comp) {
+                    throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: too many index rows");
+                }
+                if (n_rows > 0) {
+                    const size_t row_size = dsv4_cache_row_size(layer.index_k);
+                    const size_t nbytes   = (size_t) n_rows * row_size;
+                    if (skip_block) {
+                        // advance io past this block's bytes without restoring
+                        skip_buf.resize(nbytes);
+                        io.read(skip_buf.data(), nbytes);
+                    } else {
+                        io.read_tensor(layer.index_k,
+                                dsv4_cache_offset(layer.index_k, dst_seq_id, 0), nbytes);
+                    }
+                }
+            }
+        }
+
+        if (!skip_block && seq_id != -1) {
+            restored_one = true;
+        }
+    }
 }
 
 llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const {
@@ -202,6 +806,41 @@ llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
     return mem_recr.get();
 }
 
+bool llama_memory_hybrid_iswa::has_dsv4_compressed_kv() const {
+    for (const auto & layer : dsv4_cache_layers) {
+        if (layer.n_comp != 0) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+uint32_t llama_memory_hybrid_iswa::get_dsv4_n_comp(int32_t il) const {
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+    return dsv4_cache_layers[il].n_comp;
+}
+
+ggml_tensor * llama_memory_hybrid_iswa::get_dsv4_attn_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const {
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+
+    ggml_tensor * t = dsv4_cache_layers[il].attn_k;
+    GGML_ASSERT(t != nullptr);
+
+    return ggml_view_2d(ctx, t, t->ne[0], t->ne[1], t->nb[1], seq_id*t->nb[2]);
+}
+
+ggml_tensor * llama_memory_hybrid_iswa::get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const {
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+
+    ggml_tensor * t = dsv4_cache_layers[il].index_k;
+    GGML_ASSERT(t != nullptr);
+
+    return ggml_view_2d(ctx, t, t->ne[0], t->ne[1], t->nb[1], seq_id*t->nb[2]);
+}
+
 //
 // llama_memory_hybrid_iswa_context
 //
@@ -209,6 +848,7 @@ llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
 llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {}
 
 llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) :
+    mem(mem),
     ctx_attn(mem->get_mem_attn()->init_full()),
     ctx_recr(mem->get_mem_recr()->init_full()),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
@@ -218,6 +858,7 @@ llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
         llama_memory_hybrid_iswa * mem,
                    llama_context * lctx,
                             bool   optimize) :
+    mem(mem),
     ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
     ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
@@ -228,6 +869,7 @@ llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
                     slot_info_vec_t   sinfos_base,
                     slot_info_vec_t   sinfos_swa,
           std::vector<llama_ubatch>   ubatches) :
+    mem(mem),
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
     ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)),
@@ -275,3 +917,22 @@ const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn()
 const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const {
     return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
 }
+
+bool llama_memory_hybrid_iswa_context::has_dsv4_compressed_kv() const {
+    return mem != nullptr && mem->has_dsv4_compressed_kv();
+}
+
+uint32_t llama_memory_hybrid_iswa_context::get_dsv4_n_comp(int32_t il) const {
+    GGML_ASSERT(mem != nullptr);
+    return mem->get_dsv4_n_comp(il);
+}
+
+ggml_tensor * llama_memory_hybrid_iswa_context::get_dsv4_attn_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const {
+    GGML_ASSERT(mem != nullptr);
+    return mem->get_dsv4_attn_k(ctx, il, seq_id);
+}
+
+ggml_tensor * llama_memory_hybrid_iswa_context::get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const {
+    GGML_ASSERT(mem != nullptr);
+    return mem->get_dsv4_index_k(ctx, il, seq_id);
+}
diff --git a/src/llama-memory-hybrid-iswa.h b/src/llama-memory-hybrid-iswa.h
index c9d3f9f57c50..8d11f081615c 100644
--- a/src/llama-memory-hybrid-iswa.h
+++ b/src/llama-memory-hybrid-iswa.h
@@ -83,11 +83,39 @@ class llama_memory_hybrid_iswa : public llama_memory_i {
     llama_kv_cache_iswa * get_mem_attn() const;
     llama_memory_recurrent * get_mem_recr() const;
 
+    bool has_dsv4_compressed_kv() const;
+    uint32_t get_dsv4_n_comp(int32_t il) const;
+    ggml_tensor * get_dsv4_attn_k (ggml_context * ctx, int32_t il, llama_seq_id seq_id) const;
+    ggml_tensor * get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const;
+
 private:
     const llama_hparams & hparams;
 
     const std::unique_ptr<llama_kv_cache_iswa> mem_attn;
     const std::unique_ptr<llama_memory_recurrent> mem_recr;
+
+    struct dsv4_cache_layer {
+        uint32_t n_comp = 0;
+        ggml_tensor * attn_k  = nullptr;
+        ggml_tensor * index_k = nullptr;
+    };
+
+    uint32_t dsv4_n_seq_max = 0;
+    std::vector<dsv4_cache_layer> dsv4_cache_layers;
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> dsv4_ctxs_bufs;
+
+    void dsv4_seq_rm  (llama_seq_id seq_id, llama_pos p0, llama_pos p1);
+    void dsv4_seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1);
+    void dsv4_seq_keep(llama_seq_id seq_id);
+
+    void dsv4_clear_seq(llama_seq_id seq_id);
+    void dsv4_clear_rows(llama_seq_id seq_id, int32_t il, llama_pos p0, llama_pos p1);
+    void dsv4_copy_rows (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, int32_t il, llama_pos p0, llama_pos p1);
+
+    uint32_t dsv4_n_state_rows(int32_t il, llama_seq_id seq_id) const;
+
+    void dsv4_state_write(llama_io_write_i & io, llama_seq_id seq_id) const;
+    void dsv4_state_read (llama_io_read_i  & io, llama_seq_id seq_id);
 };
 
 class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
@@ -128,7 +156,14 @@ class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
     const llama_kv_cache_iswa_context * get_attn() const;
     const llama_memory_recurrent_context * get_recr() const;
 
+    bool has_dsv4_compressed_kv() const;
+    uint32_t get_dsv4_n_comp(int32_t il) const;
+    ggml_tensor * get_dsv4_attn_k (ggml_context * ctx, int32_t il, llama_seq_id seq_id) const;
+    ggml_tensor * get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const;
+
 private:
+    llama_memory_hybrid_iswa * mem = nullptr;
+
     // the index of the next ubatch to process
     size_t i_next = 0;
 
diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
index fd305cab79c2..529022ded18d 100644
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -33,6 +33,7 @@ llama_memory_hybrid::llama_memory_hybrid(
     hparams(model.hparams),
     mem_attn(new llama_kv_cache(
         model,
+        model.hparams,
         type_k,
         type_v,
         v_trans,
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index c645d0785ab7..3343790070c4 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -392,6 +392,7 @@ namespace GGUFMeta {
         return get_arr(llm_kv(kid), result, required);
     }
 
+    template bool llama_model_loader::get_arr<std::vector<uint32_t>>(enum llm_kv kid, std::vector<uint32_t> & result, bool required);
     template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
 
     template<typename T>
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 8bf20a716eba..d35fb4db23a2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -170,6 +170,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_deepseek2(params);
         case LLM_ARCH_DEEPSEEK2OCR:
             return new llama_model_deepseek2ocr(params);
+        case LLM_ARCH_DEEPSEEK4:
+            return new llama_model_deepseek4(params);
         case LLM_ARCH_GLM_DSA:
             return new llama_model_glm_dsa(params);
         case LLM_ARCH_MISTRAL4:
@@ -777,6 +779,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_310B_A15B:     return "310B.A15B";
         case LLM_TYPE_355B_A32B:     return "355B.A32B";
         case LLM_TYPE_397B_A17B:     return "397B.A17B";
+        case LLM_TYPE_685B_A37B:     return "685B.A37B";
         case LLM_TYPE_744B_A40B:     return "744B.A40B";
         case LLM_TYPE_E2B:           return "E2B";
         case LLM_TYPE_E4B:           return "E4B";
@@ -1768,6 +1771,27 @@ void llama_model::print_info() const {
             LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
         }
 
+        if (arch == LLM_ARCH_DEEPSEEK4) {
+            LLAMA_LOG_INFO("%s: n_lora_q              = %d\n",     __func__, hparams.n_lora_q);
+            LLAMA_LOG_INFO("%s: n_lora_o              = %d\n",     __func__, hparams.n_lora_o);
+            LLAMA_LOG_INFO("%s: n_attn_out_groups     = %d\n",     __func__, hparams.n_attn_out_groups);
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
+            LLAMA_LOG_INFO("%s: n_swa                 = %d\n",     __func__, hparams.n_swa);
+            LLAMA_LOG_INFO("%s: compress_rope_freq_base = %.1f\n", __func__, hparams.compress_rope_freq_base);
+            LLAMA_LOG_INFO("%s: indexer_n_head        = %d\n",     __func__, hparams.indexer_n_head);
+            LLAMA_LOG_INFO("%s: indexer_head_size     = %d\n",     __func__, hparams.indexer_head_size);
+            LLAMA_LOG_INFO("%s: indexer_top_k         = %d\n",     __func__, hparams.indexer_top_k);
+            LLAMA_LOG_INFO("%s: n_hash_layers         = %d\n",     __func__, hparams.n_hash_layers);
+            LLAMA_LOG_INFO("%s: n_hc                  = %d\n",     __func__, hparams.n_hc);
+            LLAMA_LOG_INFO("%s: hc_sinkhorn_iters     = %d\n",     __func__, hparams.hc_sinkhorn_iters);
+            LLAMA_LOG_INFO("%s: hc_eps                = %.1e\n",   __func__, hparams.hc_eps);
+            LLAMA_LOG_INFO("%s: nextn_predict_layers  = %d\n",     __func__, hparams.nextn_predict_layers);
+            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
+            LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
+            LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+        }
+
         if (arch == LLM_ARCH_QWEN2MOE) {
             LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
             LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
@@ -1943,6 +1967,58 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             {
                 res = nullptr;
             } break;
+        case LLM_ARCH_DEEPSEEK4:
+            {
+                llama_memory_i::layer_filter_cb filter_attn = [&](int32_t) {
+                    return true;
+                };
+                llama_memory_i::layer_filter_cb filter_recr = [&](int32_t il) {
+                    return hparams.attn_compress_ratio[il] != 0;
+                };
+
+                // V4's standard SWA K cache, compressed-attention K cache
+                // (cache.attn_k), and indexer K cache (cache.index_k) all
+                // share the same `type_k` and must agree in dtype because
+                // src/models/deepseek4.cpp concatenates the SWA K view with
+                // the compressed K view via ggml_concat (which asserts
+                // a->type == b->type). Furthermore, V4's K activations are
+                // post-fp8-quantized (ggml_dsv4_fp8_kv_quantize), and q8_0's
+                // single fp16 scale per 32-element block cannot faithfully
+                // reproduce fp8-quantized value distributions -- pinning to
+                // q8_0 corrupts decode silently ("=" loops, "Mirror ..."
+                // garbage). Force fp16 unconditionally for V4 KV caches.
+                //
+                // NOTE: the user-facing WARN and the params.type_k/type_v
+                // coercion already happen earlier in llama_init_from_model
+                // (src/llama-context.cpp), BEFORE the shared
+                // SPLIT_MODE_TENSOR / V-quant-requires-FA validations run,
+                // so users requesting q8_0 KV with V4 don't trip those
+                // checks. The fp16 pin here is a defense-in-depth safety
+                // net for any direct callers of create_memory() that
+                // bypass llama_init_from_model. See
+                // docs/plans/v4-port-kv-q8-completion.md.
+                ggml_type v4_type_k = GGML_TYPE_F16;
+                ggml_type v4_type_v = GGML_TYPE_F16;
+
+                res = new llama_memory_hybrid_iswa(
+                        /* model             */ *this,
+                        /* attn_type_k       */ v4_type_k,
+                        /* attn_type_v       */ v4_type_v,
+                        /* attn_v_trans      */ !cparams.flash_attn,
+                        /* attn_swa_full     */ params.swa_full,
+                        /* attn_kv_size      */ cparams.n_ctx_seq,
+                        /* attn_n_ubatch     */ cparams.n_ubatch,
+                        /* attn_n_pad        */ 1,
+                        /* recurrent_type_r  */ GGML_TYPE_F32,
+                        /* recurrent_type_s  */ GGML_TYPE_F32,
+                        /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+                        /* n_seq_max         */ cparams.n_seq_max,
+                        /* n_rs_seq          */ cparams.n_rs_seq,
+                        /* offload           */ cparams.offload_kqv,
+                        /* unified           */ cparams.kv_unified,
+                        /* filter_attn       */ std::move(filter_attn),
+                        /* filter_recr       */ std::move(filter_recr));
+            } break;
         // Models that need standard caching should rely on recurrent/hybrid
         // checks
         default:
@@ -2069,6 +2145,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
                         res = new llama_kv_cache(
                                 *this,
+                                hparams,
                                 params.type_k,
                                 params.type_v,
                                 !cparams.flash_attn,
@@ -2258,6 +2335,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_DEEPSEEK2OCR:
+        case LLM_ARCH_DEEPSEEK4:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
diff --git a/src/llama-model.h b/src/llama-model.h
index 01c87a75271f..fe1ea971a16c 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -137,6 +137,7 @@ enum llm_type {
     LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
     LLM_TYPE_355B_A32B, // GLM-4.5
     LLM_TYPE_397B_A17B, // Qwen3.5
+    LLM_TYPE_685B_A37B, // DeepSeek V4-Flash
     LLM_TYPE_744B_A40B, // GLM-5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
@@ -257,6 +258,15 @@ struct llama_layer {
     struct ggml_tensor * wv_enc    = nullptr;
     struct ggml_tensor * wo_enc    = nullptr;
     struct ggml_tensor * wqkv_gate = nullptr;
+    struct ggml_tensor * attn_kv   = nullptr;
+    struct ggml_tensor * attn_wo_a = nullptr;
+    struct ggml_tensor * attn_wo_b = nullptr;
+
+    // DeepSeek V4 KV compressors
+    struct ggml_tensor * attn_compressor_ape  = nullptr;
+    struct ggml_tensor * attn_compressor_kv   = nullptr;
+    struct ggml_tensor * attn_compressor_gate = nullptr;
+    struct ggml_tensor * attn_compressor_norm = nullptr;
 
     // relative position bias
     struct ggml_tensor * attn_rel_b       = nullptr;
@@ -322,6 +332,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_b   = nullptr; // b3
     struct ggml_tensor * ffn_act    = nullptr;
     struct ggml_tensor * ffn_exp_probs_b = nullptr;
+    struct ggml_tensor * ffn_gate_tid2eid = nullptr;
 
     // mamba proj
     struct ggml_tensor * ssm_in  = nullptr;
@@ -483,6 +494,18 @@ struct llama_layer {
     struct ggml_tensor * indexer_proj     = nullptr;
     struct ggml_tensor * indexer_attn_k   = nullptr;
     struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
+    struct ggml_tensor * indexer_compressor_ape  = nullptr;
+    struct ggml_tensor * indexer_compressor_kv   = nullptr;
+    struct ggml_tensor * indexer_compressor_gate = nullptr;
+    struct ggml_tensor * indexer_compressor_norm = nullptr;
+
+    // DeepSeek V4 hyper-connection weights
+    struct ggml_tensor * hc_attn_base  = nullptr;
+    struct ggml_tensor * hc_attn_fn    = nullptr;
+    struct ggml_tensor * hc_attn_scale = nullptr;
+    struct ggml_tensor * hc_ffn_base   = nullptr;
+    struct ggml_tensor * hc_ffn_fn     = nullptr;
+    struct ggml_tensor * hc_ffn_scale  = nullptr;
 
     // gemma4 layer output scale
     struct ggml_tensor * out_scale = nullptr;
@@ -531,6 +554,9 @@ struct llama_model {
     struct ggml_tensor * output_norm_b   = nullptr;
     struct ggml_tensor * output          = nullptr;
     struct ggml_tensor * output_b        = nullptr;
+    struct ggml_tensor * output_hc_base  = nullptr;
+    struct ggml_tensor * output_hc_fn    = nullptr;
+    struct ggml_tensor * output_hc_scale = nullptr;
     struct ggml_tensor * output_norm_enc = nullptr;
 
 
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 43e05c3d56fe..6914df768c1e 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -292,6 +292,14 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
     // quantize only 2D and 3D tensors (experts)
     if (ggml_n_dims(tensor) < 2) return false;
 
+    // do not quantize integer tensors (e.g. DeepSeek V4 ffn_gate_tid2eid which
+    // stores expert-id indices as I32). Quantization makes no sense for non
+    // floating-point data; the dequantize path also explicitly rejects them.
+    if (tensor->type == GGML_TYPE_I8  ||
+        tensor->type == GGML_TYPE_I16 ||
+        tensor->type == GGML_TYPE_I32 ||
+        tensor->type == GGML_TYPE_I64) return false;
+
     const std::string name = ggml_get_name(tensor);
 
     // This used to be a regex, but <regex> has an extreme cost to compile times.
diff --git a/src/models/deepseek4.cpp b/src/models/deepseek4.cpp
new file mode 100644
index 000000000000..71d803014fd6
--- /dev/null
+++ b/src/models/deepseek4.cpp
@@ -0,0 +1,1583 @@
+#include "models.h"
+
+#include "ggml-backend.h"
+#include "llama-kv-cache-iswa.h"
+#include "llama-memory-hybrid-iswa.h"
+#include "llama-memory-recurrent.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+void llama_model_deepseek4::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
+    ml.get_key(LLM_KV_ATTENTION_OUTPUT_LORA_RANK,  hparams.n_lora_o);
+    ml.get_key(LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT,hparams.n_attn_out_groups);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
+        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS;
+    }
+
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa, false);
+    if (hparams.n_swa > 0) {
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        hparams.set_swa_pattern(0, false);
+        hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train;
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+    }
+    ml.get_key(LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE, hparams.compress_rope_freq_base, false);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,      hparams.indexer_n_head, false);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,      hparams.indexer_head_size, false);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,           hparams.indexer_top_k, false);
+    ml.get_key(LLM_KV_HASH_LAYER_COUNT,                  hparams.n_hash_layers);
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
+    ml.get_key(LLM_KV_HYPER_CONNECTION_COUNT,            hparams.n_hc);
+    ml.get_key(LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS,   hparams.hc_sinkhorn_iters);
+    ml.get_key(LLM_KV_HYPER_CONNECTION_EPS,              hparams.hc_eps);
+    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP,           hparams.swiglu_clamp_exp, hparams.n_layer, false);
+
+    std::vector<uint32_t> compress_ratios;
+    ml.get_arr(LLM_KV_ATTENTION_COMPRESS_RATIOS, compress_ratios);
+    if (compress_ratios.size() < hparams.n_layer) {
+        throw std::runtime_error(format("DeepSeek V4 compress ratio count mismatch: got %zu, expected %u",
+                    compress_ratios.size(), hparams.n_layer));
+    }
+    std::copy_n(compress_ratios.begin(), hparams.n_layer, hparams.attn_compress_ratio.begin());
+
+    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+        const uint32_t ratio = hparams.attn_compress_ratio[il];
+        if (ratio == 0) {
+            continue;
+        }
+
+        const uint32_t coff = ratio == 4 ? 2 : 1;
+        uint32_t state_size = coff * ratio * coff * hparams.n_embd_head_k(il);
+        if (ratio == 4) {
+            state_size += coff * ratio * coff * hparams.indexer_head_size;
+        }
+        hparams.dsv4_state_size = std::max(hparams.dsv4_state_size, state_size);
+    }
+
+    type = LLM_TYPE_UNKNOWN;
+}
+
+void llama_model_deepseek4::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t q_lora_rank       = hparams.n_lora_q;
+    const int64_t o_lora_rank       = hparams.n_lora_o;
+    const int64_t n_out_groups      = hparams.n_attn_out_groups;
+    const int64_t n_ff_exp          = hparams.n_ff_exp;
+    const int64_t n_expert_shared   = hparams.n_expert_shared;
+    const int64_t n_hc              = hparams.n_hc;
+    const int64_t hc_dim            = n_hc * n_embd;
+    const int64_t hc_mix            = (2 + n_hc) * n_hc;
+
+    if (n_out_groups == 0) {
+        throw std::runtime_error("DeepSeek V4 requires attention output groups");
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm     = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM,     "weight"), {n_embd}, 0);
+    output          = create_tensor(tn(LLM_TENSOR_OUTPUT,          "weight"), {n_embd, n_vocab}, 0);
+    output_hc_base  = create_tensor(tn(LLM_TENSOR_OUTPUT_HC_BASE,  "weight"), {n_hc}, 0);
+    output_hc_fn    = create_tensor(tn(LLM_TENSOR_OUTPUT_HC_FN,    "weight"), {hc_dim, n_hc}, 0);
+    output_hc_scale = create_tensor(tn(LLM_TENSOR_OUTPUT_HC_SCALE, "weight"), {1}, 0);
+
+    auto create_deepseek4_compressor = [&](llama_layer & layer, int bid, int64_t compress_ratio, int64_t head_size, bool indexer) {
+        const int64_t coff = compress_ratio == 4 ? 2 : 1;
+        ggml_tensor *& ape  = indexer ? layer.indexer_compressor_ape  : layer.attn_compressor_ape;
+        ggml_tensor *& kv   = indexer ? layer.indexer_compressor_kv   : layer.attn_compressor_kv;
+        ggml_tensor *& gate = indexer ? layer.indexer_compressor_gate : layer.attn_compressor_gate;
+        ggml_tensor *& norm = indexer ? layer.indexer_compressor_norm : layer.attn_compressor_norm;
+
+        ape  = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_APE  : LLM_TENSOR_ATTN_COMPRESSOR_APE,  "weight", bid), {coff * head_size, compress_ratio}, 0);
+        kv   = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_KV   : LLM_TENSOR_ATTN_COMPRESSOR_KV,   "weight", bid), {n_embd, coff * head_size}, 0);
+        gate = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_GATE : LLM_TENSOR_ATTN_COMPRESSOR_GATE, "weight", bid), {n_embd, coff * head_size}, 0);
+        norm = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_NORM : LLM_TENSOR_ATTN_COMPRESSOR_NORM, "weight", bid), {head_size}, 0);
+    };
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        const int64_t compress_ratio = hparams.attn_compress_ratio[i];
+
+        layer.hc_attn_base  = create_tensor(tn(LLM_TENSOR_HC_ATTN_BASE,  "weight", i), {hc_mix}, 0);
+        layer.hc_attn_fn    = create_tensor(tn(LLM_TENSOR_HC_ATTN_FN,    "weight", i), {hc_dim, hc_mix}, 0);
+        layer.hc_attn_scale = create_tensor(tn(LLM_TENSOR_HC_ATTN_SCALE, "weight", i), {3}, 0);
+        layer.hc_ffn_base   = create_tensor(tn(LLM_TENSOR_HC_FFN_BASE,   "weight", i), {hc_mix}, 0);
+        layer.hc_ffn_fn     = create_tensor(tn(LLM_TENSOR_HC_FFN_FN,     "weight", i), {hc_dim, hc_mix}, 0);
+        layer.hc_ffn_scale  = create_tensor(tn(LLM_TENSOR_HC_FFN_SCALE,  "weight", i), {3}, 0);
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
+        layer.ffn_norm       = create_tensor(tn(LLM_TENSOR_FFN_NORM,       "weight", i), {n_embd}, 0);
+        layer.attn_sinks     = create_tensor(tn(LLM_TENSOR_ATTN_SINKS,     "weight", i), {n_head}, 0);
+        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM,  "weight", i), {q_lora_rank}, 0);
+        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.wq_a    = create_tensor(tn(LLM_TENSOR_ATTN_Q_A,    "weight", i), {n_embd, q_lora_rank}, 0);
+        layer.wq_b    = create_tensor(tn(LLM_TENSOR_ATTN_Q_B,    "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+        layer.attn_kv = create_tensor(tn(LLM_TENSOR_ATTN_KV,     "weight", i), {n_embd, n_embd_head_k}, 0);
+        layer.attn_wo_a = create_tensor(tn(LLM_TENSOR_ATTN_OUT_A, "weight", i), {n_head * n_embd_head_v / n_out_groups, n_out_groups * o_lora_rank}, 0);
+        layer.attn_wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_B, "weight", i), {n_out_groups * o_lora_rank, n_embd}, 0);
+
+        if (compress_ratio > 0) {
+            create_deepseek4_compressor(layer, i, compress_ratio, n_embd_head_k, false);
+        }
+        if (compress_ratio == 4) {
+            layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, 0);
+            layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, 0);
+            create_deepseek4_compressor(layer, i, compress_ratio, hparams.indexer_head_size, true);
+        }
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+        if (static_cast<uint32_t>(i) < hparams.n_hash_layers) {
+            layer.ffn_gate_tid2eid = create_tensor(tn(LLM_TENSOR_FFN_GATE_TID2EID, "weight", i), {n_expert_used, n_vocab}, 0);
+            layer.ffn_exp_probs_b  = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B,  "bias",   i), {n_expert}, TENSOR_NOT_REQUIRED);
+        } else {
+            layer.ffn_exp_probs_b  = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B,  "bias",   i), {n_expert}, 0);
+            layer.ffn_gate_tid2eid = create_tensor(tn(LLM_TENSOR_FFN_GATE_TID2EID, "weight", i), {n_expert_used, n_vocab}, TENSOR_NOT_REQUIRED);
+        }
+
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd,   n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+
+        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd,   n_ff_exp * n_expert_shared}, 0);
+        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp * n_expert_shared, n_embd}, 0);
+        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd,   n_ff_exp * n_expert_shared}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_deepseek4::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+namespace {
+
+struct dsv4_hc_mix {
+    ggml_tensor * x;
+    ggml_tensor * mixes;
+    ggml_tensor * pre;
+    ggml_tensor * post;
+    ggml_tensor * comb;
+};
+
+struct dsv4_state_pair {
+    ggml_tensor * kv;
+    ggml_tensor * score;
+};
+
+struct dsv4_decode_compressor {
+    ggml_tensor * kv_state;
+    ggml_tensor * score_state;
+    ggml_tensor * kv_comp;
+};
+
+struct dsv4_state_layout {
+    int64_t width;
+    int64_t rows;
+    int64_t elems;
+};
+
+enum class dsv4_mask_kind {
+    RAW_WINDOW,
+    COMPRESS_CAUSAL,
+    ATTN_STATIC,
+};
+
+struct dsv4_mask_entry {
+    ggml_tensor   * tensor = nullptr;
+    dsv4_mask_kind kind;
+    int64_t         n_raw = 0;
+    int64_t         n_comp = 0;
+    int64_t         window = 0;
+    int64_t         ratio = 0;
+};
+
+class dsv4_graph_inputs : public llm_graph_input_i {
+public:
+    ggml_tensor * add_mask(
+            ggml_context  * ctx,
+            dsv4_mask_kind kind,
+            int64_t        n0,
+            int64_t        n1,
+            int64_t        n_raw,
+            int64_t        n_comp,
+            int64_t        window,
+            int64_t        ratio,
+            const char   * name) {
+        ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n0, n1, 1, 1);
+        ggml_set_input(t);
+        ggml_set_name(t, name);
+        masks.push_back({ t, kind, n_raw, n_comp, window, ratio });
+        return t;
+    }
+
+    void set_input(const llama_ubatch * ubatch) override {
+        for (const auto & mask : masks) {
+            GGML_ASSERT(mask.tensor != nullptr);
+            if (mask.tensor->buffer == nullptr) {
+                continue;
+            }
+
+            const int64_t n0 = mask.tensor->ne[0];
+            const int64_t n1 = mask.tensor->ne[1];
+
+            std::vector<float> data(n0*n1, -INFINITY);
+
+            switch (mask.kind) {
+                case dsv4_mask_kind::RAW_WINDOW:
+                    fill_raw_window(data, n0, n1, mask.window, ubatch);
+                    break;
+                case dsv4_mask_kind::COMPRESS_CAUSAL:
+                    fill_compress_causal(data, n0, n1, mask.ratio, 0, ubatch);
+                    break;
+                case dsv4_mask_kind::ATTN_STATIC:
+                    fill_raw_window(data, n0, n1, mask.window, ubatch);
+                    fill_compress_causal(data, n0, n1, mask.ratio, mask.n_raw, ubatch);
+                    break;
+            }
+
+            ggml_backend_tensor_set(mask.tensor, data.data(), 0, data.size()*sizeof(float));
+        }
+    }
+
+private:
+    static void fill_raw_window(
+            std::vector<float> & data,
+            int64_t              n0,
+            int64_t              n1,
+            int64_t              window,
+            const llama_ubatch * ubatch) {
+        GGML_ASSERT((int64_t) ubatch->n_tokens == n1);
+
+        for (int64_t iq = 0; iq < n1; ++iq) {
+            const llama_pos p1 = ubatch->pos ? ubatch->pos[iq] : (llama_pos) iq;
+
+            for (int64_t ik = 0; ik < std::min<int64_t>(n0, ubatch->n_tokens); ++ik) {
+                const llama_pos p0 = ubatch->pos ? ubatch->pos[ik] : (llama_pos) ik;
+
+                if (p0 > p1) {
+                    continue;
+                }
+
+                if (window > 0 && p1 - p0 >= window) {
+                    continue;
+                }
+
+                data[iq*n0 + ik] = 0.0f;
+            }
+        }
+    }
+
+    static void fill_compress_causal(
+            std::vector<float> & data,
+            int64_t              n0,
+            int64_t              n1,
+            int64_t              ratio,
+            int64_t              offset,
+            const llama_ubatch * ubatch) {
+        GGML_ASSERT(ratio > 0);
+
+        const int64_t n_comp = n0 - offset;
+        for (int64_t iq = 0; iq < n1; ++iq) {
+            const llama_pos p1 = ubatch->pos ? ubatch->pos[iq] : (llama_pos) iq;
+            const int64_t n_visible = (p1 + 1) / ratio;
+
+            for (int64_t ic = 0; ic < std::min<int64_t>(n_comp, n_visible); ++ic) {
+                data[iq*n0 + offset + ic] = 0.0f;
+            }
+        }
+    }
+
+    std::vector<dsv4_mask_entry> masks;
+};
+
+struct dsv4_rope_cfg {
+    int32_t n_ctx_orig;
+    float   freq_base;
+    float   freq_scale;
+    float   ext_factor;
+    float   attn_factor;
+    float   beta_fast;
+    float   beta_slow;
+};
+
+static ggml_tensor * dsv4_view_scale(ggml_context * ctx, ggml_tensor * scale, int64_t idx) {
+    return ggml_view_2d(ctx, scale, 1, 1, scale->nb[0], idx * scale->nb[0]);
+}
+
+static ggml_tensor * dsv4_add_scalar(ggml_context * ctx, ggml_tensor * x, float value) {
+    ggml_tensor * shape = x;
+    x = ggml_cont(ctx, x);
+    x = ggml_reshape_1d(ctx, x, ggml_nelements(x));
+    x = ggml_scale_bias(ctx, x, 1.0f, value);
+    return ggml_reshape(ctx, x, shape);
+}
+
+static ggml_tensor * dsv4_mul_scalar(ggml_context * ctx, ggml_tensor * x, float value) {
+    ggml_tensor * shape = x;
+    x = ggml_cont(ctx, x);
+    x = ggml_reshape_1d(ctx, x, ggml_nelements(x));
+    x = ggml_scale(ctx, x, value);
+    return ggml_reshape(ctx, x, shape);
+}
+
+static ggml_tensor * dsv4_arange_i32(ggml_context * ctx, int64_t begin, int64_t end) {
+    ggml_tensor * t = ggml_arange(ctx, (float) begin, (float) end, 1.0f);
+    return ggml_cast(ctx, t, GGML_TYPE_I32);
+}
+
+static ggml_tensor * dsv4_new_filled_2d(ggml_context * ctx, int64_t n0, int64_t n1, float value) {
+    return ggml_fill(ctx, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n0, n1), value);
+}
+
+static ggml_tensor * dsv4_new_filled_3d(ggml_context * ctx, int64_t n0, int64_t n1, int64_t n2, float value) {
+    return ggml_fill(ctx, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n0, n1, n2), value);
+}
+
+static dsv4_state_layout dsv4_make_state_layout(int64_t compress_ratio, int64_t head_dim) {
+    const int64_t coff = compress_ratio == 4 ? 2 : 1;
+    const int64_t width = coff * head_dim;
+    const int64_t rows  = coff * compress_ratio;
+    return { width, rows, width * rows };
+}
+
+static ggml_tensor * dsv4_view_cols(
+        ggml_context * ctx,
+        ggml_tensor  * x,
+        int64_t        n0,
+        int64_t        n1,
+        int64_t        off0,
+        int64_t        off1) {
+    return ggml_view_2d(ctx, x, n0, n1, x->nb[1], off1*x->nb[1] + off0*x->nb[0]);
+}
+
+static ggml_tensor * dsv4_view_state_segment(
+        ggml_context * ctx,
+        ggml_tensor  * state,
+        int64_t        offset,
+        int64_t        width,
+        int64_t        rows) {
+    return ggml_view_2d(ctx, state, width, rows, width*state->nb[0], offset*state->nb[0]);
+}
+
+static void dsv4_store_state_segment(
+        ggml_context * ctx,
+        ggml_cgraph  * gf,
+        ggml_tensor  * src,
+        ggml_tensor  * dst,
+        int64_t        state_size,
+        int64_t        head,
+        int64_t        offset) {
+    const int64_t n = ggml_nelements(src);
+    src = ggml_cont(ctx, src);
+    src = ggml_reshape_1d(ctx, src, n);
+
+    ggml_tensor * view = ggml_view_1d(ctx, dst, n, (head*state_size + offset)*ggml_element_size(dst));
+    ggml_build_forward_expand(gf, ggml_cpy(ctx, src, view));
+}
+
+static void dsv4_store_cache_rows(
+        ggml_context * ctx,
+        ggml_cgraph  * gf,
+        ggml_tensor  * cache,
+        ggml_tensor  * src,
+        int64_t        row_start,
+        int64_t        n_rows) {
+    if (n_rows <= 0) {
+        return;
+    }
+
+    src = ggml_cont(ctx, src);
+    src = ggml_reshape_2d(ctx, src, cache->ne[0], n_rows);
+
+    // Avoid ggml_set_rows here: on multi-GPU, sched routes set_rows by SOURCE
+    // device, but the cache destination has its own device affinity → illegal
+    // memory access when those differ. ggml_cpy into a contiguous view of
+    // cache routes correctly by dst affinity (same pattern as
+    // dsv4_store_state_segment, which works in production multi-GPU).
+    ggml_tensor * cache_view = ggml_view_2d(ctx, cache,
+            cache->ne[0], n_rows,
+            cache->nb[1],
+            row_start * cache->nb[1]);
+    ggml_build_forward_expand(gf, ggml_cpy(ctx, src, cache_view));
+}
+
+static dsv4_rope_cfg dsv4_make_rope_cfg(
+        const llama_hparams & hparams,
+        const llama_cparams  & cparams,
+        uint32_t              compress_ratio) {
+    if (compress_ratio == 0) {
+        return {
+            0,
+            hparams.rope_freq_base_train,
+            1.0f,
+            0.0f,
+            1.0f,
+            cparams.yarn_beta_fast,
+            cparams.yarn_beta_slow,
+        };
+    }
+
+    float attn_factor = 1.0f;
+    if (cparams.yarn_ext_factor != 0.0f && cparams.rope_freq_scale > 0.0f) {
+        // DeepSeek V4 uses YaRN-style frequency interpolation for compressed RoPE,
+        // but the reference implementation does not apply YaRN's magnitude scale.
+        attn_factor /= 1.0f + 0.1f * std::log(1.0f / cparams.rope_freq_scale);
+    }
+
+    return {
+        (int32_t) cparams.n_ctx_orig_yarn,
+        hparams.compress_rope_freq_base > 0.0f ? hparams.compress_rope_freq_base : cparams.rope_freq_base,
+        cparams.rope_freq_scale,
+        cparams.yarn_ext_factor,
+        attn_factor,
+        cparams.yarn_beta_fast,
+        cparams.yarn_beta_slow,
+    };
+}
+
+static ggml_tensor * dsv4_view_base(ggml_context * ctx, ggml_tensor * base, int64_t n, int64_t off) {
+    return ggml_view_2d(ctx, base, n, 1, base->nb[0], off * base->nb[0]);
+}
+
+static ggml_tensor * dsv4_apply_rope_tail(
+        ggml_context * ctx,
+        ggml_tensor  * x,
+        ggml_tensor  * inp_pos,
+        int64_t        n_embd_head,
+        int64_t        n_head,
+        int64_t        n_tokens,
+        int64_t        n_rot,
+        int            rope_type,
+        int32_t        n_ctx_orig,
+        float          freq_base,
+        float          freq_scale,
+        float          ext_factor,
+        float          attn_factor,
+        float          beta_fast,
+        float          beta_slow,
+        bool           inverse) {
+    GGML_ASSERT(x->ne[0] == n_embd_head);
+    GGML_ASSERT(x->ne[1] == n_head);
+    GGML_ASSERT(x->ne[2] == n_tokens);
+
+    if (n_rot == n_embd_head) {
+        return inverse
+            ? ggml_rope_ext_back(ctx, x, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow)
+            : ggml_rope_ext     (ctx, x, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    }
+
+    const int64_t n_nope = n_embd_head - n_rot;
+    GGML_ASSERT(n_nope > 0);
+
+    return ggml_dsv4_rope_tail(ctx, x, inp_pos, nullptr, n_rot, rope_type,
+            n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
+            beta_fast, beta_slow, inverse);
+}
+
+static dsv4_hc_mix dsv4_hc_pre(
+        ggml_context * ctx,
+        ggml_tensor  * x,
+        ggml_tensor  * hc_fn,
+        ggml_tensor  * hc_scale,
+        ggml_tensor  * hc_base,
+        int64_t        n_embd,
+        int64_t        n_hc,
+        int64_t        n_tokens,
+        float          norm_eps,
+        int            sinkhorn_iters,
+        float          hc_eps) {
+    const int64_t hc_dim = n_embd * n_hc;
+    ggml_tensor * flat = ggml_cont(ctx, ggml_reshape_2d(ctx, x, hc_dim, n_tokens));
+    flat = ggml_rms_norm(ctx, flat, norm_eps);
+    ggml_tensor * mixes = ggml_mul_mat(ctx, hc_fn, flat); // [mix_hc, n_tokens]
+    ggml_tensor * split = ggml_dsv4_hc_split_sinkhorn(ctx, mixes, hc_scale, hc_base, n_hc, sinkhorn_iters, hc_eps);
+    ggml_tensor * pre = ggml_view_2d(ctx, split, n_hc, n_tokens, split->nb[1], 0);
+    ggml_tensor * post = ggml_view_2d(ctx, split, n_hc, n_tokens, split->nb[1], n_hc * split->nb[0]);
+    ggml_tensor * comb = ggml_view_2d(ctx, split, n_hc * n_hc, n_tokens, split->nb[1], 2 * n_hc * split->nb[0]);
+    if (n_tokens != 1) {
+        pre = ggml_cont(ctx, pre);
+        post = ggml_cont(ctx, post);
+        comb = ggml_cont(ctx, comb);
+    }
+    comb = ggml_reshape_3d(ctx, comb, n_hc, n_hc, n_tokens); // [src_hc, dst_hc, n_tokens]
+    ggml_tensor * y = ggml_dsv4_hc_weighted_sum(ctx, x, pre);
+    return { y, mixes, pre, post, comb };
+}
+
+static ggml_tensor * dsv4_hc_post(
+        ggml_context * ctx,
+        ggml_tensor  * x,
+        ggml_tensor  * residual,
+        ggml_tensor  * post,
+        ggml_tensor  * comb,
+        int64_t        n_embd,
+        int64_t        n_hc,
+        int64_t        n_tokens) {
+    GGML_ASSERT(x->ne[0] == n_embd);
+    GGML_ASSERT(x->ne[1] == n_tokens);
+    GGML_ASSERT(residual->ne[0] == n_embd);
+    GGML_ASSERT(residual->ne[1] == n_hc);
+    GGML_ASSERT(residual->ne[2] == n_tokens);
+    GGML_ASSERT(post->ne[0] == n_hc);
+    GGML_ASSERT(post->ne[1] == n_tokens);
+    GGML_ASSERT(comb->ne[0] == n_hc);
+    GGML_ASSERT(comb->ne[1] == n_hc);
+    GGML_ASSERT(comb->ne[2] == n_tokens);
+
+    return ggml_dsv4_hc_expand(ctx, x, residual, post, comb);
+}
+
+static ggml_tensor * dsv4_hc_head(
+        ggml_context * ctx,
+        ggml_tensor  * x,
+        ggml_tensor  * hc_fn,
+        ggml_tensor  * hc_scale,
+        ggml_tensor  * hc_base,
+        int64_t        n_embd,
+        int64_t        n_hc,
+        int64_t        n_tokens,
+        float          norm_eps,
+        float          hc_eps) {
+    const int64_t hc_dim = n_embd * n_hc;
+
+    ggml_tensor * flat = ggml_cont(ctx, ggml_reshape_2d(ctx, x, hc_dim, n_tokens));
+    flat = ggml_rms_norm(ctx, flat, norm_eps);
+
+    ggml_tensor * pre = ggml_mul_mat(ctx, hc_fn, flat); // [hc, n_tokens]
+    pre = ggml_mul(ctx, pre, dsv4_view_scale(ctx, hc_scale, 0));
+    pre = ggml_add(ctx, pre, dsv4_view_base(ctx, hc_base, n_hc, 0));
+    pre = dsv4_add_scalar(ctx, ggml_sigmoid(ctx, pre), hc_eps);
+
+    return ggml_dsv4_hc_weighted_sum(ctx, x, pre);
+}
+
+static ggml_tensor * dsv4_grouped_out(
+        ggml_context * ctx,
+        ggml_tensor  * o,
+        ggml_tensor  * wo_a,
+        ggml_tensor  * wo_b,
+        int64_t        n_embd_head,
+        int64_t        n_head,
+        int64_t        n_groups,
+        int64_t        o_lora_rank,
+        int64_t        n_tokens) {
+    GGML_ASSERT(n_head % n_groups == 0);
+
+    const int64_t group_heads = n_head / n_groups;
+    const int64_t group_dim   = n_embd_head * group_heads;
+
+    o = ggml_cont(ctx, o);
+    o = ggml_reshape_3d(ctx, o, group_dim, n_groups, n_tokens);
+
+    ggml_tensor * wo_a_g = ggml_reshape_3d(ctx, wo_a, group_dim, o_lora_rank, n_groups);
+    ggml_tensor * ids = ggml_arange(ctx, 0.0f, float(n_groups), 1.0f);
+    ids = ggml_cast(ctx, ids, GGML_TYPE_I32);
+    ids = ggml_repeat_4d(ctx, ids, n_groups, n_tokens, 1, 1);
+
+    ggml_tensor * low = ggml_mul_mat_id(ctx, wo_a_g, o, ids); // [o_lora_rank, n_groups, n_tokens]
+    low = ggml_reshape_2d(ctx, low, o_lora_rank * n_groups, n_tokens);
+
+    return ggml_mul_mat(ctx, wo_b, low);
+}
+
+static ggml_tensor * dsv4_softmax_pool_ratio(
+        ggml_context * ctx,
+        ggml_tensor  * kv,
+        ggml_tensor  * score) {
+    score = ggml_soft_max(ctx, score);
+    ggml_tensor * pooled = ggml_mul(ctx, kv, score);
+    pooled = ggml_sum_rows(ctx, pooled);
+    return ggml_reshape_2d(ctx, pooled, kv->ne[1], kv->ne[2]);
+}
+
+static ggml_tensor * dsv4_shift_overlap_state(
+        ggml_context * ctx,
+        ggml_tensor  * x,
+        float          pad_value) {
+    const int64_t n_embd  = x->ne[0];
+    const int64_t ratio   = x->ne[1];
+    const int64_t n_comp  = x->ne[2];
+
+    ggml_tensor * first = ggml_view_3d(ctx, x, n_embd, ratio, 1,
+            x->nb[1], x->nb[2], 0);
+    ggml_tensor * pad = ggml_fill(ctx, ggml_cont(ctx, first), pad_value);
+
+    if (n_comp == 1) {
+        return pad;
+    }
+
+    ggml_tensor * prev = ggml_view_3d(ctx, x, n_embd, ratio, n_comp - 1,
+            x->nb[1], x->nb[2], 0);
+    return ggml_concat(ctx, pad, prev, 2);
+}
+
+static ggml_tensor * dsv4_build_compressor_prefill(
+        ggml_context       * ctx,
+        ggml_tensor        * x,
+        ggml_tensor        * wkv,
+        ggml_tensor        * wgate,
+        ggml_tensor        * ape,
+        ggml_tensor        * norm,
+        ggml_tensor        * pos,
+        int64_t              n_embd_head,
+        int64_t              n_rot,
+        int64_t              n_tokens,
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps) {
+    GGML_ASSERT(compress_ratio > 0);
+    const int64_t n_comp = n_tokens / compress_ratio;
+    GGML_ASSERT(n_comp > 0);
+
+    const int64_t coff = compress_ratio == 4 ? 2 : 1;
+    const int64_t n_kv = coff * n_embd_head;
+    const int64_t cutoff = n_comp * compress_ratio;
+
+    ggml_tensor * kv = ggml_mul_mat(ctx, wkv, x);       // [coff*head_dim, n_tokens]
+    ggml_tensor * score = ggml_mul_mat(ctx, wgate, x);  // [coff*head_dim, n_tokens]
+
+    kv = ggml_view_3d(ctx, kv, n_kv, compress_ratio, n_comp,
+            kv->nb[1],
+            kv->nb[1] * compress_ratio,
+            0);
+    score = ggml_view_3d(ctx, score, n_kv, compress_ratio, n_comp,
+            score->nb[1],
+            score->nb[1] * compress_ratio,
+            0);
+    GGML_ASSERT(cutoff <= n_tokens);
+
+    ggml_tensor * ape_f = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+    score = ggml_add(ctx, score, ggml_repeat(ctx, ape_f, score));
+
+    if (coff == 1) {
+        kv = ggml_cont(ctx, ggml_permute(ctx, kv, 1, 0, 2, 3));       // [ratio, head_dim, n_comp]
+        score = ggml_cont(ctx, ggml_permute(ctx, score, 1, 0, 2, 3)); // [ratio, head_dim, n_comp]
+        kv = dsv4_softmax_pool_ratio(ctx, kv, score);                // [head_dim, n_comp]
+    } else {
+        ggml_tensor * kv_prev = ggml_view_3d(ctx, kv, n_embd_head, compress_ratio, n_comp,
+                kv->nb[1], kv->nb[2], 0);
+        ggml_tensor * kv_curr = ggml_view_3d(ctx, kv, n_embd_head, compress_ratio, n_comp,
+                kv->nb[1], kv->nb[2], n_embd_head * kv->nb[0]);
+        ggml_tensor * score_prev = ggml_view_3d(ctx, score, n_embd_head, compress_ratio, n_comp,
+                score->nb[1], score->nb[2], 0);
+        ggml_tensor * score_curr = ggml_view_3d(ctx, score, n_embd_head, compress_ratio, n_comp,
+                score->nb[1], score->nb[2], n_embd_head * score->nb[0]);
+
+        kv_prev    = dsv4_shift_overlap_state(ctx, kv_prev,    0.0f);
+        score_prev = dsv4_shift_overlap_state(ctx, score_prev, -INFINITY);
+
+        kv_prev    = ggml_cont(ctx, ggml_permute(ctx, kv_prev,    1, 0, 2, 3)); // [ratio, head_dim, n_comp]
+        kv_curr    = ggml_cont(ctx, ggml_permute(ctx, kv_curr,    1, 0, 2, 3));
+        score_prev = ggml_cont(ctx, ggml_permute(ctx, score_prev, 1, 0, 2, 3));
+        score_curr = ggml_cont(ctx, ggml_permute(ctx, score_curr, 1, 0, 2, 3));
+
+        kv    = ggml_concat(ctx, kv_prev,    kv_curr,    0); // [2*ratio, head_dim, n_comp]
+        score = ggml_concat(ctx, score_prev, score_curr, 0);
+        kv = dsv4_softmax_pool_ratio(ctx, kv, score);        // [head_dim, n_comp]
+    }
+
+    kv = ggml_rms_norm(ctx, kv, norm_eps);
+    kv = ggml_mul(ctx, kv, norm);
+    kv = ggml_reshape_3d(ctx, kv, n_embd_head, 1, n_comp);
+
+    kv = dsv4_apply_rope_tail(ctx, kv, pos,
+            n_embd_head, 1, n_comp, n_rot, rope_type,
+            rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+            rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+
+    return kv;
+}
+
+static dsv4_state_pair dsv4_build_compressor_prefill_state(
+        ggml_context * ctx,
+        ggml_tensor  * x,
+        ggml_tensor  * wkv,
+        ggml_tensor  * wgate,
+        ggml_tensor  * ape,
+        int64_t        head_dim,
+        int64_t        n_tokens,
+        int64_t        compress_ratio) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+
+    const int64_t cutoff    = (n_tokens / compress_ratio) * compress_ratio;
+    const int64_t remainder = n_tokens - cutoff;
+
+    ggml_tensor * kv    = ggml_mul_mat(ctx, wkv,    x); // [width, n_tokens]
+    ggml_tensor * score = ggml_mul_mat(ctx, wgate,  x);
+    ggml_tensor * ape_f = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+
+    if (compress_ratio == 4) {
+        ggml_tensor * kv_prev    = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, 0.0f);
+        ggml_tensor * score_prev = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, -INFINITY);
+
+        if (cutoff >= compress_ratio) {
+            kv_prev = ggml_view_2d(ctx, kv, layout.width, compress_ratio, kv->nb[1], (cutoff - compress_ratio)*kv->nb[1]);
+            score_prev = ggml_view_2d(ctx, score, layout.width, compress_ratio, score->nb[1], (cutoff - compress_ratio)*score->nb[1]);
+            score_prev = ggml_add(ctx, score_prev, ape_f);
+        }
+
+        ggml_tensor * kv_curr    = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, 0.0f);
+        ggml_tensor * score_curr = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, -INFINITY);
+
+        if (remainder > 0) {
+            ggml_tensor * kv_rem = ggml_view_2d(ctx, kv, layout.width, remainder, kv->nb[1], cutoff*kv->nb[1]);
+            ggml_tensor * sc_rem = ggml_view_2d(ctx, score, layout.width, remainder, score->nb[1], cutoff*score->nb[1]);
+            sc_rem = ggml_add(ctx, sc_rem, ggml_view_2d(ctx, ape_f, layout.width, remainder, ape_f->nb[1], 0));
+
+            if (remainder == compress_ratio) {
+                kv_curr = kv_rem;
+                score_curr = sc_rem;
+            } else {
+                kv_curr = ggml_concat(ctx, kv_rem,
+                        dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, 0.0f), 1);
+                score_curr = ggml_concat(ctx, sc_rem,
+                        dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, -INFINITY), 1);
+            }
+        }
+
+        return {
+            ggml_concat(ctx, kv_prev,    kv_curr,    1),
+            ggml_concat(ctx, score_prev, score_curr, 1),
+        };
+    }
+
+    ggml_tensor * kv_state    = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, 0.0f);
+    ggml_tensor * score_state = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, -INFINITY);
+
+    if (remainder > 0) {
+        ggml_tensor * kv_rem = ggml_view_2d(ctx, kv, layout.width, remainder, kv->nb[1], cutoff*kv->nb[1]);
+        ggml_tensor * sc_rem = ggml_view_2d(ctx, score, layout.width, remainder, score->nb[1], cutoff*score->nb[1]);
+        sc_rem = ggml_add(ctx, sc_rem, ggml_view_2d(ctx, ape_f, layout.width, remainder, ape_f->nb[1], 0));
+
+        if (remainder == compress_ratio) {
+            kv_state = kv_rem;
+            score_state = sc_rem;
+        } else {
+            kv_state = ggml_concat(ctx, kv_rem,
+                    dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, 0.0f), 1);
+            score_state = ggml_concat(ctx, sc_rem,
+                    dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, -INFINITY), 1);
+        }
+    }
+
+    return { kv_state, score_state };
+}
+
+static ggml_tensor * dsv4_pool_decode_state(
+        ggml_context * ctx,
+        ggml_tensor  * kv,
+        ggml_tensor  * score,
+        ggml_tensor  * norm,
+        ggml_tensor  * pos,
+        int64_t        head_dim,
+        int64_t        n_rot,
+        int            rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float          norm_eps) {
+    const int64_t n_rows = kv->ne[1];
+    kv    = ggml_reshape_3d(ctx, ggml_cont(ctx, ggml_transpose(ctx, kv)),    n_rows, head_dim, 1);
+    score = ggml_reshape_3d(ctx, ggml_cont(ctx, ggml_transpose(ctx, score)), n_rows, head_dim, 1);
+
+    ggml_tensor * pooled = dsv4_softmax_pool_ratio(ctx, kv, score);
+    pooled = ggml_rms_norm(ctx, pooled, norm_eps);
+    pooled = ggml_mul(ctx, pooled, norm);
+    pooled = ggml_reshape_3d(ctx, pooled, head_dim, 1, 1);
+
+    return dsv4_apply_rope_tail(ctx, pooled, pos,
+            head_dim, 1, 1, n_rot, rope_type,
+            rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+            rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+}
+
+static dsv4_decode_compressor dsv4_build_compressor_decode_projected(
+        ggml_context       * ctx,
+        ggml_tensor        * kv_cur,
+        ggml_tensor        * sc_cur,
+        ggml_tensor        * prev_kv_state,
+        ggml_tensor        * prev_score_state,
+        ggml_tensor        * norm,
+        int64_t              head_dim,
+        int64_t              n_rot,
+        int64_t              pos,
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps);
+
+static dsv4_decode_compressor dsv4_build_compressor_decode(
+        ggml_context       * ctx,
+        ggml_tensor        * x,
+        ggml_tensor        * prev_kv_state,
+        ggml_tensor        * prev_score_state,
+        ggml_tensor        * wkv,
+        ggml_tensor        * wgate,
+        ggml_tensor        * ape,
+        ggml_tensor        * norm,
+        int64_t              head_dim,
+        int64_t              n_rot,
+        int64_t              pos,
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+    const int64_t pos_mod = pos % compress_ratio;
+
+    ggml_tensor * kv_cur = ggml_mul_mat(ctx, wkv, x);       // [width, 1]
+    ggml_tensor * sc_cur = ggml_mul_mat(ctx, wgate, x);
+    ggml_tensor * ape_f  = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+    sc_cur = ggml_add(ctx, sc_cur, ggml_view_2d(ctx, ape_f, layout.width, 1, ape_f->nb[1], pos_mod*ape_f->nb[1]));
+
+    return dsv4_build_compressor_decode_projected(ctx,
+            kv_cur, sc_cur,
+            prev_kv_state, prev_score_state,
+            norm,
+            head_dim, n_rot, pos, compress_ratio,
+            rope_type, rope_cfg, norm_eps);
+}
+
+static dsv4_decode_compressor dsv4_build_compressor_decode_projected(
+        ggml_context       * ctx,
+        ggml_tensor        * kv_cur,
+        ggml_tensor        * sc_cur,
+        ggml_tensor        * prev_kv_state,
+        ggml_tensor        * prev_score_state,
+        ggml_tensor        * norm,
+        int64_t              head_dim,
+        int64_t              n_rot,
+        int64_t              pos,
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+    const int64_t pos_mod = pos % compress_ratio;
+    const int64_t row = compress_ratio == 4 ? compress_ratio + pos_mod : pos_mod;
+    const bool should_compress = (pos + 1) % compress_ratio == 0;
+
+    // Single-row write via cpy-into-view. ggml_set_rows would crash on
+    // multi-GPU (sched routes by src device while dst is on a different
+    // device; see dsv4_store_cache_rows for the same problem and fix).
+    //
+    // We need to return a FULL-shape view of dst (downstream code at
+    // dsv4_view_cols slices the full state by columns/rows) AND establish
+    // a data dependency on the cpy. We mimic ggml_set_rows's internal
+    // construction: create a view_tensor of dst (which inherits dst's full
+    // shape), then manually set src[0] to the cpy result so sched orders
+    // the cpy before any consumer reading from this view.
+    auto cpy_into_row = [&](ggml_tensor * dst, ggml_tensor * row_src) -> ggml_tensor * {
+        ggml_tensor * row_view = ggml_view_2d(ctx, dst,
+                dst->ne[0], 1,
+                dst->nb[1],
+                row * dst->nb[1]);
+        ggml_tensor * cpy = ggml_cpy(ctx, row_src, row_view);
+        ggml_tensor * full_state = ggml_view_tensor(ctx, dst);
+        full_state->src[0] = cpy;  // dependency: full_state's consumers wait for cpy
+        return full_state;
+    };
+    ggml_tensor * kv_state    = cpy_into_row(prev_kv_state,    kv_cur);
+    ggml_tensor * score_state = cpy_into_row(prev_score_state, sc_cur);
+    ggml_tensor * kv_comp = nullptr;
+
+    if (should_compress) {
+        ggml_tensor * kv_pool;
+        ggml_tensor * score_pool;
+
+        if (compress_ratio == 4) {
+            ggml_tensor * kv_prev = dsv4_view_cols(ctx, kv_state,    head_dim, compress_ratio, 0,        0);
+            ggml_tensor * kv_curr = dsv4_view_cols(ctx, kv_state,    head_dim, compress_ratio, head_dim, compress_ratio);
+            ggml_tensor * sc_prev = dsv4_view_cols(ctx, score_state, head_dim, compress_ratio, 0,        0);
+            ggml_tensor * sc_curr = dsv4_view_cols(ctx, score_state, head_dim, compress_ratio, head_dim, compress_ratio);
+
+            kv_pool    = ggml_concat(ctx, kv_prev, kv_curr, 1);
+            score_pool = ggml_concat(ctx, sc_prev, sc_curr, 1);
+
+            ggml_tensor * shifted_kv    = dsv4_view_cols(ctx, kv_state,    layout.width, compress_ratio, 0, compress_ratio);
+            ggml_tensor * shifted_score = dsv4_view_cols(ctx, score_state, layout.width, compress_ratio, 0, compress_ratio);
+            kv_state    = ggml_concat(ctx, shifted_kv,    shifted_kv,    1);
+            score_state = ggml_concat(ctx, shifted_score, shifted_score, 1);
+        } else {
+            kv_pool    = kv_state;
+            score_pool = score_state;
+        }
+
+        ggml_tensor * comp_pos = dsv4_arange_i32(ctx, pos + 1 - compress_ratio, pos + 2 - compress_ratio);
+        kv_comp = dsv4_pool_decode_state(ctx, kv_pool, score_pool, norm, comp_pos,
+                head_dim, n_rot, rope_type, rope_cfg, norm_eps);
+    }
+
+    return { kv_state, score_state, kv_comp };
+}
+
+static dsv4_decode_compressor dsv4_build_compressor_decode_chunk(
+        ggml_context       * ctx,
+        ggml_tensor        * x,
+        ggml_tensor        * prev_kv_state,
+        ggml_tensor        * prev_score_state,
+        ggml_tensor        * wkv,
+        ggml_tensor        * wgate,
+        ggml_tensor        * ape,
+        ggml_tensor        * norm,
+        const llama_ubatch & ubatch,
+        int64_t              head_dim,
+        int64_t              n_rot,
+        int64_t              n_tokens,
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+
+    ggml_tensor * kv_all = ggml_mul_mat(ctx, wkv,   x); // [width, n_tokens]
+    ggml_tensor * sc_all = ggml_mul_mat(ctx, wgate, x);
+    ggml_tensor * ape_f  = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+
+    ggml_tensor * kv_state    = prev_kv_state;
+    ggml_tensor * score_state = prev_score_state;
+    ggml_tensor * kv_comp     = nullptr;
+
+    for (int64_t i = 0; i < n_tokens; ++i) {
+        const llama_pos pos = ubatch.pos ? ubatch.pos[i] : (llama_pos) i;
+        const int64_t pos_mod = pos % compress_ratio;
+
+        ggml_tensor * kv_cur = ggml_view_2d(ctx, kv_all, layout.width, 1, kv_all->nb[1], i*kv_all->nb[1]);
+        ggml_tensor * sc_cur = ggml_view_2d(ctx, sc_all, layout.width, 1, sc_all->nb[1], i*sc_all->nb[1]);
+        sc_cur = ggml_add(ctx, sc_cur, ggml_view_2d(ctx, ape_f, layout.width, 1, ape_f->nb[1], pos_mod*ape_f->nb[1]));
+
+        dsv4_decode_compressor dec = dsv4_build_compressor_decode_projected(ctx,
+                kv_cur,
+                sc_cur,
+                kv_state,
+                score_state,
+                norm,
+                head_dim,
+                n_rot,
+                pos,
+                compress_ratio,
+                rope_type,
+                rope_cfg,
+                norm_eps);
+
+        kv_state    = dec.kv_state;
+        score_state = dec.score_state;
+        if (dec.kv_comp != nullptr) {
+            kv_comp = kv_comp == nullptr ? dec.kv_comp : ggml_concat(ctx, kv_comp, dec.kv_comp, 2);
+        }
+    }
+
+    return { kv_state, score_state, kv_comp };
+}
+
+static ggml_tensor * dsv4_build_indexer_scores_prefill(
+        ggml_context       * ctx,
+        ggml_tensor        * x,
+        ggml_tensor        * qr,
+        ggml_tensor        * index_kv,
+        ggml_tensor        * wq_b,
+        ggml_tensor        * wproj,
+        ggml_tensor        * pos,
+        ggml_tensor        * causal_mask,
+        int64_t              n_index_head,
+        int64_t              n_index_head_size,
+        int64_t              n_tokens,
+        int64_t              n_rot,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg) {
+    ggml_tensor * q = ggml_mul_mat(ctx, wq_b, qr);
+    q = ggml_reshape_3d(ctx, q, n_index_head_size, n_index_head, n_tokens);
+    q = dsv4_apply_rope_tail(ctx, q, pos,
+            n_index_head_size, n_index_head, n_tokens, n_rot, rope_type,
+            rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+            rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+
+    ggml_tensor * k = ggml_permute(ctx, index_kv, 0, 2, 1, 3); // [head_dim, n_comp, 1]
+    q = ggml_permute(ctx, q, 0, 2, 1, 3);                     // [head_dim, n_tokens, n_heads]
+
+    ggml_tensor * score = ggml_mul_mat(ctx, k, q);            // [n_comp, n_tokens, n_heads]
+    score = ggml_relu(ctx, score);
+
+    ggml_tensor * weights = ggml_mul_mat(ctx, wproj, x);      // [n_heads, n_tokens]
+    const float scale = 1.0f / std::sqrt(float(n_index_head_size) * float(n_index_head));
+    weights = dsv4_mul_scalar(ctx, weights, scale);
+    weights = ggml_reshape_3d(ctx, weights, 1, n_index_head, n_tokens);
+    weights = ggml_permute(ctx, weights, 0, 2, 1, 3);         // [1, n_tokens, n_heads]
+
+    score = ggml_mul(ctx, score, weights);
+    score = ggml_cont(ctx, ggml_permute(ctx, score, 1, 2, 0, 3)); // [n_heads, n_comp, n_tokens]
+    score = ggml_sum_rows(ctx, score);                            // [1, n_comp, n_tokens]
+    score = ggml_reshape_2d(ctx, score, index_kv->ne[2], n_tokens);
+
+    return ggml_add(ctx, score, causal_mask);
+}
+
+static ggml_tensor * dsv4_build_indexer_scores_decode(
+        ggml_context       * ctx,
+        ggml_tensor        * x,
+        ggml_tensor        * qr,
+        ggml_tensor        * index_kv,
+        ggml_tensor        * wq_b,
+        ggml_tensor        * wproj,
+        ggml_tensor        * pos,
+        int64_t              n_index_head,
+        int64_t              n_index_head_size,
+        int64_t              n_comp,
+        int64_t              n_rot,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg) {
+    ggml_tensor * q = ggml_mul_mat(ctx, wq_b, qr);
+    q = ggml_reshape_3d(ctx, q, n_index_head_size, n_index_head, 1);
+    q = dsv4_apply_rope_tail(ctx, q, pos,
+            n_index_head_size, n_index_head, 1, n_rot, rope_type,
+            rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+            rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+
+    ggml_tensor * k = ggml_reshape_3d(ctx, index_kv, n_index_head_size, 1, n_comp);
+    k = ggml_permute(ctx, k, 0, 2, 1, 3); // [head_dim, n_comp, 1]
+    q = ggml_permute(ctx, q, 0, 2, 1, 3); // [head_dim, 1, n_heads]
+
+    ggml_tensor * score = ggml_mul_mat(ctx, k, q); // [n_comp, 1, n_heads]
+    score = ggml_relu(ctx, score);
+
+    ggml_tensor * weights = ggml_mul_mat(ctx, wproj, x); // [n_heads, 1]
+    const float scale = 1.0f / std::sqrt(float(n_index_head_size) * float(n_index_head));
+    weights = dsv4_mul_scalar(ctx, weights, scale);
+    weights = ggml_reshape_3d(ctx, weights, 1, n_index_head, 1);
+    weights = ggml_permute(ctx, weights, 0, 2, 1, 3); // [1, 1, n_heads]
+
+    score = ggml_mul(ctx, score, weights);
+    score = ggml_cont(ctx, ggml_permute(ctx, score, 1, 2, 0, 3)); // [n_heads, n_comp, 1]
+    score = ggml_sum_rows(ctx, score);
+    return ggml_reshape_2d(ctx, score, n_comp, 1);
+}
+
+static ggml_tensor * dsv4_build_compressed_mask_from_topk(
+        ggml_context * ctx,
+        ggml_tensor  * scores,
+        ggml_tensor  * topk) {
+    const int64_t n_comp   = scores->ne[0];
+    const int64_t n_tokens = scores->ne[1];
+
+    ggml_tensor * scores_rows = ggml_reshape_3d(ctx, scores, 1, scores->ne[0], scores->ne[1]);
+    ggml_tensor * selected_scores = ggml_get_rows(ctx, scores_rows, topk); // [1, top_k, n_tokens]
+    ggml_tensor * valid = ggml_step(ctx, dsv4_add_scalar(ctx, selected_scores, 1.0e30f));
+    ggml_tensor * values = dsv4_mul_scalar(ctx, dsv4_add_scalar(ctx, valid, -1.0f), 1.0e9f);
+
+    ggml_tensor * mask = dsv4_new_filled_3d(ctx, 1, n_comp, n_tokens, -INFINITY);
+    mask = ggml_set_rows(ctx, mask, values, topk);
+    return ggml_reshape_2d(ctx, mask, n_comp, n_tokens);
+}
+
+static ggml_tensor * dsv4_cache_view_3d(ggml_context * ctx, ggml_tensor * cache, int64_t n_rows) {
+    ggml_tensor * view = ggml_view_2d(ctx, cache, cache->ne[0], n_rows, cache->nb[1], 0);
+    return ggml_reshape_3d(ctx, view, cache->ne[0], 1, n_rows);
+}
+
+} // namespace
+
+llama_model_deepseek4::graph::graph(const llama_model & model, const llm_graph_params & params) :
+	llm_graph_context(params) {
+
+    const int64_t n_hc        = hparams.n_hc;
+    const int64_t n_lora_q    = hparams.n_lora_q;
+    const int64_t n_lora_o    = hparams.n_lora_o;
+    const int64_t n_out_group = hparams.n_attn_out_groups;
+
+    GGML_ASSERT(n_hc > 0);
+    GGML_ASSERT(n_lora_q > 0);
+    GGML_ASSERT(n_lora_o > 0);
+    GGML_ASSERT(n_out_group > 0);
+    GGML_ASSERT(n_embd_head_k == n_embd_head_v);
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+    ggml_tensor * inp_tokens = res->t_inp_tokens;
+    ggml_tensor * inp_pos = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    auto * inp_mem  = build_inp_mem_hybrid_iswa();
+    auto * inp_attn = inp_mem->get_attn();
+    auto * inp_rs   = inp_mem->get_recr();
+    const auto * mctx_dsv4 = inp_mem->mctx;
+    dsv4_graph_inputs * inp_dsv4 = nullptr;
+    auto get_dsv4_inputs = [&]() {
+        if (inp_dsv4 == nullptr) {
+            auto inputs = std::make_unique<dsv4_graph_inputs>();
+            inp_dsv4 = inputs.get();
+            res->add_input(std::move(inputs));
+        }
+        return inp_dsv4;
+    };
+
+    inpL = ggml_reshape_3d(ctx0, inpL, n_embd, 1, n_tokens);
+    inpL = ggml_repeat_4d(ctx0, inpL, n_embd, n_hc, n_tokens, 1);
+    inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_hc, n_tokens);
+
+    const float kq_scale = 1.0f / std::sqrt(float(n_embd_head_k));
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+        const uint32_t compress_ratio = hparams.attn_compress_ratio[il];
+        const dsv4_rope_cfg rope_cfg = dsv4_make_rope_cfg(hparams, cparams, compress_ratio);
+        const bool is_prefill = ubatch.pos == nullptr || ubatch.pos[0] == 0;
+
+        if (compress_ratio != 0) {
+            if (compress_ratio != 4 && compress_ratio != 128) {
+                throw std::runtime_error("DeepSeek V4 unsupported attention compression ratio " + std::to_string(compress_ratio));
+            }
+            // The hybrid memory splitter emits one sequence set per ubatch
+            // for compressed DeepSeek V4 attention.
+            GGML_ASSERT(ubatch.n_seqs == 1);
+        }
+
+        ggml_tensor * residual = inpL;
+        dsv4_hc_mix mix = dsv4_hc_pre(ctx0, inpL,
+                layer.hc_attn_fn, layer.hc_attn_scale, layer.hc_attn_base,
+                n_embd, n_hc, n_tokens, norm_rms_eps, hparams.hc_sinkhorn_iters, hparams.hc_eps);
+        ggml_tensor * cur = mix.x;
+        cb(cur, "hc_attn_pre", il);
+        cb(mix.mixes, "hc_attn_pre_mixes", il);
+        cb(mix.pre, "hc_attn_pre_weights", il);
+        cb(mix.post, "hc_attn_pre_post_weights", il);
+        cb(mix.comb, "hc_attn_pre_comb", il);
+        cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+        ggml_tensor * qr = ggml_mul_mat(ctx0, layer.wq_a, cur);
+        cb(qr, "q_lora", il);
+        qr = build_norm(qr, layer.attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
+        cb(qr, "q_lora_norm", il);
+
+        ggml_tensor * q = ggml_mul_mat(ctx0, layer.wq_b, qr);
+        q = ggml_reshape_3d(ctx0, q, n_embd_head_k, n_head, n_tokens);
+        q = ggml_rms_norm(ctx0, q, norm_rms_eps);
+        cb(q, "Qnorm", il);
+        q = dsv4_apply_rope_tail(ctx0, q, inp_pos,
+                n_embd_head_k, n_head, n_tokens, n_rot, rope_type,
+                rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+                rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+        cb(q, "Qcur", il);
+        ggml_tensor * kv = ggml_mul_mat(ctx0, layer.attn_kv, cur);
+        kv = build_norm(kv, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
+        kv = ggml_reshape_3d(ctx0, kv, n_embd_head_k, 1, n_tokens);
+        cb(kv, "KVnorm", il);
+        kv = dsv4_apply_rope_tail(ctx0, kv, inp_pos,
+                n_embd_head_k, 1, n_tokens, n_rot, rope_type,
+                rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+                rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+        cb(kv, "KVrope", il);
+        kv = ggml_dsv4_fp8_kv_quantize(ctx0, kv, n_rot);
+        cb(kv, "KVcur", il);
+
+        const auto * mctx_swa = inp_attn->mctx->get_swa();
+        ggml_build_forward_expand(gf, q);
+        ggml_build_forward_expand(gf, kv);
+        ggml_build_forward_expand(gf, mctx_swa->cpy_k(ctx0, kv, inp_attn->get_k_idxs_swa(), il));
+
+        if (compress_ratio == 0) {
+            ggml_tensor * k_cache = mctx_swa->get_k(ctx0, il);
+            k_cache = ggml_reshape_3d(ctx0, k_cache, n_embd_head_k, 1, k_cache->ne[2]);
+            cur = build_attn_mha(q, k_cache, k_cache, nullptr, inp_attn->get_kq_mask_swa(),
+                    layer.attn_sinks, nullptr, kq_scale, il);
+            cb(cur, "kqv_out", il);
+        } else {
+            ggml_tensor * k_all = kv;
+            ggml_tensor * v_all = kv;
+            ggml_tensor * attn_mask = nullptr;
+            const llama_seq_id seq_id = ubatch.seq_id[0][0];
+            auto store_attn_cache_rows = [&](ggml_tensor * src, int64_t row_start, int64_t n_rows) {
+                for (int32_t is = 0; is < ubatch.n_seq_id[0]; ++is) {
+                    const llama_seq_id dst_seq_id = ubatch.seq_id[0][is];
+                    dsv4_store_cache_rows(ctx0, gf, mctx_dsv4->get_dsv4_attn_k(ctx0, il, dst_seq_id), src, row_start, n_rows);
+                }
+            };
+            auto store_index_cache_rows = [&](ggml_tensor * src, int64_t row_start, int64_t n_rows) {
+                for (int32_t is = 0; is < ubatch.n_seq_id[0]; ++is) {
+                    const llama_seq_id dst_seq_id = ubatch.seq_id[0][is];
+                    dsv4_store_cache_rows(ctx0, gf, mctx_dsv4->get_dsv4_index_k(ctx0, il, dst_seq_id), src, row_start, n_rows);
+                }
+            };
+            const int64_t state_size = hparams.n_embd_r();
+            const dsv4_state_layout attn_state_layout = dsv4_make_state_layout(compress_ratio, n_embd_head_k);
+
+            ggml_tensor * prev_kv_state_all = build_rs(inp_rs, inp_rs->mctx->get_r_l(il), state_size, ubatch.n_seqs);
+            ggml_tensor * prev_sc_state_all = build_rs(inp_rs, inp_rs->mctx->get_s_l(il), state_size, ubatch.n_seqs);
+            ggml_tensor * prev_attn_kv_state = dsv4_view_state_segment(ctx0, prev_kv_state_all, 0, attn_state_layout.width, attn_state_layout.rows);
+            ggml_tensor * prev_attn_sc_state = dsv4_view_state_segment(ctx0, prev_sc_state_all, 0, attn_state_layout.width, attn_state_layout.rows);
+
+            const int64_t n_comp = n_tokens / compress_ratio;
+            if (is_prefill) {
+                dsv4_state_pair state = dsv4_build_compressor_prefill_state(ctx0, cur,
+                        layer.attn_compressor_kv,
+                        layer.attn_compressor_gate,
+                        layer.attn_compressor_ape,
+                        n_embd_head_k,
+                        n_tokens,
+                        compress_ratio);
+                dsv4_store_state_segment(ctx0, gf, state.kv,    inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, 0);
+                dsv4_store_state_segment(ctx0, gf, state.score, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, 0);
+
+                if (compress_ratio == 4) {
+                    const dsv4_state_layout index_state_layout = dsv4_make_state_layout(compress_ratio, hparams.indexer_head_size);
+                    dsv4_state_pair index_state = dsv4_build_compressor_prefill_state(ctx0, cur,
+                            layer.indexer_compressor_kv,
+                            layer.indexer_compressor_gate,
+                            layer.indexer_compressor_ape,
+                            hparams.indexer_head_size,
+                            n_tokens,
+                            compress_ratio);
+                    dsv4_store_state_segment(ctx0, gf, index_state.kv,    inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, attn_state_layout.elems);
+                    dsv4_store_state_segment(ctx0, gf, index_state.score, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, attn_state_layout.elems);
+                    GGML_ASSERT(attn_state_layout.elems + index_state_layout.elems <= state_size);
+                }
+            }
+
+            if (is_prefill && n_comp > 0) {
+                ggml_tensor * comp_pos = ggml_arange(ctx0, 0.0f, float(n_comp * compress_ratio), float(compress_ratio));
+                comp_pos = ggml_cast(ctx0, comp_pos, GGML_TYPE_I32);
+
+                ggml_tensor * kv_comp = dsv4_build_compressor_prefill(ctx0, cur,
+                        layer.attn_compressor_kv,
+                        layer.attn_compressor_gate,
+                        layer.attn_compressor_ape,
+                        layer.attn_compressor_norm,
+                        comp_pos,
+                        n_embd_head_k, n_rot, n_tokens, compress_ratio, rope_type, rope_cfg, norm_rms_eps);
+                kv_comp = ggml_dsv4_fp8_kv_quantize(ctx0, kv_comp, n_rot);
+                cb(kv_comp, "KVcompress", il);
+
+                store_attn_cache_rows(kv_comp, 0, n_comp);
+
+                k_all = ggml_concat(ctx0, kv, kv_comp, 2);
+                v_all = k_all;
+
+                if (compress_ratio == 4) {
+                    ggml_tensor * raw_mask = get_dsv4_inputs()->add_mask(ctx0,
+                            dsv4_mask_kind::RAW_WINDOW,
+                            n_tokens, n_tokens,
+                            n_tokens, n_comp, hparams.n_swa, compress_ratio,
+                            "dsv4_attn_raw_window_mask");
+                    ggml_tensor * index_mask = get_dsv4_inputs()->add_mask(ctx0,
+                            dsv4_mask_kind::COMPRESS_CAUSAL,
+                            n_comp, n_tokens,
+                            0, n_comp, 0, compress_ratio,
+                            "dsv4_indexer_causal_mask");
+
+                    ggml_tensor * index_kv = dsv4_build_compressor_prefill(ctx0, cur,
+                            layer.indexer_compressor_kv,
+                            layer.indexer_compressor_gate,
+                            layer.indexer_compressor_ape,
+                            layer.indexer_compressor_norm,
+                            comp_pos,
+                            hparams.indexer_head_size, n_rot, n_tokens, compress_ratio, rope_type, rope_cfg, norm_rms_eps);
+                    cb(index_kv, "indexer_KVcompress", il);
+
+                    store_index_cache_rows(index_kv, 0, n_comp);
+
+                    ggml_tensor * index_scores = dsv4_build_indexer_scores_prefill(ctx0,
+                            cur, qr, index_kv,
+                            layer.indexer_attn_q_b,
+                            layer.indexer_proj,
+                            inp_pos,
+                            index_mask,
+                            hparams.indexer_n_head,
+                            hparams.indexer_head_size,
+                            n_tokens,
+                            n_rot,
+                            rope_type,
+                            rope_cfg);
+                    cb(index_scores, "indexer_scores", il);
+
+                    const int top_k = std::min<int64_t>(hparams.indexer_top_k, n_comp);
+                    ggml_tensor * topk = ggml_argsort_top_k(ctx0, index_scores, top_k);
+                    cb(topk, "indexer_topk", il);
+
+                    ggml_tensor * comp_mask = dsv4_build_compressed_mask_from_topk(ctx0, index_scores, topk);
+                    cb(comp_mask, "dsv4_attn_compress_mask", il);
+
+                    attn_mask = ggml_concat(ctx0, raw_mask, comp_mask, 0);
+                } else {
+                    attn_mask = get_dsv4_inputs()->add_mask(ctx0,
+                            dsv4_mask_kind::ATTN_STATIC,
+                            n_tokens + n_comp, n_tokens,
+                            n_tokens, n_comp, hparams.n_swa, compress_ratio,
+                            "dsv4_attn_static_mask");
+                }
+            } else {
+                attn_mask = get_dsv4_inputs()->add_mask(ctx0,
+                        dsv4_mask_kind::RAW_WINDOW,
+                        n_tokens, n_tokens,
+                        n_tokens, 0, hparams.n_swa, compress_ratio,
+                        "dsv4_attn_raw_window_mask");
+            }
+
+            if (!is_prefill) {
+                const llama_pos first_pos = ubatch.pos ? ubatch.pos[0] : 0;
+                const llama_pos last_pos  = ubatch.pos ? ubatch.pos[n_tokens - 1] : n_tokens - 1;
+                const int64_t n_comp_before  = first_pos / compress_ratio;
+                const int64_t n_comp_visible = (last_pos + 1) / compress_ratio;
+                const int64_t n_comp_cache = mctx_dsv4->get_dsv4_n_comp(il);
+                GGML_ASSERT(n_comp_visible <= n_comp_cache);
+
+                dsv4_decode_compressor dec = n_tokens == 1
+                    ? dsv4_build_compressor_decode(ctx0, cur,
+                            prev_attn_kv_state,
+                            prev_attn_sc_state,
+                            layer.attn_compressor_kv,
+                            layer.attn_compressor_gate,
+                            layer.attn_compressor_ape,
+                            layer.attn_compressor_norm,
+                            n_embd_head_k,
+                            n_rot,
+                            first_pos,
+                            compress_ratio,
+                            rope_type,
+                            rope_cfg,
+                            norm_rms_eps)
+                    : dsv4_build_compressor_decode_chunk(ctx0, cur,
+                            prev_attn_kv_state,
+                            prev_attn_sc_state,
+                            layer.attn_compressor_kv,
+                            layer.attn_compressor_gate,
+                            layer.attn_compressor_ape,
+                            layer.attn_compressor_norm,
+                            ubatch,
+                            n_embd_head_k,
+                            n_rot,
+                            n_tokens,
+                            compress_ratio,
+                            rope_type,
+                            rope_cfg,
+                            norm_rms_eps);
+
+                dsv4_store_state_segment(ctx0, gf, dec.kv_state,    inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, 0);
+                dsv4_store_state_segment(ctx0, gf, dec.score_state, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, 0);
+
+                if (dec.kv_comp != nullptr) {
+                    dec.kv_comp = ggml_dsv4_fp8_kv_quantize(ctx0, dec.kv_comp, n_rot);
+                    store_attn_cache_rows(dec.kv_comp, n_comp_before, n_comp_visible - n_comp_before);
+                }
+
+                ggml_tensor * k_raw = mctx_swa->get_k(ctx0, il);
+                k_raw = ggml_reshape_3d(ctx0, k_raw, n_embd_head_k, 1, k_raw->ne[2]);
+                k_all = k_raw;
+                v_all = k_raw;
+                attn_mask = inp_attn->self_kq_mask_swa;
+
+                if (n_comp_visible > 0) {
+                    ggml_tensor * kv_comp_cache = dsv4_cache_view_3d(ctx0, mctx_dsv4->get_dsv4_attn_k(ctx0, il, seq_id), n_comp_visible);
+                    // V4's KV cache is F16 (forced via llama-model.cpp); CUDA's
+                    // ggml_concat asserts F32 (ggml-cuda/concat.cu) — every other
+                    // architecture's concat takes F32 inputs from mul_mat/norm/rope,
+                    // so the assertion is correct. Cast both F16 inputs to F32 for
+                    // the concat, then cast the result back to F16 to preserve the
+                    // f16-KV-pin invariant for downstream attention. Metal's concat
+                    // is type-agnostic; on Metal these casts are accepted but the
+                    // intermediate F32 round-trip is wasted work. CPU concat handles
+                    // both types so it's also a no-op cost there.
+                    ggml_tensor * k_raw_f32   = ggml_cast(ctx0, k_raw,   GGML_TYPE_F32);
+                    ggml_tensor * comp_f32    = ggml_cast(ctx0, kv_comp_cache, GGML_TYPE_F32);
+                    ggml_tensor * concat_f32  = ggml_concat(ctx0, k_raw_f32, comp_f32, 2);
+                    k_all = ggml_cast(ctx0, concat_f32, GGML_TYPE_F16);
+                    v_all = k_all;
+
+                    ggml_tensor * comp_mask = nullptr;
+                    if (compress_ratio == 4) {
+                        const dsv4_state_layout index_state_layout = dsv4_make_state_layout(compress_ratio, hparams.indexer_head_size);
+                        ggml_tensor * prev_index_kv_state = dsv4_view_state_segment(ctx0, prev_kv_state_all,
+                                attn_state_layout.elems, index_state_layout.width, index_state_layout.rows);
+                        ggml_tensor * prev_index_sc_state = dsv4_view_state_segment(ctx0, prev_sc_state_all,
+                                attn_state_layout.elems, index_state_layout.width, index_state_layout.rows);
+
+                        dsv4_decode_compressor index_dec = n_tokens == 1
+                            ? dsv4_build_compressor_decode(ctx0, cur,
+                                    prev_index_kv_state,
+                                    prev_index_sc_state,
+                                    layer.indexer_compressor_kv,
+                                    layer.indexer_compressor_gate,
+                                    layer.indexer_compressor_ape,
+                                    layer.indexer_compressor_norm,
+                                    hparams.indexer_head_size,
+                                    n_rot,
+                                    first_pos,
+                                    compress_ratio,
+                                    rope_type,
+                                    rope_cfg,
+                                    norm_rms_eps)
+                            : dsv4_build_compressor_decode_chunk(ctx0, cur,
+                                    prev_index_kv_state,
+                                    prev_index_sc_state,
+                                    layer.indexer_compressor_kv,
+                                    layer.indexer_compressor_gate,
+                                    layer.indexer_compressor_ape,
+                                    layer.indexer_compressor_norm,
+                                    ubatch,
+                                    hparams.indexer_head_size,
+                                    n_rot,
+                                    n_tokens,
+                                    compress_ratio,
+                                    rope_type,
+                                    rope_cfg,
+                                    norm_rms_eps);
+
+                        dsv4_store_state_segment(ctx0, gf, index_dec.kv_state,    inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, attn_state_layout.elems);
+                        dsv4_store_state_segment(ctx0, gf, index_dec.score_state, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, attn_state_layout.elems);
+
+                        if (index_dec.kv_comp != nullptr) {
+                            store_index_cache_rows(index_dec.kv_comp, n_comp_before, n_comp_visible - n_comp_before);
+                        }
+
+                        if (n_tokens == 1 && n_comp_visible <= hparams.indexer_top_k) {
+                            comp_mask = get_dsv4_inputs()->add_mask(ctx0,
+                                    dsv4_mask_kind::COMPRESS_CAUSAL,
+                                    n_comp_visible, n_tokens,
+                                    0, n_comp_visible, 0, compress_ratio,
+                                    "dsv4_attn_compress_mask");
+                        } else {
+                            ggml_tensor * index_cache = dsv4_cache_view_3d(ctx0, mctx_dsv4->get_dsv4_index_k(ctx0, il, seq_id), n_comp_visible);
+                            index_cache = ggml_reshape_2d(ctx0, index_cache, hparams.indexer_head_size, n_comp_visible);
+                            ggml_tensor * index_scores = n_tokens == 1
+                                ? dsv4_build_indexer_scores_decode(ctx0,
+                                        cur, qr, index_cache,
+                                        layer.indexer_attn_q_b,
+                                        layer.indexer_proj,
+                                        inp_pos,
+                                        hparams.indexer_n_head,
+                                        hparams.indexer_head_size,
+                                        n_comp_visible,
+                                        n_rot,
+                                        rope_type,
+                                        rope_cfg)
+                                : dsv4_build_indexer_scores_prefill(ctx0,
+                                        cur, qr, dsv4_cache_view_3d(ctx0, mctx_dsv4->get_dsv4_index_k(ctx0, il, seq_id), n_comp_visible),
+                                        layer.indexer_attn_q_b,
+                                        layer.indexer_proj,
+                                        inp_pos,
+                                        get_dsv4_inputs()->add_mask(ctx0,
+                                                dsv4_mask_kind::COMPRESS_CAUSAL,
+                                                n_comp_visible, n_tokens,
+                                                0, n_comp_visible, 0, compress_ratio,
+                                                "dsv4_indexer_decode_causal_mask"),
+                                        hparams.indexer_n_head,
+                                        hparams.indexer_head_size,
+                                        n_tokens,
+                                        n_rot,
+                                        rope_type,
+                                        rope_cfg);
+                            cb(index_scores, "indexer_scores", il);
+
+                            const int top_k = std::min<int64_t>(hparams.indexer_top_k, n_comp_visible);
+                            ggml_tensor * topk = ggml_argsort_top_k(ctx0, index_scores, top_k);
+                            cb(topk, "indexer_topk", il);
+
+                            comp_mask = dsv4_build_compressed_mask_from_topk(ctx0, index_scores, topk);
+                        }
+                    } else {
+                        comp_mask = get_dsv4_inputs()->add_mask(ctx0,
+                                dsv4_mask_kind::COMPRESS_CAUSAL,
+                                n_comp_visible, n_tokens,
+                                0, n_comp_visible, 0, compress_ratio,
+                                "dsv4_attn_compress_mask");
+                    }
+
+                    attn_mask = ggml_concat(ctx0, attn_mask, comp_mask, 0);
+                }
+            }
+
+            ggml_tensor * attn_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, attn_mask, GGML_TYPE_F16) : attn_mask;
+            cur = build_attn_mha(q, k_all, v_all, nullptr, attn_mask_cnv, layer.attn_sinks, nullptr, kq_scale, il);
+            cb(cur, "kqv_out", il);
+        }
+        cur = ggml_reshape_3d(ctx0, cur, n_embd_head_v, n_head, n_tokens);
+        cur = dsv4_apply_rope_tail(ctx0, cur, inp_pos,
+                n_embd_head_v, n_head, n_tokens, n_rot, rope_type,
+                rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+                rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, true);
+        cur = dsv4_grouped_out(ctx0, cur, layer.attn_wo_a, layer.attn_wo_b,
+                n_embd_head_v, n_head, n_out_group, n_lora_o, n_tokens);
+        cb(cur, "attn_out", il);
+        inpL = dsv4_hc_post(ctx0, cur, residual, mix.post, mix.comb, n_embd, n_hc, n_tokens);
+        cb(inpL, "hc_attn_post", il);
+
+        residual = inpL;
+        mix = dsv4_hc_pre(ctx0, inpL,
+                layer.hc_ffn_fn, layer.hc_ffn_scale, layer.hc_ffn_base,
+                n_embd, n_hc, n_tokens, norm_rms_eps, hparams.hc_sinkhorn_iters, hparams.hc_eps);
+        cur = mix.x;
+        cb(cur, "hc_ffn_pre", il);
+        cb(mix.mixes, "hc_ffn_pre_mixes", il);
+        cb(mix.pre, "hc_ffn_pre_weights", il);
+        cb(mix.post, "hc_ffn_pre_post_weights", il);
+        cb(mix.comb, "hc_ffn_pre_comb", il);
+        cur = build_norm(cur, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+        ggml_tensor * selected = nullptr;
+        if ((uint32_t) il < hparams.n_hash_layers && !cparams.warmup) {
+            GGML_ASSERT(inp_tokens != nullptr &&
+                "DeepSeek V4 hash routing requires token-id input; embedding-only / multimodal input not supported");
+            selected = ggml_get_rows(ctx0, layer.ffn_gate_tid2eid, inp_tokens);
+            cb(selected, "ffn_moe_hash_topk", il);
+        }
+
+        ggml_tensor * moe_out = build_moe_ffn(cur,
+                layer.ffn_gate_inp,
+                layer.ffn_up_exps,
+                layer.ffn_gate_exps,
+                layer.ffn_down_exps,
+                layer.ffn_exp_probs_b,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, hparams.expert_weights_norm,
+                hparams.expert_weights_scale,
+                (llama_expert_gating_func_type) hparams.expert_gating_func,
+                il,
+                nullptr,
+                nullptr,
+                nullptr,
+                nullptr,
+                nullptr,
+                selected);
+        cb(moe_out, "ffn_moe_out", il);
+        ggml_tensor * ffn_shexp = build_ffn(cur,
+                layer.ffn_up_shexp,   nullptr, nullptr,
+                layer.ffn_gate_shexp, nullptr, nullptr,
+                layer.ffn_down_shexp, nullptr, nullptr,
+                nullptr,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "ffn_shexp", il);
+
+        cur = ggml_add(ctx0, moe_out, ffn_shexp);
+        cb(cur, "ffn_out", il);
+        inpL = dsv4_hc_post(ctx0, cur, residual, mix.post, mix.comb, n_embd, n_hc, n_tokens);
+        cb(inpL, "hc_ffn_post", il);
+    }
+    if (inp_out_ids) {
+        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * n_hc, n_tokens);
+        inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_hc, n_outputs);
+    }
+
+    ggml_tensor * cur = dsv4_hc_head(ctx0, inpL,
+            model.output_hc_fn, model.output_hc_scale, model.output_hc_base,
+            n_embd, n_hc, inp_out_ids ? n_outputs : n_tokens,
+            norm_rms_eps, hparams.hc_eps);
+    cb(cur, "result_hc", -1);
+
+    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+    ggml_build_forward_expand(gf, cur);
+}
diff --git a/src/models/models.h b/src/models/models.h
index 4e40536a5ea3..2ebf8666dcbd 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -1020,6 +1020,19 @@ struct llama_model_deepseek2 : public llama_model_base {
 };
 
 
+struct llama_model_deepseek4 : public llama_model_base {
+    llama_model_deepseek4(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
+
+
 struct llama_model_deepseek2ocr : public llama_model_base {
     llama_model_deepseek2ocr(const struct llama_model_params & params) : llama_model_base(params) {}
     void load_arch_hparams(llama_model_loader & ml) override;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 76f7cb5a867d..24905b43d8af 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4951,6 +4951,271 @@ struct test_rope : public test_case {
     }
 };
 
+// V4 partial-RoPE: leaves the non-RoPE prefix unchanged, applies RoPE to the tail.
+// Reference:    ggml/include/ggml.h (ggml_dsv4_rope_tail).
+// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_rope_tail).
+// Constraints (ggml.c, ggml_dsv4_rope_tail): mode in {NORMAL, NEOX};
+// a->ne[2] == pos->ne[0]; n_dims > 0 && n_dims <= a->ne[0] && n_dims % 2 == 0;
+// if freq_factors, freq_factors->ne[0] >= n_dims/2.
+struct test_dsv4_rope_tail : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne_a;
+    int n_dims;
+    int mode;
+    int n_ctx;
+    float fs;       // freq_scale
+    float ef;       // ext_factor
+    float af;       // attn_factor
+    bool ff;        // use freq_factors
+    bool inverse;
+
+    std::string vars() override {
+        return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, inverse);
+    }
+
+    test_dsv4_rope_tail(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {64, 8, 4, 1},
+            int n_dims = 32, int mode = GGML_ROPE_TYPE_NORMAL, int n_ctx = 128,
+            float fs = 1.0f, float ef = 0.0f, float af = 0.0f,
+            bool ff = false, bool inverse = false)
+        : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx),
+          fs(fs), ef(ef), af(af), ff(ff), inverse(inverse) {}
+
+    // NMSE tolerance: 1e-5. Rationale: RoPE is trig + multiply, no
+    // accumulation. Matches test_rope's de-facto behavior on this backend pair.
+    double max_nmse_err() override {
+        return 1e-5;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        // Constraint: a->ne[2] == pos->ne[0].
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+        ggml_set_name(pos, "pos");
+
+        ggml_tensor * freq = nullptr;
+        if (ff) {
+            // Constraint: freq_factors->ne[0] >= n_dims/2.
+            freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims / 2);
+            ggml_set_name(freq, "freq");
+        }
+
+        ggml_tensor * out = ggml_dsv4_rope_tail(
+            ctx, a, pos, freq,
+            n_dims, mode, n_ctx,
+            10000.0f, fs, ef, af, 1.0f, 1.0f,
+            inverse);
+        ggml_set_name(out, "out");
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        // Match test_rope's pattern: positions are random within [0, n_ctx) so
+        // the test exercises a representative distribution of RoPE phases on
+        // every run, not just sequential 0..N-1.
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I32) {
+                std::vector<int> data(ggml_nelements(t));
+                for (size_t i = 0; i < data.size(); ++i) {
+                    data[i] = rand() % n_ctx;
+                }
+                ggml_backend_tensor_set(t, data.data(), 0, data.size() * sizeof(int));
+            } else {
+                init_tensor_uniform(t, -1.0f, 1.0f);
+            }
+        }
+    }
+};
+
+// V4 hyper-connection splitter with Sinkhorn normalization.
+// Reference:    ggml/include/ggml.h (ggml_dsv4_hc_split_sinkhorn).
+// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_hc_split_sinkhorn).
+// Constraints (ggml.c, ggml_dsv4_hc_split_sinkhorn): mixes->ne[0] == (2 + n_hc) * n_hc;
+// mixes->ne[2] == 1; mixes->ne[3] == 1; nelements(scale) >= 3;
+// nelements(base) >= mixes->ne[0].
+struct test_dsv4_hc_split_sinkhorn : public test_case {
+    const int n_hc;
+    const int64_t n_rows;
+    const int sinkhorn_iters;
+    const float eps;
+
+    std::string vars() override {
+        return VARS_TO_STR4(n_hc, n_rows, sinkhorn_iters, eps);
+    }
+
+    test_dsv4_hc_split_sinkhorn(int n_hc = 4, int64_t n_rows = 16,
+                                int sinkhorn_iters = 4, float eps = 1e-6f)
+        : n_hc(n_hc), n_rows(n_rows), sinkhorn_iters(sinkhorn_iters), eps(eps) {}
+
+    // NMSE tolerance: 1e-3. Rationale: 4 iterations of normalization compound
+    // floating-point rounding; per-iteration eps division amplifies relative
+    // error on near-zero entries. Spec calls for "1e-3 rel"; NMSE 1e-3 is the
+    // matching budget.
+    double max_nmse_err() override {
+        return 1e-3;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // Hard constraint: mixes->ne[0] MUST equal (2 + n_hc) * n_hc.
+        const int64_t mix_dim = (int64_t)(2 + n_hc) * n_hc;
+
+        ggml_tensor * mixes = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mix_dim, n_rows);
+        ggml_set_param(mixes);
+        ggml_set_name(mixes, "mixes");
+
+        // scale: nelements(scale) >= 3. Constructor uses scale as a 1D
+        // parameter buffer. Use a 1D tensor of size 3 (the minimum).
+        ggml_tensor * scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3);
+        ggml_set_param(scale);
+        ggml_set_name(scale, "scale");
+
+        // base: nelements(base) >= mixes->ne[0]. Use a 1D tensor of size mix_dim.
+        ggml_tensor * base = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mix_dim);
+        ggml_set_param(base);
+        ggml_set_name(base, "base");
+
+        ggml_tensor * out = ggml_dsv4_hc_split_sinkhorn(ctx, mixes, scale, base, n_hc, sinkhorn_iters, eps);
+        ggml_set_name(out, "out");
+        return out;
+    }
+};
+
+// V4 hyper-connection weighted-sum: out[embd, token] = sum_hc weights[hc, token] * x[embd, hc, token].
+// Reference:    ggml/include/ggml.h (ggml_dsv4_hc_weighted_sum).
+// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_hc_weighted_sum).
+// Constraints (ggml.c, ggml_dsv4_hc_weighted_sum):
+//   x       shape {n_embd, n_hc, n_tokens, 1}
+//   weights shape {n_hc,   n_tokens, 1, 1}
+struct test_dsv4_hc_weighted_sum : public test_case {
+    const int64_t n_embd;
+    const int64_t n_hc;
+    const int64_t n_tokens;
+
+    std::string vars() override {
+        return VARS_TO_STR3(n_embd, n_hc, n_tokens);
+    }
+
+    test_dsv4_hc_weighted_sum(int64_t n_embd = 128, int64_t n_hc = 4, int64_t n_tokens = 16)
+        : n_embd(n_embd), n_hc(n_hc), n_tokens(n_tokens) {}
+
+    // NMSE tolerance: 1e-5. Rationale: weighted sum with n_hc<=16 terms;
+    // accumulation error is small; pure F32 multiply-add.
+    double max_nmse_err() override {
+        return 1e-5;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * x = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_hc, n_tokens);
+        ggml_set_param(x);
+        ggml_set_name(x, "x");
+
+        ggml_tensor * weights = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_hc, n_tokens);
+        ggml_set_param(weights);
+        ggml_set_name(weights, "weights");
+
+        ggml_tensor * out = ggml_dsv4_hc_weighted_sum(ctx, x, weights);
+        ggml_set_name(out, "out");
+        return out;
+    }
+};
+
+// V4 hyper-connection expand: out[embd, hc, token] = post[hc, token] * block_out[embd, token]
+//                                                    + (comb[:, :, token]^T @ residual[:, :, token])[embd, hc].
+// Reference:    ggml/include/ggml.h (ggml_dsv4_hc_expand).
+// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_hc_expand).
+// Constraints (ggml.c, ggml_dsv4_hc_expand):
+//   block_out shape {n_embd, n_tokens, 1, 1}    (2D, NOT 3D)
+//   residual  shape {n_embd, n_hc,    n_tokens, 1}
+//   post      shape {n_hc,   n_tokens, 1, 1}
+//   comb      shape {n_hc,   n_hc,    n_tokens, 1}
+struct test_dsv4_hc_expand : public test_case {
+    const int64_t n_embd;
+    const int64_t n_hc;
+    const int64_t n_tokens;
+
+    std::string vars() override {
+        return VARS_TO_STR3(n_embd, n_hc, n_tokens);
+    }
+
+    test_dsv4_hc_expand(int64_t n_embd = 128, int64_t n_hc = 4, int64_t n_tokens = 16)
+        : n_embd(n_embd), n_hc(n_hc), n_tokens(n_tokens) {}
+
+    // NMSE tolerance: 1e-5. Rationale: one matmul along n_hc (small) plus a
+    // pointwise scale; minimal accumulation noise in F32.
+    double max_nmse_err() override {
+        return 1e-5;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // block_out is 2D: {n_embd, n_tokens}. ne[2]==1, ne[3]==1.
+        ggml_tensor * block_out = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
+        ggml_set_param(block_out);
+        ggml_set_name(block_out, "block_out");
+
+        ggml_tensor * residual = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_hc, n_tokens);
+        ggml_set_param(residual);
+        ggml_set_name(residual, "residual");
+
+        ggml_tensor * post = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_hc, n_tokens);
+        ggml_set_param(post);
+        ggml_set_name(post, "post");
+
+        ggml_tensor * comb = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_hc, n_hc, n_tokens);
+        ggml_set_param(comb);
+        ggml_set_name(comb, "comb");
+
+        ggml_tensor * out = ggml_dsv4_hc_expand(ctx, block_out, residual, post, comb);
+        ggml_set_name(out, "out");
+        return out;
+    }
+};
+
+// V4 FP8 KV-cache simulation: quantizes/dequantizes the non-RoPE prefix
+// in E4M3FN blocks, leaves the RoPE tail unchanged.
+// Reference:    ggml/include/ggml.h (ggml_dsv4_fp8_kv_quantize).
+// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_fp8_kv_quantize).
+// Constraints (ggml.c, ggml_dsv4_fp8_kv_quantize): n_rot >= 0; a->ne[0] > n_rot;
+// (a->ne[0] - n_rot) % 64 == 0  (block size is 64 for the FP8 prefix).
+struct test_dsv4_fp8_kv_quantize : public test_case {
+    const std::array<int64_t, 4> ne_a;
+    const int n_rot;
+
+    std::string vars() override {
+        return VARS_TO_STR2(ne_a, n_rot);
+    }
+
+    test_dsv4_fp8_kv_quantize(std::array<int64_t, 4> ne_a = {192, 8, 4, 1},
+                              int n_rot = 64)
+        : ne_a(ne_a), n_rot(n_rot) {}
+
+    // NMSE tolerance: 1e-3. Rationale: FP8 e4m3 represents ~7 bits of mantissa;
+    // the quantize-dequantize round-trip's NMSE is dominated by representable
+    // precision, not by accumulation. The spec's "1e-3 abs (FP8 inherently
+    // lossy)" maps to NMSE 1e-3 because each sample's squared error is bounded
+    // by the FP8 ULP^2 at the local scale, normalized by signal power yields
+    // roughly the same order.
+    double max_nmse_err() override {
+        return 1e-3;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // Constraint check at construction time so test fails fast on a bad shape.
+        GGML_ASSERT(ne_a[0] > n_rot && "(ne_a[0] > n_rot) required");
+        GGML_ASSERT((ne_a[0] - n_rot) % 64 == 0 && "(ne_a[0]-n_rot) %% 64 == 0 required");
+
+        ggml_tensor * a = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_a.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_dsv4_fp8_kv_quantize(ctx, a, n_rot);
+        ggml_set_name(out, "out");
+        return out;
+    }
+};
+
 // GGML_OP_POOL2D
 struct test_pool2d : public test_case {
     enum ggml_op_pool pool_type;
@@ -8707,6 +8972,53 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }
 
+    // V4-port: dsv4_rope_tail (partial-RoPE) test cases
+    for (bool inverse : {false, true}) {
+        for (bool ff : {false, true}) {
+            // F32, default shape
+            test_cases.emplace_back(new test_dsv4_rope_tail(
+                GGML_TYPE_F32, {64, 8, 4, 1}, 32, GGML_ROPE_TYPE_NORMAL, 128,
+                1.0f, 0.0f, 0.0f, ff, inverse));
+        }
+    }
+    // Edge: larger head_dim, NEOX mode (exercises the second supported mode path).
+    test_cases.emplace_back(new test_dsv4_rope_tail(
+        GGML_TYPE_F32, {128, 16, 8, 1}, 64, GGML_ROPE_TYPE_NEOX, 256,
+        1.0f, 0.0f, 0.0f, false, false));
+    // (F16 dtype variant intentionally NOT registered: the reference op path
+    //  for dsv4_rope_tail requires an F32 src0 on the backends that implement
+    //  it, so an F16 case would surface as NOT_SUPPORTED and silently pass
+    //  without exercising the op. F32-only here.)
+
+    // V4-port: dsv4_hc_split_sinkhorn test cases.
+    // For n_hc=4 -> mix_dim = (2+4)*4 = 24.
+    // For n_hc=8 -> mix_dim = (2+8)*8 = 80.
+    test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(4, 16, 4, 1e-6f));
+    test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(4, 32, 4, 1e-6f));
+    test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(4, 16, 8, 1e-6f));
+    test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(8, 16, 4, 1e-6f));
+
+    // V4-port: dsv4_hc_weighted_sum test cases (n_embd, n_hc, n_tokens).
+    test_cases.emplace_back(new test_dsv4_hc_weighted_sum(128, 4, 16));
+    test_cases.emplace_back(new test_dsv4_hc_weighted_sum(512, 4, 32));
+    test_cases.emplace_back(new test_dsv4_hc_weighted_sum(64,  8, 8));
+
+    // V4-port: dsv4_hc_expand test cases (n_embd, n_hc, n_tokens).
+    test_cases.emplace_back(new test_dsv4_hc_expand(128, 4, 16));
+    test_cases.emplace_back(new test_dsv4_hc_expand(512, 4, 32));
+    test_cases.emplace_back(new test_dsv4_hc_expand(64,  8, 8));
+
+    // V4-port: dsv4_fp8_kv_quantize test cases.
+    // Constraint: (ne_a[0] - n_rot) % 64 == 0. Valid examples:
+    //   ne_a[0]=128, n_rot=64   -> prefix=64  (1 block)
+    //   ne_a[0]=192, n_rot=64   -> prefix=128 (2 blocks)
+    //   ne_a[0]=256, n_rot=64   -> prefix=192 (3 blocks)
+    //   ne_a[0]=192, n_rot=128  -> prefix=64  (1 block)
+    test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({128, 8, 4, 1}, 64));
+    test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({192, 8, 4, 1}, 64));
+    test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({256, 16, 8, 1}, 64));
+    test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({192, 16, 8, 1}, 128));
+
     for (int v : { 0, 1, 2, 3 }) {
         for (int dim : { 0, 1, 2, 3, }) {
             test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp
index 3f7f3a11dfa3..671bea226b6a 100644
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -229,6 +229,19 @@ static void compute_cossim(std::vector<tensor_statistics> & tstats) {
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);
 
+    // imatrix only records calibration statistics from matrix multiplications
+    // (MUL_MAT and MUL_MAT_ID). Reject every other op early -- before the
+    // src0 dereference below -- so we don't crash on graph nodes that legitimately
+    // have null t->src[0] (e.g. leaf inputs, GGML_OP_NONE) or that are V4-specific
+    // DSV4 ops whose outputs aren't consumed by anything that benefits from
+    // imatrix data. The cb_eval callback is invoked for every scheduled node, so
+    // this filter also runs for graph nodes that the original code only happened
+    // not to crash on by accident on pre-V4 architectures. See
+    // docs/plans/v4-port-imatrix-diagnosis.md.
+    if (t->op != GGML_OP_MUL_MAT && t->op != GGML_OP_MUL_MAT_ID) {
+        return false;
+    }
+
     const struct ggml_tensor * src0 = t->src[0];
     const struct ggml_tensor * src1 = t->src[1];
     std::string wname = filter_tensor_name(src0->name);
@@ -239,7 +252,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
     if (ask) {
         if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
-        if (t->op != GGML_OP_MUL_MAT) return false;
         // why are small batches ignored (<16 tokens)?
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
         if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
@@ -1240,6 +1252,19 @@ int main(int argc, char ** argv) {
         params.n_ctx      = n_kv;
 
         params.n_batch = std::min(params.n_batch, n_kv);
+
+        // V4 fix: imatrix raises n_parallel (=> cparams.n_seq_max) so it can
+        // fan out chunks across multiple sequences for throughput. With
+        // kv_unified=false (the default) this allocates per-stream KV
+        // buffers, which collide with V4's compressed-attention graph: V4
+        // unconditionally forces n_seqs=1 for LLM_ARCH_DEEPSEEK4 and its
+        // compressed-attention reshape hard-codes n_stream == 1, so it
+        // aborts on the elements-mismatch assertion in ggml_reshape_3d when
+        // n_stream > 1. Forcing kv_unified=true keeps a single shared KV
+        // buffer (n_stream=1) without reducing imatrix's ubatch
+        // parallelism, and is benign for non-V4 archs. See
+        // docs/plans/v4-port-imatrix-diagnosis.md.
+        params.kv_unified = true;
     }
 
     g_collector.set_params(params);