diff --git a/common/chat.cpp b/common/chat.cpp
index 70b9f5dc2c58..38f7a2ed744a 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -1661,6 +1661,7 @@ static common_chat_params common_chat_params_init_gigachat_v3(
 static common_chat_params common_chat_params_init_deepseek_v3_2(const common_chat_template &    tmpl,
                                                                  const autoparser::generation_params & inputs) {
     common_chat_params data;
+    const auto & src = tmpl.source();
 
     data.prompt            = common_chat_template_direct_apply_impl(tmpl, inputs);
     data.format            = COMMON_CHAT_FORMAT_PEG_NATIVE;
@@ -1681,8 +1682,9 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha
     const std::string DSML         = "｜DSML｜";
     const std::string THINK_START  = "<think>";
     const std::string THINK_END    = "</think>";
-    const std::string FC_START     = "<" + DSML + "function_calls>";
-    const std::string FC_END       = "</" + DSML + "function_calls>";
+    const std::string FC_NAME      = src.find("function_calls") != std::string::npos ? "function_calls" : "tool_calls";
+    const std::string FC_START     = "<" + DSML + FC_NAME + ">";
+    const std::string FC_END       = "</" + DSML + FC_NAME + ">";
     const std::string INVOKE_START = "<" + DSML + "invoke";
     const std::string INVOKE_END   = "</" + DSML + "invoke>";
     const std::string PARAM_START  = "<" + DSML + "parameter";
@@ -2093,12 +2095,12 @@ std::optional<common_chat_params> common_chat_try_specialized_template(
         return common_chat_params_init_gigachat_v3(tmpl, params);
     }
 
-    // DeepSeek V3.2 format detection: template defines dsml_token and uses it for tool calls.
+    // DeepSeek DSML format detection: template defines dsml_token and uses it for tool calls.
     // The template source contains the token as a variable assignment, not as a literal in markup.
     if (src.find("dsml_token") != std::string::npos &&
-        src.find("function_calls") != std::string::npos &&
+        (src.find("function_calls") != std::string::npos || src.find("tool_calls") != std::string::npos) &&
         src.find("DSML") != std::string::npos) {
-        LOG_DBG("Using specialized template: DeepSeek V3.2\n");
+        LOG_DBG("Using specialized template: DeepSeek DSML\n");
         return common_chat_params_init_deepseek_v3_2(tmpl, params);
     }
 
diff --git a/conversion/__init__.py b/conversion/__init__.py
index 2c38123dff8d..bba37a5cbbc7 100644
--- a/conversion/__init__.py
+++ b/conversion/__init__.py
@@ -47,6 +47,7 @@
     "DeepseekForCausalLM": "deepseek",
     "DeepseekV2ForCausalLM": "deepseek",
     "DeepseekV3ForCausalLM": "deepseek",
+    "DeepseekV4ForCausalLM": "deepseek",
     "DistilBertForMaskedLM": "bert",
     "DistilBertForSequenceClassification": "bert",
     "DistilBertModel": "bert",
diff --git a/conversion/deepseek.py b/conversion/deepseek.py
index e149fcbf752e..86a3046b9e98 100644
--- a/conversion/deepseek.py
+++ b/conversion/deepseek.py
@@ -1,18 +1,26 @@
 from __future__ import annotations
 
+import concurrent.futures
+import ctypes
+import math
+import os
 import re
 
-from typing import Any, Callable, Iterable, TYPE_CHECKING
+from pathlib import Path
+from typing import Any, Callable, Iterable, Sequence, TYPE_CHECKING
 
+import numpy as np
 import torch
 
 if TYPE_CHECKING:
     from torch import Tensor
 
-from .base import MmprojModel, ModelBase, TextModel, gguf, logger
+from .base import LazyTorchTensor, MmprojModel, ModelBase, TextModel, gguf, logger
 
 from .qwen import QwenModel
 
+TORCH_FLOAT8_E8M0FNU = getattr(torch, "float8_e8m0fnu", None)
+
 
 @ModelBase.register("DeepseekOCRForCausalLM")
 class DeepseekOCRVisionModel(MmprojModel):
@@ -386,3 +394,648 @@ def prepare_tensors(self):
             experts = [k for d in self._experts for k in d.keys()]
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@ModelBase.register("DeepseekV4ForCausalLM")
+class DeepseekV4Model(TextModel):
+    model_arch = gguf.MODEL_ARCH.DEEPSEEK4
+
+    # Optional DeepSeek V4 debug / expert-quant knobs. In the pre-#17114
+    # monolithic convert_hf_to_gguf.py these were ModelBase.__init__ params
+    # wired to --deepseek4-* CLI flags. The refactored conversion/base.py
+    # ModelBase.__init__ does not accept them, so they default here; the
+    # standard DeepseekV4ForCausalLM conversion path does not require them.
+    deepseek4_max_layers: int | None = None
+    deepseek4_expert_outtypes: str | None = None
+    deepseek4_expert_workers: int = 1
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    _fp4_table = torch.tensor([
+        0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0,
+        0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,
+    ], dtype=torch.float32)
+
+    _qtype_aliases: dict[str, gguf.GGMLQuantizationType] = {
+        "q8_0": gguf.GGMLQuantizationType.Q8_0,
+        "q2_k": gguf.GGMLQuantizationType.Q2_K,
+        "iq2_xxs": gguf.GGMLQuantizationType.IQ2_XXS,
+        "iq2_xs": gguf.GGMLQuantizationType.IQ2_XS,
+        "tq1_0": gguf.GGMLQuantizationType.TQ1_0,
+        "tq2_0": gguf.GGMLQuantizationType.TQ2_0,
+    }
+
+    def __init__(self, *args, **kwargs):
+        """Initialise the converter, optionally truncating the layer count and parsing expert overrides."""
+        super().__init__(*args, **kwargs)
+
+        total_layers = self.block_count
+        self._deepseek4_original_block_count = total_layers
+        layer_cap = self.deepseek4_max_layers
+        if layer_cap is not None:
+            if layer_cap <= 0:
+                raise ValueError("--deepseek4-max-layers must be positive")
+            if layer_cap > total_layers:
+                raise ValueError(f"--deepseek4-max-layers={layer_cap} exceeds model layer count {total_layers}")
+            # Debug export: every structure sized by the layer count has to see the reduced value.
+            self.block_count = layer_cap
+            for key in ("num_hidden_layers", "n_layers"):
+                self.hparams[key] = layer_cap
+            self.tensor_map = gguf.get_tensor_name_map(self.model_arch, layer_cap)
+            logger.warning(
+                "DeepSeek V4 debug export: writing only the first %d/%d transformer layers", layer_cap, total_layers
+            )
+
+        self._deepseek4_expert_qtypes = self._parse_expert_outtype_spec(self.deepseek4_expert_outtypes)
+
+    def set_vocab(self):  # delegates to _set_vocab_gpt2(), which is not defined in this class (presumably inherited)
+        self._set_vocab_gpt2()
+
+    def set_gguf_parameters(self):  # writes the DeepSeek V4 hyperparameters through self.gguf_writer
+        self.hparams["num_key_value_heads"] = self.hparams.get("num_key_value_heads", 1)
+        # The default of 1 is installed before the base implementation runs (presumably it reads this key -- confirm).
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        # Vocabulary size, RoPE width, low-rank projection ranks and attention compression settings.
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+        self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+        self.gguf_writer.add_attention_output_lora_rank(hparams["o_lora_rank"])
+        self.gguf_writer.add_attention_output_group_count(hparams["o_groups"])
+        self.gguf_writer.add_attention_compress_ratios(hparams["compress_ratios"])
+        self.gguf_writer.add_attention_compress_rope_freq_base(hparams["compress_rope_theta"])
+        # Mixture-of-experts settings; the hash-layer count is clamped to the number of exported layers.
+        self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        self.gguf_writer.add_expert_weights_scale(hparams.get("routed_scaling_factor", 1.0))
+        self.gguf_writer.add_hash_layer_count(min(hparams["num_hash_layers"], self.block_count))
+        if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
+        if (swiglu_limit := hparams.get("swiglu_limit")) is not None and float(swiglu_limit) > 0.0:
+            self.gguf_writer.add_swiglu_clamp_exp([float(swiglu_limit)] * self.block_count)
+        # The sliding-window size is optional and only written when the config provides it.
+        if (sliding_window := hparams.get("sliding_window")) is not None:
+            self.gguf_writer.add_sliding_window(sliding_window)
+        # Indexer head count, key length and top-k.
+        self.gguf_writer.add_indexer_head_count(hparams["index_n_heads"])
+        self.gguf_writer.add_indexer_key_length(hparams["index_head_dim"])
+        self.gguf_writer.add_indexer_top_k(hparams["index_topk"])
+        # The nextn-predict layer count is written only when no layer cap (deepseek4_max_layers) is active.
+        if self.deepseek4_max_layers is None and (num_nextn_predict_layers := hparams.get("num_nextn_predict_layers")) is not None:
+            self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
+        # Hyper-connection count, Sinkhorn iteration count and epsilon.
+        self.gguf_writer.add_hyper_connection_count(hparams["hc_mult"])
+        self.gguf_writer.add_hyper_connection_sinkhorn_iters(hparams["hc_sinkhorn_iters"])
+        self.gguf_writer.add_hyper_connection_eps(hparams["hc_eps"])
+
+    @staticmethod
+    def _strip_model_prefix(name: str) -> str:  # returns name unchanged when it lacks the "model." prefix
+        return name.removeprefix("model.")
+
+    def _skip_layer_tensor(self, stripped_name: str) -> bool:
+        """Return True when a layer cap is set and the tensor belongs to a layer at or beyond it."""
+        layer = re.match(r"layers\.(\d+)\.", stripped_name)
+        capped = self.deepseek4_max_layers is not None
+        return capped and layer is not None and int(layer.group(1)) >= self.block_count
+
+    @staticmethod
+    def _is_low_bit_ftype(ftype: gguf.LlamaFileType) -> bool:
+        """Report whether ``ftype`` is one of the TQ1_0/TQ2_0/Q2_K/IQ2_XXS/IQ2_XS file types."""
+        file_types = gguf.LlamaFileType
+        low_bit = (
+            file_types.MOSTLY_TQ1_0, file_types.MOSTLY_TQ2_0, file_types.MOSTLY_Q2_K,
+            file_types.MOSTLY_IQ2_XXS, file_types.MOSTLY_IQ2_XS,
+        )
+        return ftype in low_bit
+
+    @staticmethod
+    def _qtype_for_ftype(ftype: gguf.LlamaFileType) -> gguf.GGMLQuantizationType | None:
+        """Map a supported file type to its tensor quantization type; None for anything else."""
+        ft, qt = gguf.LlamaFileType, gguf.GGMLQuantizationType
+        pairs = (
+            (ft.MOSTLY_TQ1_0, qt.TQ1_0), (ft.MOSTLY_TQ2_0, qt.TQ2_0),
+            (ft.MOSTLY_Q2_K, qt.Q2_K), (ft.MOSTLY_IQ2_XXS, qt.IQ2_XXS),
+            (ft.MOSTLY_IQ2_XS, qt.IQ2_XS), (ft.MOSTLY_Q8_0, qt.Q8_0),
+        )
+        return next((qtype for file_type, qtype in pairs if file_type == ftype), None)
+
+    @classmethod
+    def _parse_qtype_name(cls, name: str) -> gguf.GGMLQuantizationType:
+        """Resolve a case-insensitive alias from ``_qtype_aliases``; raise ValueError for unknown names."""
+        key = name.strip().lower()
+        if key not in cls._qtype_aliases:
+            raise ValueError(f"unknown DeepSeek V4 expert outtype {name!r}; expected one of: {', '.join(sorted(cls._qtype_aliases))}")
+        return cls._qtype_aliases[key]
+
+    @classmethod
+    def _parse_expert_outtype_spec(cls, spec: str | None) -> dict[str, gguf.GGMLQuantizationType]:
+        """Parse a comma-separated spec such as ``"q2_k,down=q8_0"`` into a ``{w1|w2|w3: qtype}`` map.
+
+        A bare type applies to w1, w2 and w3; ``selector=type`` sets one of them, where the
+        selectors gate/down/up are aliases for w1/w2/w3. Later entries override earlier ones.
+        """
+        selectors = {"w1": "w1", "w2": "w2", "w3": "w3", "gate": "w1", "down": "w2", "up": "w3"}
+        overrides: dict[str, gguf.GGMLQuantizationType] = {}
+        entries = [] if spec is None else [entry.strip() for entry in spec.split(",")]
+        for entry in filter(None, entries):
+            selector, sep, type_name = entry.partition("=")
+            if not sep:
+                overrides.update(dict.fromkeys(("w1", "w2", "w3"), cls._parse_qtype_name(entry)))
+                continue
+            selector = selector.strip().lower()
+            if selector not in selectors:
+                raise ValueError(f"unknown DeepSeek V4 expert tensor selector {selector!r}")
+            overrides[selectors[selector]] = cls._parse_qtype_name(type_name.strip().lower())
+        return overrides
+
+    @staticmethod
+    def _scale_to_float(scale: Tensor) -> Tensor:  # returns the scale tensor as float32 values
+        if TORCH_FLOAT8_E8M0FNU is not None and scale.dtype == TORCH_FLOAT8_E8M0FNU:
+            return scale.float()
+        # Byte storage: read each byte as an unsigned exponent and place it in the float32 exponent field (bits 23+).
+        if scale.dtype in (torch.uint8, torch.int8):
+            e = scale.view(torch.uint8).to(torch.int32)
+            bits = torch.where(
+                e == 0,
+                torch.full_like(e, 0x00400000),  # exponent byte 0 -> float32 subnormal 2**-127
+                e << 23,
+            )
+            return bits.view(torch.float32)
+        # Any other dtype is simply cast to float32.
+        return scale.float()
+
+    @staticmethod
+    def _scale_to_e8m0_bytes(scale: Tensor) -> Tensor:
+        """Return ``scale`` as uint8 biased exponents (bias 127), reinterpreting byte-sized inputs as-is."""
+        byte_dtypes = [torch.uint8, torch.int8]
+        if TORCH_FLOAT8_E8M0FNU is not None:
+            byte_dtypes.append(TORCH_FLOAT8_E8M0FNU)
+        if scale.dtype in byte_dtypes:
+            return scale.view(torch.uint8)
+
+        # Otherwise derive floor(log2(scale)) + 127; non-positive scales map to exponent byte 0.
+        as_float = scale.float()
+        biased = torch.floor(torch.log2(as_float)).to(torch.int32) + 127
+        exponent = torch.where(as_float > 0, biased, torch.zeros_like(as_float, dtype=torch.int32))
+        return torch.clamp(exponent, 0, 255).to(torch.uint8)
+
+    @classmethod
+    def _dequant_fp8_weight(cls, weight: Tensor, scale: Tensor, block_size: Sequence[int]) -> Tensor:  # block-scaled weight -> float32
+        if len(block_size) != 2:
+            raise ValueError(f"DeepSeek V4 expects 2D FP8 block scales, got block size {block_size}")
+        # Each scale entry covers one (block_out x block_in) tile, so both dimensions must tile exactly.
+        block_out, block_in = block_size
+        out_dim, in_dim = weight.shape
+        if out_dim % block_out != 0 or in_dim % block_in != 0:
+            raise ValueError(f"FP8 tensor shape {tuple(weight.shape)} is not divisible by block size {block_size}")
+        # Scales are converted to float32 and must provide exactly one entry per tile.
+        scale = cls._scale_to_float(scale)
+        expected_scale = (out_dim // block_out, in_dim // block_in)
+        if tuple(scale.shape) != expected_scale:
+            raise ValueError(f"FP8 scale shape {tuple(scale.shape)} does not match expected {expected_scale}")
+        # View the matrix as tiles, multiply each tile by its scale via broadcasting, then restore the 2D shape.
+        weight = weight.reshape(out_dim // block_out, block_out, in_dim // block_in, block_in)
+        weight = weight.float() * scale[:, None, :, None]
+        return weight.reshape(out_dim, in_dim)
+
+    @classmethod
+    def _dequant_fp4_weight(cls, weight: Tensor, scale: Tensor) -> Tensor:  # packed 4-bit codes + block scales -> float32
+        weight = weight.view(torch.uint8)
+        out_dim, packed_in_dim = weight.shape
+        in_dim = packed_in_dim * 2
+        if in_dim % 32 != 0:
+            raise ValueError(f"FP4 packed tensor shape {tuple(weight.shape)} does not contain 32-value blocks")
+        # Each byte packs two 4-bit codes; one scale covers a block of 32 values (16 bytes).
+        n_blocks = in_dim // 32
+        scale = cls._scale_to_float(scale)
+        if tuple(scale.shape) != (out_dim, n_blocks):
+            raise ValueError(f"FP4 scale shape {tuple(scale.shape)} does not match expected {(out_dim, n_blocks)}")
+        # Split bytes into nibbles (low nibble first), look each code up in _fp4_table, then apply the block scale.
+        fp4_table = cls._fp4_table.to(weight.device)
+        packed = weight.reshape(out_dim, n_blocks, 16)
+        low = packed & 0x0F
+        high = (packed >> 4) & 0x0F
+        vals = torch.stack((low, high), dim=-1).reshape(out_dim, n_blocks, 32)
+        vals = fp4_table[vals.long()] * scale.unsqueeze(-1)
+        return vals.reshape(out_dim, in_dim)
+
+    @classmethod
+    def _pack_fp4_as_mxfp4(cls, weight: Tensor, scale: Tensor) -> tuple[np.ndarray, list[int]]:  # returns (raw block bytes, logical [out_dim, in_dim])
+        weight = weight.view(torch.uint8)
+        out_dim, packed_in_dim = weight.shape
+        in_dim = packed_in_dim * 2
+        if in_dim % 32 != 0:
+            raise ValueError(f"FP4 packed tensor shape {tuple(weight.shape)} does not contain 32-value blocks")
+        # One exponent byte is expected per 32-value block.
+        n_blocks = in_dim // 32
+        scale_e = cls._scale_to_e8m0_bytes(scale)
+        if tuple(scale_e.shape) != (out_dim, n_blocks):
+            raise ValueError(f"FP4 scale shape {tuple(scale_e.shape)} does not match expected {(out_dim, n_blocks)}")
+        # Unpack to 32 codes per block (low nibble first), then re-pack so byte j holds code j (low) and code j+16 (high).
+        packed = weight.reshape(out_dim, n_blocks, 16)
+        low = packed & 0x0F
+        high = (packed >> 4) & 0x0F
+        vals = torch.stack((low, high), dim=-1).reshape(out_dim, n_blocks, 32)
+        qs = vals[:, :, :16] | (vals[:, :, 16:] << 4)
+        raw = torch.cat((scale_e.unsqueeze(-1), qs), dim=-1).reshape(out_dim, n_blocks * 17)  # 1 scale byte + 16 data bytes per block
+        return raw.numpy(), [out_dim, in_dim]
+
+    _ggml_quant_lib: Any = None
+
+    @classmethod
+    def _load_ggml_quant_lib(cls):
+        """Load libggml through ctypes, declare the quantization entry points, and cache the handle.
+
+        Search order: the ``LLAMA_CPP_LIBGGML`` environment variable, then ``build/bin`` under the
+        parent of this package directory, then ``build/bin`` under the package directory itself.
+
+        Returns:
+            The ``ctypes.CDLL`` handle, cached on the class after the first successful load.
+
+        Raises:
+            RuntimeError: if none of the candidate paths is an existing file.
+        """
+        if cls._ggml_quant_lib is not None:
+            return cls._ggml_quant_lib
+
+        pkg_root = Path(__file__).resolve().parent
+        lib_names = ("libggml.dylib", "libggml.so", "ggml.dll")
+        candidates = [root / "build" / "bin" / lib for root in (pkg_root.parent, pkg_root) for lib in lib_names]
+
+        # An explicit override takes priority; warn instead of silently ignoring a bad path.
+        override = os.environ.get("LLAMA_CPP_LIBGGML")
+        if override:
+            if Path(override).is_file():
+                candidates.insert(0, Path(override))
+            else:
+                logger.warning("LLAMA_CPP_LIBGGML=%s is not a file; falling back to the default search paths", override)
+
+        path = next((candidate for candidate in candidates if candidate.is_file()), None)
+        if path is None:
+            raise RuntimeError(
+                "DeepSeek V4 low-bit expert conversion needs llama.cpp's libggml. "
+                "Build llama.cpp first or set LLAMA_CPP_LIBGGML to libggml."
+            )
+
+        lib = ctypes.CDLL(str(path))
+        float_ptr = ctypes.POINTER(ctypes.c_float)
+        # size_t ggml_quantize_chunk(type, src, dst, start, nrows, n_per_row, imatrix)
+        lib.ggml_quantize_chunk.restype = ctypes.c_size_t
+        lib.ggml_quantize_chunk.argtypes = (
+            ctypes.c_int, float_ptr, ctypes.c_void_p, ctypes.c_int64, ctypes.c_int64, ctypes.c_int64, float_ptr,
+        )
+        # bool ggml_quantize_requires_imatrix(type)
+        lib.ggml_quantize_requires_imatrix.restype = ctypes.c_bool
+        lib.ggml_quantize_requires_imatrix.argtypes = (ctypes.c_int,)
+        cls._ggml_quant_lib = lib
+        return lib
+
+    @classmethod
+    def _quantize_deepseek4_expert(cls, data: np.ndarray, qtype: gguf.GGMLQuantizationType) -> np.ndarray:
+        """Quantize one expert matrix to ``qtype`` and return the result.
+
+        Q2_K, IQ2_XXS and IQ2_XS go through libggml's ``ggml_quantize_chunk``; every other type
+        is delegated to ``gguf.quants.quantize``.
+        """
+        qt = gguf.GGMLQuantizationType
+        if qtype not in {qt.Q2_K, qt.IQ2_XXS, qt.IQ2_XS}:
+            return gguf.quants.quantize(data, qtype)
+
+        src = np.ascontiguousarray(data, dtype=np.float32)
+        dst = np.zeros(gguf.quant_shape_to_byte_shape(src.shape, qtype), dtype=np.uint8, order="C")
+        lib = cls._load_ggml_quant_lib()
+        float_ptr = ctypes.POINTER(ctypes.c_float)
+        row_count, row_len = math.prod(src.shape[:-1]), src.shape[-1]
+
+        # Null pointer unless the type needs an importance matrix; in that case the per-column
+        # sum of squared weights is passed. `col_energy` must stay referenced until the call returns.
+        weights_ptr = ctypes.cast(0, float_ptr)
+        if lib.ggml_quantize_requires_imatrix(qtype.value):
+            col_energy = np.ascontiguousarray(np.sum(src.reshape(-1, row_len) ** 2, axis=0), dtype=np.float32)
+            weights_ptr = col_energy.ctypes.data_as(float_ptr)
+
+        src_ptr, dst_ptr = src.ctypes.data_as(float_ptr), dst.ctypes.data_as(ctypes.c_void_p)
+        written = lib.ggml_quantize_chunk(
+            qtype.value, src_ptr, dst_ptr, 0, row_count, row_len, weights_ptr,
+        )
+        if written != dst.size:
+            raise RuntimeError(f"ggml_quantize_chunk wrote {written} bytes, expected {dst.size}")
+        return dst
+
+    def _write_deepseek4_tid2eid_tensors(self) -> set[str]:
+        """Write each ``layers.N.ffn.gate.tid2eid`` tensor as int32; return handled names (incl. capped-off layers)."""
+        handled: set[str] = set()
+        for src_name, loader in list(self.model_tensors.items()):
+            short = self._strip_model_prefix(src_name)
+            skipped = self._skip_layer_tensor(short)
+            if not skipped and re.match(r"layers\.\d+\.ffn\.gate\.tid2eid$", short) is None:
+                continue
+            if not skipped:
+                table = LazyTorchTensor.to_eager(loader()).to(torch.int32).numpy()
+                gguf_name = self.map_tensor_name(short)
+                dims = ", ".join(str(n) for n in reversed(table.shape))
+                logger.info(f"{gguf_name}, int32 --> I32, shape = {{{dims}}}")
+                self.gguf_writer.add_tensor(gguf_name, table)
+            handled.add(src_name)
+        return handled
+
+    def _write_deepseek4_expert_tensors(self) -> set[str]:  # writes merged routed-expert tensors; returns the consumed source names
+        default_qtype = self._qtype_for_ftype(self.ftype)
+        if default_qtype is None and not self._deepseek4_expert_qtypes:
+            if any(re.match(r"(?:model\.)?layers\.\d+\.ffn\.experts\.\d+\.w[123]\.weight$", name) for name in self.model_tensors):
+                raise NotImplementedError(
+                    "DeepSeek V4 routed FP4 experts must be converted directly to a compact GGUF type. "
+                    "Use --outtype iq2_xxs, iq2_xs, q2_k, tq2_0, tq1_0, or q8_0."
+                )
+            return set()
+        # From here on a target type exists: collect every routed expert weight together with its scale tensor.
+        n_experts = self.hparams["n_routed_experts"]
+        consumed: set[str] = set()
+        groups: dict[tuple[int, str], dict[int, tuple[str, str]]] = {}
+
+        for name in list(self.model_tensors.keys()):
+            stripped = self._strip_model_prefix(name)
+            if self._skip_layer_tensor(stripped):
+                consumed.add(name)
+                continue
+            match = re.match(r"layers\.(\d+)\.ffn\.experts\.(\d+)\.(w[123])\.weight$", stripped)
+            if match is None:
+                continue
+            # Regex groups: layer index, expert index, weight id (w1/w2/w3).
+            bid = int(match.group(1))
+            xid = int(match.group(2))
+            wid = match.group(3)
+            qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype)
+            if qtype is None:
+                raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}")
+            scale_name = f"{stripped.removesuffix('.weight')}.scale"
+            model_scale_name = scale_name if scale_name in self.model_tensors else f"model.{scale_name}"
+            if model_scale_name not in self.model_tensors:
+                raise ValueError(f"Missing DeepSeek V4 FP4 scale tensor for {stripped}")
+
+            groups.setdefault((bid, wid), {})[xid] = (name, model_scale_name)
+            consumed.update((name, model_scale_name))
+        # Convert a single expert weight: MXFP4 re-packs the FP4 payload, other types dequantize then re-quantize.
+        def convert_one(name: str, model_scale_name: str, qtype: gguf.GGMLQuantizationType) -> np.ndarray:
+            weight = LazyTorchTensor.to_eager(self.model_tensors[name]())
+            scale = LazyTorchTensor.to_eager(self.model_tensors[model_scale_name]())
+
+            if qtype == gguf.GGMLQuantizationType.MXFP4:
+                data, _ = self._pack_fp4_as_mxfp4(weight, scale)
+                return data
+
+            data = self._dequant_fp4_weight(weight, scale).numpy()
+            return self._quantize_deepseek4_expert(data, qtype)
+        # Stack all experts of one (layer, weight) pair along a new leading axis and write the merged tensor.
+        def add_merged_tensor(bid: int, wid: str, qtype: gguf.GGMLQuantizationType, experts: dict[int, np.ndarray]) -> None:
+            missing = sorted(set(range(n_experts)).difference(experts))
+            if missing:
+                raise ValueError(f"Missing DeepSeek V4 expert tensors for layer {bid} {wid}: {missing[:8]}")
+
+            merged = np.stack([experts[i] for i in range(n_experts)], axis=0)
+            merged_name = f"layers.{bid}.ffn.experts.{wid}.weight"
+            new_name = self.map_tensor_name(merged_name)
+            shape = gguf.quant_shape_from_byte_shape(merged.shape, qtype) if merged.dtype == np.uint8 else merged.shape
+            shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
+            logger.info(f"{new_name}, DeepSeek FP4 --> {qtype.name}, shape = {shape_str}")
+            self.gguf_writer.add_tensor(new_name, merged, raw_dtype=qtype)
+        # Process groups in sorted (layer, weight) order, serially or through a bounded thread pool.
+        worker_count = max(1, self.deepseek4_expert_workers)
+        for bid, wid in sorted(groups):
+            qtype = self._deepseek4_expert_qtypes.get(wid, default_qtype)
+            if qtype is None:
+                raise RuntimeError(f"No DeepSeek V4 expert quantization type selected for {wid}")
+            group = groups[(bid, wid)]
+            experts: dict[int, np.ndarray] = {}
+            logger.info(
+                "DeepSeek V4: quantizing blk.%d %s experts to %s with %d worker%s",
+                bid,
+                wid,
+                qtype.name,
+                worker_count,
+                "" if worker_count == 1 else "s",
+            )
+
+            if worker_count == 1:
+                for done, xid in enumerate(sorted(group), start=1):
+                    name, model_scale_name = group[xid]
+                    experts[xid] = convert_one(name, model_scale_name, qtype)
+                    if done % 32 == 0 or done == n_experts:
+                        logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts)
+            else:
+                max_pending = worker_count * 2
+                pending: dict[concurrent.futures.Future[np.ndarray], int] = {}
+                xids = iter(sorted(group))
+                done = 0
+
+                with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as executor:
+                    def submit_next() -> bool:
+                        try:
+                            xid = next(xids)
+                        except StopIteration:
+                            return False
+                        name, model_scale_name = group[xid]
+                        future = executor.submit(convert_one, name, model_scale_name, qtype)
+                        pending[future] = xid
+                        return True
+                    # Prime the pool with at most max_pending tasks.
+                    while len(pending) < max_pending and submit_next():
+                        pass
+                    # Collect finished tasks and submit one replacement per completed task.
+                    while pending:
+                        finished, _ = concurrent.futures.wait(
+                            pending,
+                            return_when=concurrent.futures.FIRST_COMPLETED,
+                        )
+                        for future in finished:
+                            xid = pending.pop(future)
+                            experts[xid] = future.result()
+                            done += 1
+                            if done % 32 == 0 or done == n_experts:
+                                logger.info("DeepSeek V4: blk.%d %s %d/%d experts", bid, wid, done, n_experts)
+                            submit_next()
+
+            add_merged_tensor(bid, wid, qtype, experts)
+
+        return consumed
+
+    def _prepare_deepseek4_scaled_tensors(self) -> None:  # resolves every ".scale" tensor before the generic tensor pass
+        block_size = (self.hparams.get("quantization_config") or {}).get("weight_block_size", [128, 128])
+        consumed: set[str] = set()
+        # MTP tensors and tensors of layers beyond the layer cap are marked for removal.
+        for name in list(self.model_tensors.keys()):
+            stripped = self._strip_model_prefix(name)
+            if stripped.startswith("mtp.") or self._skip_layer_tensor(stripped):
+                consumed.add(name)
+        # tid2eid tables and routed experts are written directly to the GGUF writer by these helpers.
+        consumed.update(self._write_deepseek4_tid2eid_tensors())
+        consumed.update(self._write_deepseek4_expert_tensors())
+        # Remaining ".scale" tensors: fold each into its weight by swapping the weight loader for a dequantizing one.
+        for name in list(self.model_tensors.keys()):
+            if name in consumed:
+                continue
+            stripped = self._strip_model_prefix(name)
+            if not stripped.endswith(".scale"):
+                continue
+            if re.match(r"layers\.\d+\.ffn\.experts\.\d+\.w[123]\.scale$", stripped) is not None:
+                continue
+
+            weight_name = f"{stripped.removesuffix('.scale')}.weight"
+            model_weight_name = weight_name if weight_name in self.model_tensors else f"model.{weight_name}"
+            if model_weight_name not in self.model_tensors:
+                raise ValueError(f"Missing DeepSeek V4 FP8 weight tensor for scale {stripped}")
+
+            w = self.model_tensors[model_weight_name]
+            s = self.model_tensors[name]
+            self.model_tensors[model_weight_name] = (
+                lambda w=w, s=s, bs=block_size: self._dequant_fp8_weight(  # defaults bind this iteration's w/s/bs
+                    LazyTorchTensor.to_eager(w()),
+                    LazyTorchTensor.to_eager(s()),
+                    bs,
+                )
+            )
+            consumed.add(name)
+        # Remove everything handled above so later passes over self.model_tensors do not see it.
+        for name in consumed:
+            self.model_tensors.pop(name, None)
+
+    def prepare_tensors(self):
+        """Resolve scale tensors, run the base tensor preparation, then verify no expert was left buffered."""
+        self._prepare_deepseek4_scaled_tensors()
+        for tensor_name in self.model_tensors:
+            if tensor_name.endswith(".scale"):
+                raise NotImplementedError("Unhandled DeepSeek V4 scale tensors remain after conversion preparation")
+
+        super().prepare_tensors()
+
+        leftover = [] if self._experts is None else [key for layer in self._experts for key in layer]
+        if leftover:
+            raise ValueError(f"Unprocessed experts: {leftover}")
+
+    def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: int) -> gguf.GGMLQuantizationType | bool:
+        """Return F16 for multi-dimensional tensors when the file type is low-bit; otherwise return False.
+
+        Routed experts are handled in _write_deepseek4_expert_tensors(), where each expert is
+        converted directly from FP4 to the requested compact type. Forcing F16 here keeps the rest
+        of the model (attention, hyper-connections, indexers, compressors, shared experts, logits)
+        from inheriting the global low-bit file type.
+        """
+        del name, new_name, bid  # unused; the decision depends only on file type and rank
+
+        wants_f16 = self._is_low_bit_ftype(self.ftype) and n_dims > 1
+        if wants_f16:
+            return gguf.GGMLQuantizationType.F16
+        return False
+
+    def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
+        """Prefer the DeepSeek V4 name table; fall back to the inherited mapping for unknown names."""
+        mapped = self._map_tensor_name_deepseek4(name)
+        fallback_needed = mapped is None
+        return super().map_tensor_name(name, try_suffixes) if fallback_needed else mapped
+
+    def _map_tensor_name_deepseek4(self, name: str) -> str | None:
+        """Translate a DeepSeek V4 checkpoint name to its GGUF name, or return None when it is not in the tables."""
+        name = self._strip_model_prefix(name)
+
+        top_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+            "embed.weight":    (gguf.MODEL_TENSOR.TOKEN_EMBD, ".weight"),
+            "norm.weight":     (gguf.MODEL_TENSOR.OUTPUT_NORM, ".weight"),
+            "head.weight":     (gguf.MODEL_TENSOR.OUTPUT, ".weight"),
+            "hc_head_base":    (gguf.MODEL_TENSOR.OUTPUT_HC_BASE, ".weight"),
+            "hc_head_fn":      (gguf.MODEL_TENSOR.OUTPUT_HC_FN, ".weight"),
+            "hc_head_scale":   (gguf.MODEL_TENSOR.OUTPUT_HC_SCALE, ".weight"),
+        }
+        if name in top_level:
+            tensor, suffix = top_level[name]
+            return self.format_tensor_name(tensor, suffix=suffix)
+
+        match = re.match(r"layers\.(\d+)\.(.+)", name)
+        if match is None:
+            return None
+
+        bid = int(match.group(1))
+        rest = match.group(2)
+
+        layer_level: dict[str, tuple[gguf.MODEL_TENSOR, str]] = {
+            "hc_attn_base":                  (gguf.MODEL_TENSOR.HC_ATTN_BASE, ".weight"),
+            "hc_attn_fn":                    (gguf.MODEL_TENSOR.HC_ATTN_FN, ".weight"),
+            "hc_attn_scale":                 (gguf.MODEL_TENSOR.HC_ATTN_SCALE, ".weight"),
+            "hc_ffn_base":                   (gguf.MODEL_TENSOR.HC_FFN_BASE, ".weight"),
+            "hc_ffn_fn":                     (gguf.MODEL_TENSOR.HC_FFN_FN, ".weight"),
+            "hc_ffn_scale":                  (gguf.MODEL_TENSOR.HC_FFN_SCALE, ".weight"),
+            "attn.attn_sink":                (gguf.MODEL_TENSOR.ATTN_SINKS, ".weight"),
+            "attn.wq_a.weight":              (gguf.MODEL_TENSOR.ATTN_Q_A, ".weight"),
+            "attn.wq_b.weight":              (gguf.MODEL_TENSOR.ATTN_Q_B, ".weight"),
+            "attn.q_norm.weight":            (gguf.MODEL_TENSOR.ATTN_Q_A_NORM, ".weight"),
+            "attn.wkv.weight":               (gguf.MODEL_TENSOR.ATTN_KV, ".weight"),
+            "attn.kv_norm.weight":           (gguf.MODEL_TENSOR.ATTN_KV_A_NORM, ".weight"),
+            "attn.wo_a.weight":              (gguf.MODEL_TENSOR.ATTN_OUT_A, ".weight"),
+            "attn.wo_b.weight":              (gguf.MODEL_TENSOR.ATTN_OUT_B, ".weight"),
+            "attn.compressor.ape":           (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_APE, ".weight"),
+            "attn.compressor.wkv.weight":    (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_KV, ".weight"),
+            "attn.compressor.wgate.weight":  (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_GATE, ".weight"),
+            "attn.compressor.norm.weight":   (gguf.MODEL_TENSOR.ATTN_COMPRESSOR_NORM, ".weight"),
+            "attn.indexer.wq_b.weight":      (gguf.MODEL_TENSOR.INDEXER_ATTN_Q_B, ".weight"),
+            "attn.indexer.weights_proj.weight": (gguf.MODEL_TENSOR.INDEXER_PROJ, ".weight"),
+            "attn.indexer.compressor.ape":   (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_APE, ".weight"),
+            "attn.indexer.compressor.wkv.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_KV, ".weight"),
+            "attn.indexer.compressor.wgate.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_GATE, ".weight"),
+            "attn.indexer.compressor.norm.weight": (gguf.MODEL_TENSOR.INDEXER_COMPRESSOR_NORM, ".weight"),
+            "attn_norm.weight":              (gguf.MODEL_TENSOR.ATTN_NORM, ".weight"),
+            "ffn_norm.weight":               (gguf.MODEL_TENSOR.FFN_NORM, ".weight"),
+            "ffn.shared_experts.w1.weight":  (gguf.MODEL_TENSOR.FFN_GATE_SHEXP, ".weight"),
+            "ffn.shared_experts.w3.weight":  (gguf.MODEL_TENSOR.FFN_UP_SHEXP, ".weight"),
+            "ffn.shared_experts.w2.weight":  (gguf.MODEL_TENSOR.FFN_DOWN_SHEXP, ".weight"),
+            "ffn.gate.weight":               (gguf.MODEL_TENSOR.FFN_GATE_INP, ".weight"),
+            "ffn.gate.bias":                 (gguf.MODEL_TENSOR.FFN_EXP_PROBS_B, ".bias"),
+            "ffn.gate.tid2eid":              (gguf.MODEL_TENSOR.FFN_GATE_TID2EID, ".weight"),
+            "ffn.experts.w1.weight":         (gguf.MODEL_TENSOR.FFN_GATE_EXP, ".weight"),
+            "ffn.experts.w3.weight":         (gguf.MODEL_TENSOR.FFN_UP_EXP, ".weight"),
+            "ffn.experts.w2.weight":         (gguf.MODEL_TENSOR.FFN_DOWN_EXP, ".weight"),
+        }
+        if rest in layer_level:
+            tensor, suffix = layer_level[rest]
+            return self.format_tensor_name(tensor, bid, suffix=suffix)
+
+        return None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        """Yield ``(gguf_name, tensor)`` pairs for one source tensor.
+
+        MTP tensors are dropped. Routed expert weights are buffered per layer and, once all
+        ``n_routed_experts * 3`` of a layer have arrived, yielded as three stacked tensors in
+        w1, w3, w2 order. Every other tensor is yielded under its mapped name.
+        """
+        name = self._strip_model_prefix(name)
+
+        # TODO: llama.cpp does not have Multi-Token Prediction for DeepSeek yet.
+        if name.startswith("mtp."):
+            return
+
+        # Anchored like the other expert patterns in this class so only exact
+        # "...experts.<id>.w<k>.weight" names are buffered.
+        match = re.match(r"layers\.(\d+)\.ffn\.experts\.\d+\.w[123]\.weight$", name)
+        if match is None:
+            yield self.map_tensor_name(name), data_torch
+            return
+
+        bid = int(match.group(1))
+        n_experts = self.hparams["n_routed_experts"]
+
+        if self._experts is None:
+            self._experts = [{} for _ in range(self.block_count)]
+
+        layer_experts = self._experts[bid]
+        layer_experts[name] = data_torch
+        if len(layer_experts) < n_experts * 3:
+            return
+
+        # The layer is complete: emit one stacked tensor per projection and release the buffer.
+        for w_name in ["w1", "w3", "w2"]:
+            datas: list[Tensor] = [
+                layer_experts.pop(f"layers.{bid}.ffn.experts.{expert_id}.{w_name}.weight")
+                for expert_id in range(n_experts)
+            ]
+            merged_name = f"layers.{bid}.ffn.experts.{w_name}.weight"
+            yield self.map_tensor_name(merged_name), torch.stack(datas, dim=0)
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 41566d41aef3..f71943ed33aa 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -567,6 +567,11 @@ extern "C" {
         GGML_OP_RWKV_WKV7,
         GGML_OP_SOLVE_TRI,
         GGML_OP_GATED_DELTA_NET,
+        GGML_OP_DSV4_HC_SPLIT_SINKHORN,
+        GGML_OP_DSV4_HC_WEIGHTED_SUM,
+        GGML_OP_DSV4_HC_EXPAND,
+        GGML_OP_DSV4_FP8_KV_QUANTIZE,
+        GGML_OP_DSV4_ROPE_TAIL,
 
         GGML_OP_UNARY,
 
@@ -2555,6 +2560,61 @@ extern "C" {
             struct ggml_tensor  * beta,
             struct ggml_tensor  * state);
 
+    // DeepSeek V4 hyperconnection helper.
+    // Splits [mix, tokens] into pre/post/comb regions and applies the
+    // Sinkhorn normalization used by the reference implementation.
+    GGML_API struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * mixes,           // F32, ne[0] == (2 + n_hc)*n_hc, ne[2] == ne[3] == 1; result has the same shape
+            struct ggml_tensor  * scale,           // F32, contiguous, >= 3 elements: pre, post and comb multipliers
+            struct ggml_tensor  * base,            // F32, contiguous, >= mixes->ne[0] elements: added after scaling
+            int                   n_hc,            // 1..16
+            int                   sinkhorn_iters,  // > 0
+            float                 eps);
+
+    // DeepSeek V4 hyperconnection weighted-sum helper.
+    // Computes sum_hc weights[hc, token] * x[embd, hc, token].
+    GGML_API struct ggml_tensor * ggml_dsv4_hc_weighted_sum(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * x,         // F32 [n_embd, n_hc, n_tokens]
+            struct ggml_tensor  * weights);  // F32 [n_hc, n_tokens]; the result is F32 [n_embd, n_tokens]
+
+    // DeepSeek V4 hyperconnection expand helper.
+    // Computes post * block_out + comb^T @ residual for each token.
+    GGML_API struct ggml_tensor * ggml_dsv4_hc_expand(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * block_out,  // F32 [n_embd, n_tokens]
+            struct ggml_tensor  * residual,   // F32 [n_embd, n_hc, n_tokens]; the result has this shape
+            struct ggml_tensor  * post,       // F32 [n_hc, n_tokens]
+            struct ggml_tensor  * comb);      // F32 [n_hc, n_hc, n_tokens]
+
+    // DeepSeek V4 FP8 KV-cache simulation helper.
+    // Quantizes/dequantizes the non-RoPE prefix in E4M3FN blocks and leaves
+    // the RoPE tail unchanged, matching the reference inference path.
+    GGML_API struct ggml_tensor * ggml_dsv4_fp8_kv_quantize(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,       // F32; the result has the same shape
+            int                   n_rot);  // 0 <= n_rot < a->ne[0]; a->ne[0] - n_rot must be a multiple of 64
+
+    // DeepSeek V4 partial RoPE helper.
+    // Leaves the non-RoPE prefix unchanged and applies RoPE to the tail,
+    // matching ggml_concat(prefix, ggml_rope_ext(tail)).
+    GGML_API struct ggml_tensor * ggml_dsv4_rope_tail(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,             // F32 or F16; the result has the same type and shape
+            struct ggml_tensor  * pos,           // I32 vector with a->ne[2] entries
+            struct ggml_tensor  * freq_factors,  // optional (may be NULL); F32 with >= n_dims/2 elements
+            int                   n_dims,        // even, 0 < n_dims <= a->ne[0]; size of the rotated tail
+            int                   mode,          // GGML_ROPE_TYPE_NORMAL or GGML_ROPE_TYPE_NEOX
+            int                   n_ctx_orig,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow,
+            bool                  inverse);      // true negates the sine sign used for the rotation
+
     // custom operators
 
     typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index cd5c61a81879..70f8def3a742 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2047,6 +2047,26 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_gated_delta_net(params, tensor);
             } break;
+        case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+            {
+                ggml_compute_forward_dsv4_hc_split_sinkhorn(params, tensor);
+            } break;
+        case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+            {
+                ggml_compute_forward_dsv4_hc_weighted_sum(params, tensor);
+            } break;
+        case GGML_OP_DSV4_HC_EXPAND:
+            {
+                ggml_compute_forward_dsv4_hc_expand(params, tensor);
+            } break;
+        case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+            {
+                ggml_compute_forward_dsv4_fp8_kv_quantize(params, tensor);
+            } break;
+        case GGML_OP_DSV4_ROPE_TAIL:
+            {
+                ggml_compute_forward_dsv4_rope_tail(params, tensor);
+            } break;
         case GGML_OP_MAP_CUSTOM1:
             {
                 ggml_compute_forward_map_custom1(params, tensor);
@@ -2227,6 +2247,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_COUNT_EQUAL:
         case GGML_OP_SOLVE_TRI:
         case GGML_OP_GATED_DELTA_NET:
+        case GGML_OP_DSV4_HC_SPLIT_SINKHORN:
+        case GGML_OP_DSV4_HC_WEIGHTED_SUM:
+        case GGML_OP_DSV4_HC_EXPAND:
+        case GGML_OP_DSV4_FP8_KV_QUANTIZE:
+        case GGML_OP_DSV4_ROPE_TAIL:
             {
                 n_tasks = n_threads;
             } break;
@@ -2847,6 +2872,7 @@ struct ggml_cplan ggml_graph_plan(
                 case GGML_OP_SOFT_MAX:
                 case GGML_OP_ROPE:
                 case GGML_OP_ROPE_BACK:
+                case GGML_OP_DSV4_ROPE_TAIL:
                     {
                         cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     } break;
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp
index 7485ba4fc861..f473cb724725 100644
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -5970,6 +5970,127 @@ void ggml_compute_forward_rope_back(
     }
 }
 
+// ggml_compute_forward_dsv4_rope_tail
+// Per row: the first ne0 - n_dims elements are copied as-is, the last n_dims go through rotate_pairs; T is the element type.
+template<typename T>
+static void ggml_compute_forward_dsv4_rope_tail_flt(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+    const ggml_tensor * src2 = dst->src[2];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+
+    const int n_dims     = ((int32_t *) dst->op_params)[0]; // op_params[0..3] are integers
+    const int mode       = ((int32_t *) dst->op_params)[1];
+    const int n_ctx_orig = ((int32_t *) dst->op_params)[2];
+    const bool inverse   = ((int32_t *) dst->op_params)[3] != 0;
+
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base,   (int32_t *) dst->op_params + 4, sizeof(float)); // op_params[4..9] carry float bit patterns
+    memcpy(&freq_scale,  (int32_t *) dst->op_params + 5, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params + 6, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params + 7, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params + 8, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 9, sizeof(float));
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == nb00);
+    GGML_ASSERT(nb0 == sizeof(T)); // elements of a row must be densely packed: rows are indexed as T arrays below
+    GGML_ASSERT(n_dims <= ne0);
+    GGML_ASSERT(n_dims % 2 == 0);
+    GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX);
+
+    const int64_t n_nope = ne0 - n_dims; // length of the prefix that is copied without rotation
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nr = ggml_nrows(dst);
+    const int dr = (nr + nth - 1)/nth; // rows per thread, rounded up
+    const int ir0 = dr*ith;            // this thread handles rows [ir0, ir1)
+    const int ir1 = MIN(ir0 + dr, nr);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+
+    float corr_dims[2];
+    ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
+
+    const float * freq_factors = NULL;
+    if (src2 != NULL) { // src[2] is optional
+        GGML_ASSERT(src2->type == GGML_TYPE_F32);
+        GGML_ASSERT(src2->ne[0] >= n_dims / 2);
+        freq_factors = (const float *) src2->data;
+    }
+
+    const float sin_sign = inverse ? -1.0f : 1.0f;
+    const int32_t * pos = (const int32_t *) src1->data;
+
+    int ir = 0;           // running row counter over the (i3, i2, i1) loops
+    int64_t last_i2 = -1; // i2 for which this thread's cache was last filled
+
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                if (ir++ < ir0) continue; // row belongs to an earlier thread
+                if (ir   > ir1) break;    // past this thread's range
+
+                float * cache = (float *) params->wdata + (n_dims + CACHE_LINE_SIZE_F32)*ith; // slice of wdata offset by thread index
+                if (last_i2 != i2) { // pos[i2] is the only cache input that varies inside the loops
+                    const int64_t p = pos[i2];
+                    ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, n_dims, ext_factor, attn_factor, cache, sin_sign, theta_scale);
+                    last_i2 = i2;
+                }
+
+                const T * src = (const T *)((const char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
+                T * dst_data  =       (T *)((      char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1);
+
+                for (int64_t i0 = 0; i0 < n_nope; ++i0) { // prefix is copied verbatim
+                    dst_data[i0] = src[i0];
+                }
+
+                const T * src_tail = src + n_nope; // the rotated part starts right after the prefix
+                T * dst_tail = dst_data + n_nope;
+
+                switch (mode) { // NOTE(review): rotate_pairs is defined outside this hunk; argument meaning not visible here - confirm
+                    case GGML_ROPE_TYPE_NORMAL:
+                        rotate_pairs<T>(n_dims, 1, cache, src_tail, dst_tail, 1);
+                        break;
+                    case GGML_ROPE_TYPE_NEOX:
+                        rotate_pairs<T>(n_dims, n_dims/2, cache, src_tail, dst_tail);
+                        break;
+                    default:
+                        GGML_ABORT("rope type not supported");
+                }
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_dsv4_rope_tail(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    switch (src0->type) { // selects the template instantiation matching the element type of src[0]
+        case GGML_TYPE_F16:
+            {
+                ggml_compute_forward_dsv4_rope_tail_flt<ggml_fp16_t>(params, dst);
+            } break;
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_dsv4_rope_tail_flt<float>(params, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error"); // only F16 and F32 sources are handled
+            }
+    }
+}
+
 // ggml_compute_forward_conv_transpose_1d
 
 static void ggml_compute_forward_conv_transpose_1d_f16_f32(
@@ -10903,6 +11024,343 @@ void ggml_compute_forward_rwkv_wkv7(
     }
 }
 
+// ggml_compute_forward_dsv4_hc_split_sinkhorn
+// Each row of mixes yields [pre (n_hc) | post (n_hc) | comb (n_hc*n_hc)] in the matching dst row; rows are split across threads.
+void ggml_compute_forward_dsv4_hc_split_sinkhorn(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * mixes = dst->src[0];
+    const ggml_tensor * scale = dst->src[1];
+    const ggml_tensor * base  = dst->src[2];
+
+    GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+    GGML_ASSERT(scale->type == GGML_TYPE_F32);
+    GGML_ASSERT(base->type  == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type   == GGML_TYPE_F32);
+    GGML_ASSERT(mixes->nb[0] == sizeof(float)); // all four tensors are indexed as plain float arrays along dim 0
+    GGML_ASSERT(scale->nb[0] == sizeof(float));
+    GGML_ASSERT(base->nb[0]  == sizeof(float));
+    GGML_ASSERT(dst->nb[0]   == sizeof(float));
+
+    const int n_hc           = ggml_get_op_params_i32(dst, 0);
+    const int sinkhorn_iters = ggml_get_op_params_i32(dst, 1);
+    const float eps          = ggml_get_op_params_f32(dst, 2);
+    const int64_t mix_hc     = mixes->ne[0];
+    const int64_t n_rows     = ggml_nrows(mixes);
+
+    GGML_ASSERT(n_hc > 0 && n_hc <= 16); // the c[] scratch below holds at most 16*16 entries
+    GGML_ASSERT(sinkhorn_iters > 0);
+    GGML_ASSERT(mix_hc == (2 + n_hc) * n_hc);
+    GGML_ASSERT(ggml_nrows(dst) == n_rows);
+
+    const float * scale_data = (const float *) scale->data;
+    const float * base_data  = (const float *) base->data;
+
+    const float pre_scale  = scale_data[0]; // one multiplier per region: pre, post, comb
+    const float post_scale = scale_data[1];
+    const float comb_scale = scale_data[2];
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t dr = (n_rows + nth - 1) / nth; // rows per thread, rounded up
+    const int64_t r0 = dr * ith;                 // this thread handles rows [r0, r1)
+    const int64_t r1 = std::min(r0 + dr, n_rows);
+
+    for (int64_t r = r0; r < r1; ++r) {
+        const float * mix = (const float *) ((const char *) mixes->data + r*mixes->nb[1]);
+        float * out = (float *) ((char *) dst->data + r*dst->nb[1]);
+
+        for (int i = 0; i < n_hc; ++i) { // pre region: sigmoid(z) + eps
+            const float z = mix[i] * pre_scale + base_data[i];
+            out[i] = 1.0f / (1.0f + expf(-z)) + eps;
+        }
+
+        for (int i = 0; i < n_hc; ++i) { // post region: 2 * sigmoid(z)
+            const int off = n_hc + i;
+            const float z = mix[off] * post_scale + base_data[off];
+            out[off] = 2.0f / (1.0f + expf(-z));
+        }
+
+        float c[16*16]; // scratch for the n_hc x n_hc comb matrix of this row
+
+        // comb is laid out as a flat [n_hc*n_hc] block per token, written as
+        // c[src_hc + dst_hc*n_hc]; after the graph's reshape_3d this is ggml
+        // tensor comb[ne0=src_hc, ne1=dst_hc, ne2=token]. The Sinkhorn pass
+        // below softmaxes over src_hc, then alternates row/col normalization.
+        // ggml_dsv4_hc_expand intentionally reads ggml-dim0 as dst_hc, which
+        // transposes this matrix on read so it computes comb^T @ residual
+        // (the V4 hyperconnection contract). CPU/Metal/CUDA use the identical
+        // flat write + transposed read; do not "fix" one side in isolation.
+        for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { // softmax over src_hc for each dst_hc, then + eps
+            float row_max = -INFINITY;
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                const int idx = src_hc + dst_hc*n_hc;
+                const int off = 2*n_hc + idx;
+                const float v = mix[off] * comb_scale + base_data[off];
+                c[idx] = v;
+                row_max = std::max(row_max, v);
+            }
+
+            float row_sum = 0.0f;
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                const int idx = src_hc + dst_hc*n_hc;
+                const float v = expf(c[idx] - row_max); // the row maximum is subtracted before exponentiating
+                c[idx] = v;
+                row_sum += v;
+            }
+
+            const float inv_sum = 1.0f / row_sum;
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                const int idx = src_hc + dst_hc*n_hc;
+                c[idx] = c[idx] * inv_sum + eps;
+            }
+        }
+
+        for (int src_hc = 0; src_hc < n_hc; ++src_hc) { // first pass: divide each src_hc slice by (its sum over dst_hc + eps)
+            float sum = 0.0f;
+            for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                sum += c[src_hc + dst_hc*n_hc];
+            }
+
+            const float inv_denom = 1.0f / (sum + eps);
+            for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                c[src_hc + dst_hc*n_hc] *= inv_denom;
+            }
+        }
+
+        for (int iter = 1; iter < sinkhorn_iters; ++iter) { // remaining sinkhorn_iters - 1 rounds: both directions each round
+            for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) { // normalize over src_hc for each dst_hc
+                float sum = 0.0f;
+                for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                    sum += c[src_hc + dst_hc*n_hc];
+                }
+
+                const float inv_denom = 1.0f / (sum + eps);
+                for (int src_hc = 0; src_hc < n_hc; ++src_hc) {
+                    c[src_hc + dst_hc*n_hc] *= inv_denom;
+                }
+            }
+
+            for (int src_hc = 0; src_hc < n_hc; ++src_hc) { // normalize over dst_hc for each src_hc
+                float sum = 0.0f;
+                for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                    sum += c[src_hc + dst_hc*n_hc];
+                }
+
+                const float inv_denom = 1.0f / (sum + eps);
+                for (int dst_hc = 0; dst_hc < n_hc; ++dst_hc) {
+                    c[src_hc + dst_hc*n_hc] *= inv_denom;
+                }
+            }
+        }
+
+        for (int i = 0; i < n_hc*n_hc; ++i) { // comb region is stored after the pre and post regions
+            out[2*n_hc + i] = c[i];
+        }
+    }
+}
+
+// ggml_compute_forward_dsv4_hc_weighted_sum
+// dst[d, t] = sum over h of x[d, h, t] * weights[h, t]; the output elements are divided among the threads.
+void ggml_compute_forward_dsv4_hc_weighted_sum(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * x       = dst->src[0];
+    const ggml_tensor * weights = dst->src[1];
+
+    GGML_ASSERT(x->type       == GGML_TYPE_F32);
+    GGML_ASSERT(weights->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type     == GGML_TYPE_F32);
+    GGML_ASSERT(x->ne[0]       == dst->ne[0]);
+    GGML_ASSERT(x->ne[1]       == weights->ne[0]);
+    GGML_ASSERT(x->ne[2]       == dst->ne[1]);
+    GGML_ASSERT(weights->ne[1] == dst->ne[1]);
+    GGML_ASSERT(x->ne[3]       == 1);
+    GGML_ASSERT(weights->ne[2] == 1);
+    GGML_ASSERT(weights->ne[3] == 1);
+    GGML_ASSERT(dst->ne[2]     == 1);
+    GGML_ASSERT(dst->ne[3]     == 1);
+
+    const int64_t n_embd   = dst->ne[0];
+    const int64_t n_hc     = x->ne[1];
+    const int64_t n_tokens = dst->ne[1];
+    const int64_t n_elem   = n_embd * n_tokens; // number of output elements
+
+    const int64_t i0 = (n_elem * params->ith) / params->nth;       // this thread handles flat indices [i0, i1)
+    const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth;
+
+    const char * x_data = (const char *) x->data;
+    const char * w_data = (const char *) weights->data;
+          char * y_data = (      char *) dst->data;
+
+    for (int64_t i = i0; i < i1; ++i) {
+        const int64_t d = i % n_embd; // flat index -> (d, t)
+        const int64_t t = i / n_embd;
+
+        float acc = 0.0f;
+        for (int64_t h = 0; h < n_hc; ++h) { // every access goes through byte strides, so no contiguity is assumed
+            const float xv = *(const float *) (x_data + d*x->nb[0] + h*x->nb[1] + t*x->nb[2]);
+            const float wv = *(const float *) (w_data + h*weights->nb[0] + t*weights->nb[1]);
+            acc += xv * wv;
+        }
+
+        *(float *) (y_data + d*dst->nb[0] + t*dst->nb[1]) = acc;
+    }
+}
+
+// ggml_compute_forward_dsv4_hc_expand
+// dst[d, hc, t] = block_out[d, t] * post[hc, t] + sum over src_hc of comb(ne0=hc, ne1=src_hc, t) * residual[d, src_hc, t].
+void ggml_compute_forward_dsv4_hc_expand(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * block_out = dst->src[0];
+    const ggml_tensor * residual  = dst->src[1];
+    const ggml_tensor * post      = dst->src[2];
+    const ggml_tensor * comb      = dst->src[3];
+
+    GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+    GGML_ASSERT(residual->type  == GGML_TYPE_F32);
+    GGML_ASSERT(post->type      == GGML_TYPE_F32);
+    GGML_ASSERT(comb->type      == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type       == GGML_TYPE_F32);
+    GGML_ASSERT(block_out->ne[0] == dst->ne[0]);
+    GGML_ASSERT(block_out->ne[1] == dst->ne[2]);
+    GGML_ASSERT(residual->ne[0]  == dst->ne[0]);
+    GGML_ASSERT(residual->ne[1]  == dst->ne[1]);
+    GGML_ASSERT(residual->ne[2]  == dst->ne[2]);
+    GGML_ASSERT(post->ne[0]      == dst->ne[1]);
+    GGML_ASSERT(post->ne[1]      == dst->ne[2]);
+    GGML_ASSERT(comb->ne[0]      == dst->ne[1]);
+    GGML_ASSERT(comb->ne[1]      == dst->ne[1]);
+    GGML_ASSERT(comb->ne[2]      == dst->ne[2]);
+    GGML_ASSERT(block_out->ne[3] == 1);
+    GGML_ASSERT(residual->ne[3]  == 1);
+    GGML_ASSERT(post->ne[2]      == 1);
+    GGML_ASSERT(post->ne[3]      == 1);
+    GGML_ASSERT(comb->ne[3]      == 1);
+    GGML_ASSERT(dst->ne[3]       == 1);
+
+    const int64_t n_embd   = dst->ne[0];
+    const int64_t n_hc     = dst->ne[1];
+    const int64_t n_tokens = dst->ne[2];
+    const int64_t n_elem   = n_embd * n_hc * n_tokens; // number of output elements
+
+    const int64_t i0 = (n_elem * params->ith) / params->nth;       // this thread handles flat indices [i0, i1)
+    const int64_t i1 = (n_elem * (params->ith + 1)) / params->nth;
+
+    const char * block_data = (const char *) block_out->data;
+    const char * res_data   = (const char *) residual->data;
+    const char * post_data  = (const char *) post->data;
+    const char * comb_data  = (const char *) comb->data;
+          char * dst_data   = (      char *) dst->data;
+
+    for (int64_t i = i0; i < i1; ++i) {
+        const int64_t d      = i % n_embd; // flat index -> (d, dst_hc, t)
+        const int64_t tmp    = i / n_embd;
+        const int64_t dst_hc = tmp % n_hc;
+        const int64_t t      = tmp / n_hc;
+
+        const float block_v = *(const float *) (block_data + d*block_out->nb[0] + t*block_out->nb[1]);
+        const float post_v  = *(const float *) (post_data  + dst_hc*post->nb[0] + t*post->nb[1]);
+
+        float acc = block_v * post_v; // the post-weighted block output seeds the accumulator
+        // comb arrives as comb[ne0=src_hc, ne1=dst_hc, ne2=t] from
+        // dsv4_hc_split_sinkhorn (flat write src_hc + dst_hc*n_hc). Reading
+        // ne0 as dst_hc and ne1 as src_hc here transposes it, giving
+        // (comb^T @ residual)[d, dst_hc] = sum_src_hc comb[src_hc,dst_hc,t]
+        // * residual[d, src_hc, t]. This transpose is deliberate and matches
+        // the Metal/CUDA expand kernels (validated 19/19 vs this CPU oracle).
+        for (int64_t src_hc = 0; src_hc < n_hc; ++src_hc) {
+            const float comb_v = *(const float *) (comb_data + dst_hc*comb->nb[0] + src_hc*comb->nb[1] + t*comb->nb[2]);
+            const float res_v  = *(const float *) (res_data  + d*residual->nb[0] + src_hc*residual->nb[1] + t*residual->nb[2]);
+            acc += comb_v * res_v;
+        }
+
+        *(float *) (dst_data + d*dst->nb[0] + dst_hc*dst->nb[1] + t*dst->nb[2]) = acc;
+    }
+}
+
+static float ggml_dsv4_e4m3fn_dequant(float x) { // returns the value nearest to x on a 4-bit-exponent / 3-bit-mantissa grid
+    const float sign = x < 0.0f ? -1.0f : 1.0f;
+    const float ax = std::min(std::fabs(x), 448.0f); // magnitudes saturate at 448
+
+    int best = 0;          // code 0 decodes to 0.0f ...
+    float best_diff = ax;  // ... so its distance to ax is ax itself
+
+    for (int i = 1; i < 127; ++i) { // exhaustive search over the codes 1..126
+        const int exp  = (i >> 3) & 0x0f; // upper 4 bits: exponent field
+        const int mant = i & 0x07;        // lower 3 bits: mantissa field
+        const float val = exp == 0
+            ? std::ldexp(float(mant), -9)                     // exponent field 0: mant * 2^-9
+            : std::ldexp(1.0f + float(mant) / 8.0f, exp - 7); // otherwise: (1 + mant/8) * 2^(exp - 7)
+        const float diff = std::fabs(ax - val);
+        if (diff < best_diff || (diff == best_diff && (i & 1) == 0 && (best & 1) != 0)) { // on a tie an even code replaces an odd one
+            best = i;
+            best_diff = diff;
+        }
+    }
+
+    const int exp  = (best >> 3) & 0x0f; // decode the winning code once more
+    const int mant = best & 0x07;
+    const float val = exp == 0
+        ? std::ldexp(float(mant), -9)
+        : std::ldexp(1.0f + float(mant) / 8.0f, exp - 7);
+
+    return sign * val;
+}
+
+void ggml_compute_forward_dsv4_fp8_kv_quantize(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    // Per row: the first head_dim - n_rot elements are rounded in blocks of 64 that share one power-of-two scale; the rest is copied.
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type  == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+
+    const int64_t n_rot = ggml_get_op_params_i32(dst, 0);
+    const int64_t head_dim = src0->ne[0];
+    const int64_t n_nope = head_dim - n_rot; // length of the quantized prefix
+
+    GGML_ASSERT(n_rot >= 0);
+    GGML_ASSERT(n_nope > 0);
+    GGML_ASSERT(n_nope % 64 == 0); // the prefix is processed in whole blocks of 64
+
+    const int64_t n_rows = src0->ne[1] * src0->ne[2] * src0->ne[3];
+    const int64_t row_start = (n_rows * params->ith) / params->nth;       // this thread handles rows [row_start, row_end)
+    const int64_t row_end   = (n_rows * (params->ith + 1)) / params->nth;
+
+    for (int64_t row = row_start; row < row_end; ++row) {
+        const int64_t i1 = row % src0->ne[1]; // flat row index -> (i1, i2, i3)
+        const int64_t i2 = (row / src0->ne[1]) % src0->ne[2];
+        const int64_t i3 = row / (src0->ne[1] * src0->ne[2]);
+
+        const char * src_base = (const char *) src0->data + i1*src0->nb[1] + i2*src0->nb[2] + i3*src0->nb[3];
+        char       * dst_base = (      char *) dst->data  + i1*dst->nb[1]  + i2*dst->nb[2]  + i3*dst->nb[3];
+
+        for (int64_t off = 0; off < n_nope; off += 64) {
+            float amax = 0.0f; // largest magnitude inside the block
+            for (int64_t i = 0; i < 64; ++i) {
+                const float v = *(const float *) (src_base + (off + i)*src0->nb[0]);
+                amax = std::max(amax, std::fabs(v));
+            }
+
+            amax = std::max(amax, 1.0e-4f); // lower bound keeps log2 below away from a zero argument
+            const float scale = std::ldexp(1.0f, int(std::ceil(std::log2(amax / 448.0f)))); // 2^ceil(log2(amax / 448))
+            for (int64_t i = 0; i < 64; ++i) {
+                const float v = *(const float *) (src_base + (off + i)*src0->nb[0]);
+                *(float *) (dst_base + (off + i)*dst->nb[0]) = // scale down, round to the grid, scale back up
+                    ggml_dsv4_e4m3fn_dequant(std::clamp(v / scale, -448.0f, 448.0f)) * scale;
+            }
+        }
+
+        for (int64_t i = n_nope; i < head_dim; ++i) { // trailing n_rot elements are copied unchanged
+            *(float *) (dst_base + i*dst->nb[0]) = *(const float *) (src_base + i*src0->nb[0]);
+        }
+    }
+}
+
 // ggml_compute_forward_map_custom1
 
 void ggml_compute_forward_map_custom1(
diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h
index 7398e5618948..4da4db62aa4e 100644
--- a/ggml/src/ggml-cpu/ops.h
+++ b/ggml/src/ggml-cpu/ops.h
@@ -104,6 +104,11 @@ void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, s
 void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_split_sinkhorn(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_weighted_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_hc_expand(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_fp8_kv_quantize(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_dsv4_rope_tail(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 476c30797956..8b06c0bd5a49 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -1063,6 +1063,11 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "RWKV_WKV7",
     "SOLVE_TRI",
     "GATED_DELTA_NET",
+    "DSV4_HC_SPLIT_SINKHORN",
+    "DSV4_HC_WEIGHTED_SUM",
+    "DSV4_HC_EXPAND",
+    "DSV4_FP8_KV_QUANTIZE",
+    "DSV4_ROPE_TAIL",
 
     "UNARY",
 
@@ -1080,7 +1085,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",
@@ -1173,6 +1178,11 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "rwkv_wkv7(r, w, k, v, a, b, s)",
     "A X = B, A triangular, solve X",
     "gated_delta_net(q, k, v, g, beta, s)",
+    "dsv4_hc_split_sinkhorn(x)",
+    "dsv4_hc_weighted_sum(x)",
+    "dsv4_hc_expand(x)",
+    "dsv4_fp8_kv_quantize(x)",
+    "dsv4_rope_tail(x)",
 
     "unary(x)",
 
@@ -1190,7 +1200,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96");
+static_assert(GGML_OP_COUNT == 101, "GGML_OP_COUNT != 101");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -6230,6 +6240,180 @@ struct ggml_tensor * ggml_gated_delta_net(
     return result;
 }
 
+// ggml_dsv4_hc_split_sinkhorn
+// Creates a GGML_OP_DSV4_HC_SPLIT_SINKHORN node shaped like mixes; n_hc, sinkhorn_iters and eps are stored in op_params 0..2.
+struct ggml_tensor * ggml_dsv4_hc_split_sinkhorn(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * mixes,
+        struct ggml_tensor  * scale,
+        struct ggml_tensor  * base,
+        int                   n_hc,
+        int                   sinkhorn_iters,
+        float                 eps) {
+    GGML_ASSERT(mixes->type == GGML_TYPE_F32);
+    GGML_ASSERT(scale->type == GGML_TYPE_F32);
+    GGML_ASSERT(base->type  == GGML_TYPE_F32);
+
+    GGML_ASSERT(ggml_is_contiguous_rows(mixes));
+    GGML_ASSERT(ggml_is_contiguous(scale));
+    GGML_ASSERT(ggml_is_contiguous(base));
+
+    GGML_ASSERT(n_hc > 0);
+    GGML_ASSERT(n_hc <= 16);  // CPU forward uses a fixed float c[16*16] scratch
+    GGML_ASSERT(sinkhorn_iters > 0);
+    GGML_ASSERT(mixes->ne[0] == (2 + n_hc) * n_hc); // n_hc pre + n_hc post + n_hc*n_hc comb entries per row
+    GGML_ASSERT(mixes->ne[2] == 1);
+    GGML_ASSERT(mixes->ne[3] == 1);
+    GGML_ASSERT(ggml_nelements(scale) >= 3);            // the CPU forward reads scale[0], scale[1] and scale[2]
+    GGML_ASSERT(ggml_nelements(base)  >= mixes->ne[0]); // the CPU forward reads one base entry per mix entry
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, mixes);
+
+    ggml_set_op_params_i32(result, 0, n_hc);
+    ggml_set_op_params_i32(result, 1, sinkhorn_iters);
+    ggml_set_op_params_f32(result, 2, eps);
+
+    result->op     = GGML_OP_DSV4_HC_SPLIT_SINKHORN;
+    result->src[0] = mixes;
+    result->src[1] = scale;
+    result->src[2] = base;
+
+    return result;
+}
+
+// ggml_dsv4_hc_weighted_sum
+// Creates a GGML_OP_DSV4_HC_WEIGHTED_SUM node: the result is a new F32 tensor of shape [x->ne[0], x->ne[2]].
+struct ggml_tensor * ggml_dsv4_hc_weighted_sum(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * weights) {
+    GGML_ASSERT(x->type       == GGML_TYPE_F32);
+    GGML_ASSERT(weights->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(x->ne[1] == weights->ne[0]); // dim 1 of x is the dimension summed over by the CPU forward
+    GGML_ASSERT(x->ne[2] == weights->ne[1]);
+    GGML_ASSERT(x->ne[3] == 1);
+    GGML_ASSERT(weights->ne[2] == 1);
+    GGML_ASSERT(weights->ne[3] == 1);
+
+    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, x->ne[0], x->ne[2]);
+
+    result->op     = GGML_OP_DSV4_HC_WEIGHTED_SUM;
+    result->src[0] = x;
+    result->src[1] = weights;
+
+    return result;
+}
+
+// ggml_dsv4_hc_expand
+// Creates a GGML_OP_DSV4_HC_EXPAND node shaped like residual, with sources ordered block_out, residual, post, comb.
+struct ggml_tensor * ggml_dsv4_hc_expand(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * block_out,
+        struct ggml_tensor  * residual,
+        struct ggml_tensor  * post,
+        struct ggml_tensor  * comb) {
+    GGML_ASSERT(block_out->type == GGML_TYPE_F32);
+    GGML_ASSERT(residual->type  == GGML_TYPE_F32);
+    GGML_ASSERT(post->type      == GGML_TYPE_F32);
+    GGML_ASSERT(comb->type      == GGML_TYPE_F32);
+
+    GGML_ASSERT(block_out->ne[0] == residual->ne[0]); // block_out is 2D: dims 0 and 2 of residual
+    GGML_ASSERT(block_out->ne[1] == residual->ne[2]);
+    GGML_ASSERT(block_out->ne[2] == 1);
+    GGML_ASSERT(block_out->ne[3] == 1);
+    GGML_ASSERT(post->ne[0] == residual->ne[1]); // post is 2D: dims 1 and 2 of residual
+    GGML_ASSERT(post->ne[1] == residual->ne[2]);
+    GGML_ASSERT(post->ne[2] == 1);
+    GGML_ASSERT(post->ne[3] == 1);
+    GGML_ASSERT(comb->ne[0] == residual->ne[1]); // comb is square in its first two dims, one matrix per index of dim 2
+    GGML_ASSERT(comb->ne[1] == residual->ne[1]);
+    GGML_ASSERT(comb->ne[2] == residual->ne[2]);
+    GGML_ASSERT(comb->ne[3] == 1);
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, residual);
+
+    result->op     = GGML_OP_DSV4_HC_EXPAND;
+    result->src[0] = block_out;
+    result->src[1] = residual;
+    result->src[2] = post;
+    result->src[3] = comb;
+
+    return result;
+}
+
+// ggml_dsv4_fp8_kv_quantize
+// Creates a GGML_OP_DSV4_FP8_KV_QUANTIZE node shaped like a; n_rot is stored in op_params slot 0.
+struct ggml_tensor * ggml_dsv4_fp8_kv_quantize(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        int                   n_rot) {
+    GGML_ASSERT(a->type == GGML_TYPE_F32);
+    GGML_ASSERT(n_rot >= 0);
+    GGML_ASSERT(a->ne[0] > n_rot);             // at least one element must remain in front of the last n_rot
+    GGML_ASSERT((a->ne[0] - n_rot) % 64 == 0); // the CPU forward walks that leading part in blocks of 64
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    ggml_set_op_params_i32(result, 0, n_rot);
+
+    result->op     = GGML_OP_DSV4_FP8_KV_QUANTIZE;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_dsv4_rope_tail
+// Creates a GGML_OP_DSV4_ROPE_TAIL node shaped like a; the scalar arguments are packed into op_params (layout below).
+struct ggml_tensor * ggml_dsv4_rope_tail(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * pos,
+        struct ggml_tensor  * freq_factors,
+        int                   n_dims,
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
+        bool                  inverse) {
+    GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+    GGML_ASSERT(mode == GGML_ROPE_TYPE_NORMAL || mode == GGML_ROPE_TYPE_NEOX);
+    GGML_ASSERT(a->type == GGML_TYPE_F32 || a->type == GGML_TYPE_F16);
+    GGML_ASSERT(pos->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_vector(pos));
+    GGML_ASSERT(a->ne[2] == pos->ne[0]); // one position per index along dim 2 of a
+    GGML_ASSERT(n_dims > 0);
+    GGML_ASSERT(n_dims <= a->ne[0]);
+    GGML_ASSERT(n_dims % 2 == 0);
+
+    if (freq_factors) { // optional input: validated only when supplied
+        GGML_ASSERT(freq_factors->type == GGML_TYPE_F32);
+        GGML_ASSERT(freq_factors->ne[0] >= n_dims / 2);
+    }
+
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
+
+    int32_t params[16] = { n_dims, mode, n_ctx_orig, inverse ? 1 : 0 }; // slots 0..3 are integers; unlisted slots start at zero
+    memcpy(params +  4, &freq_base,   sizeof(float)); // slots 4..9 hold the float arguments as raw bits
+    memcpy(params +  5, &freq_scale,  sizeof(float));
+    memcpy(params +  6, &ext_factor,  sizeof(float));
+    memcpy(params +  7, &attn_factor, sizeof(float));
+    memcpy(params +  8, &beta_fast,   sizeof(float));
+    memcpy(params +  9, &beta_slow,   sizeof(float));
+    ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = GGML_OP_DSV4_ROPE_TAIL;
+    result->src[0] = a;
+    result->src[1] = pos;
+    result->src[2] = freq_factors; // stored as passed, so it is NULL when the caller gave no factors
+
+    return result;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_hash_set ggml_hash_set_new(size_t size) {
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index c25f217f990e..8f44c7965e87 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -145,6 +145,10 @@ class LLM:
         INTERLEAVE_MOE_LAYER_STEP         = "{arch}.interleave_moe_layer_step"
         FULL_ATTENTION_INTERVAL           = "{arch}.full_attention_interval"
         ACTIVATION_SPARSITY_SCALE         = "{arch}.activation_sparsity_scale"
+        HASH_LAYER_COUNT                  = "{arch}.hash_layer_count"
+        HYPER_CONNECTION_COUNT            = "{arch}.hyper_connection.count"
+        HYPER_CONNECTION_SINKHORN_ITERS   = "{arch}.hyper_connection.sinkhorn_iterations"
+        HYPER_CONNECTION_EPS              = "{arch}.hyper_connection.epsilon"
         ALTUP_ACTIVE_IDX                  = "{arch}.altup.active_idx"
         ALTUP_NUM_INPUTS                  = "{arch}.altup.num_inputs"
         EMBD_LENGTH_PER_LAYER_INP         = "{arch}.embedding_length_per_layer_input"
@@ -184,6 +188,10 @@ class Attention:
         SHARED_KV_LAYERS             = "{arch}.attention.shared_kv_layers"
         SLIDING_WINDOW_PATTERN       = "{arch}.attention.sliding_window_pattern"
         TEMPERATURE_SCALE            = "{arch}.attention.temperature_scale"
+        COMPRESS_RATIOS              = "{arch}.attention.compress_ratios"
+        COMPRESS_ROPE_FREQ_BASE      = "{arch}.attention.compress_rope_freq_base"
+        OUTPUT_LORA_RANK             = "{arch}.attention.output_lora_rank"
+        OUTPUT_GROUP_COUNT           = "{arch}.attention.output_group_count"
 
         class Indexer:
             HEAD_COUNT = "{arch}.attention.indexer.head_count"
@@ -451,6 +459,7 @@ class MODEL_ARCH(IntEnum):
     DEEPSEEK         = auto()
     DEEPSEEK2        = auto()
     DEEPSEEK2OCR     = auto()
+    DEEPSEEK4        = auto()
     CHATGLM          = auto()
     GLM4             = auto()
     GLM4_MOE         = auto()
@@ -527,6 +536,9 @@ class MODEL_TENSOR(IntEnum):
     TOKEN_TYPES          = auto()
     POS_EMBD             = auto()
     OUTPUT               = auto()
+    OUTPUT_HC_BASE       = auto() # deepseek4 hyper-connection output
+    OUTPUT_HC_FN         = auto() # deepseek4 hyper-connection output
+    OUTPUT_HC_SCALE      = auto() # deepseek4 hyper-connection output
     DENSE_2_OUT          = auto() # embeddinggemma 2_Dense
     DENSE_3_OUT          = auto() # embeddinggemma 3_Dense
     OUTPUT_NORM          = auto()
@@ -650,12 +662,19 @@ class MODEL_TENSOR(IntEnum):
     CHANNEL_MIX_VALUE    = auto()
     ATTN_Q_A             = auto()
     ATTN_Q_B             = auto()
+    ATTN_KV              = auto() # deepseek4 single-tensor combined KV projection
     ATTN_KV_A_MQA        = auto()
     ATTN_KV_B            = auto()
     ATTN_K_B             = auto()
     ATTN_V_B             = auto()
+    ATTN_OUT_A           = auto() # deepseek4 attention output LoRA
+    ATTN_OUT_B           = auto() # deepseek4 attention output LoRA
     ATTN_Q_A_NORM        = auto()
     ATTN_KV_A_NORM       = auto()
+    ATTN_COMPRESSOR_APE  = auto() # deepseek4 attention compressor
+    ATTN_COMPRESSOR_KV   = auto() # deepseek4 attention compressor
+    ATTN_COMPRESSOR_GATE = auto() # deepseek4 attention compressor
+    ATTN_COMPRESSOR_NORM = auto() # deepseek4 attention compressor
     FFN_SUB_NORM         = auto()
     ATTN_SUB_NORM        = auto()
     DEC_ATTN_NORM        = auto()
@@ -717,6 +736,17 @@ class MODEL_TENSOR(IntEnum):
     INDEXER_PROJ         = auto()
     INDEXER_ATTN_K       = auto()
     INDEXER_ATTN_Q_B     = auto()
+    INDEXER_COMPRESSOR_APE  = auto() # deepseek4 indexer compressor
+    INDEXER_COMPRESSOR_KV   = auto() # deepseek4 indexer compressor
+    INDEXER_COMPRESSOR_GATE = auto() # deepseek4 indexer compressor
+    INDEXER_COMPRESSOR_NORM = auto() # deepseek4 indexer compressor
+    HC_ATTN_BASE         = auto() # deepseek4 hyper-connection attention
+    HC_ATTN_FN           = auto() # deepseek4 hyper-connection attention
+    HC_ATTN_SCALE        = auto() # deepseek4 hyper-connection attention
+    HC_FFN_BASE          = auto() # deepseek4 hyper-connection ffn
+    HC_FFN_FN            = auto() # deepseek4 hyper-connection ffn
+    HC_FFN_SCALE         = auto() # deepseek4 hyper-connection ffn
+    FFN_GATE_TID2EID     = auto() # deepseek4 token-id-to-expert-id gating
     # vision
     V_MMPROJ             = auto()
     V_MMPROJ_FC          = auto()
@@ -966,6 +996,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.DEEPSEEK:         "deepseek",
     MODEL_ARCH.DEEPSEEK2:        "deepseek2",
     MODEL_ARCH.DEEPSEEK2OCR:     "deepseek2-ocr",
+    MODEL_ARCH.DEEPSEEK4:        "deepseek4",
     MODEL_ARCH.CHATGLM:          "chatglm",
     MODEL_ARCH.GLM4:             "glm4",
     MODEL_ARCH.GLM4_MOE:         "glm4moe",
@@ -1042,6 +1073,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.POS_EMBD:                  "position_embd",
     MODEL_TENSOR.OUTPUT_NORM:               "output_norm",
     MODEL_TENSOR.OUTPUT:                    "output",
+    MODEL_TENSOR.OUTPUT_HC_BASE:            "output_hc_base",
+    MODEL_TENSOR.OUTPUT_HC_FN:              "output_hc_fn",
+    MODEL_TENSOR.OUTPUT_HC_SCALE:           "output_hc_scale",
     MODEL_TENSOR.DENSE_2_OUT:                "dense_2", # embeddinggemma 2_Dense
     MODEL_TENSOR.DENSE_3_OUT:                "dense_3", # embeddinggemma 2_Dense
     MODEL_TENSOR.ROPE_FREQS:                "rope_freqs",
@@ -1164,12 +1198,19 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.CHANNEL_MIX_VALUE:         "blk.{bid}.channel_mix_value",
     MODEL_TENSOR.ATTN_Q_A:                  "blk.{bid}.attn_q_a",
     MODEL_TENSOR.ATTN_Q_B:                  "blk.{bid}.attn_q_b",
+    MODEL_TENSOR.ATTN_KV:                   "blk.{bid}.attn_kv",
     MODEL_TENSOR.ATTN_KV_A_MQA:             "blk.{bid}.attn_kv_a_mqa",
     MODEL_TENSOR.ATTN_KV_B:                 "blk.{bid}.attn_kv_b",
     MODEL_TENSOR.ATTN_K_B:                  "blk.{bid}.attn_k_b",
     MODEL_TENSOR.ATTN_V_B:                  "blk.{bid}.attn_v_b",
+    MODEL_TENSOR.ATTN_OUT_A:                "blk.{bid}.attn_output_a",
+    MODEL_TENSOR.ATTN_OUT_B:                "blk.{bid}.attn_output_b",
     MODEL_TENSOR.ATTN_Q_A_NORM:             "blk.{bid}.attn_q_a_norm",
     MODEL_TENSOR.ATTN_KV_A_NORM:            "blk.{bid}.attn_kv_a_norm",
+    MODEL_TENSOR.ATTN_COMPRESSOR_APE:       "blk.{bid}.attn_compressor_ape",
+    MODEL_TENSOR.ATTN_COMPRESSOR_KV:        "blk.{bid}.attn_compressor_kv",
+    MODEL_TENSOR.ATTN_COMPRESSOR_GATE:      "blk.{bid}.attn_compressor_gate",
+    MODEL_TENSOR.ATTN_COMPRESSOR_NORM:      "blk.{bid}.attn_compressor_norm",
     MODEL_TENSOR.ATTN_SUB_NORM:             "blk.{bid}.attn_sub_norm",
     MODEL_TENSOR.FFN_SUB_NORM:              "blk.{bid}.ffn_sub_norm",
     MODEL_TENSOR.DEC_ATTN_NORM:             "dec.blk.{bid}.attn_norm",
@@ -1231,6 +1272,17 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.INDEXER_PROJ:              "blk.{bid}.indexer.proj",
     MODEL_TENSOR.INDEXER_ATTN_K:            "blk.{bid}.indexer.attn_k",
     MODEL_TENSOR.INDEXER_ATTN_Q_B:          "blk.{bid}.indexer.attn_q_b",
+    MODEL_TENSOR.INDEXER_COMPRESSOR_APE:    "blk.{bid}.indexer_compressor_ape",
+    MODEL_TENSOR.INDEXER_COMPRESSOR_KV:     "blk.{bid}.indexer_compressor_kv",
+    MODEL_TENSOR.INDEXER_COMPRESSOR_GATE:   "blk.{bid}.indexer_compressor_gate",
+    MODEL_TENSOR.INDEXER_COMPRESSOR_NORM:   "blk.{bid}.indexer_compressor_norm",
+    MODEL_TENSOR.HC_ATTN_BASE:              "blk.{bid}.hc_attn_base",
+    MODEL_TENSOR.HC_ATTN_FN:                "blk.{bid}.hc_attn_fn",
+    MODEL_TENSOR.HC_ATTN_SCALE:             "blk.{bid}.hc_attn_scale",
+    MODEL_TENSOR.HC_FFN_BASE:               "blk.{bid}.hc_ffn_base",
+    MODEL_TENSOR.HC_FFN_FN:                 "blk.{bid}.hc_ffn_fn",
+    MODEL_TENSOR.HC_FFN_SCALE:              "blk.{bid}.hc_ffn_scale",
+    MODEL_TENSOR.FFN_GATE_TID2EID:          "blk.{bid}.ffn_gate_tid2eid",
     # vision
     MODEL_TENSOR.V_MMPROJ:                  "mm.{bid}",
     MODEL_TENSOR.V_MMPROJ_FC:               "mm.model.fc",
@@ -2928,6 +2980,49 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_UP_SHEXP,
         MODEL_TENSOR.FFN_EXP_PROBS_B,
     ],
+    MODEL_ARCH.DEEPSEEK4: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.OUTPUT_HC_BASE,
+        MODEL_TENSOR.OUTPUT_HC_FN,
+        MODEL_TENSOR.OUTPUT_HC_SCALE,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_SINKS,
+        MODEL_TENSOR.ATTN_Q_A,
+        MODEL_TENSOR.ATTN_Q_B,
+        MODEL_TENSOR.ATTN_Q_A_NORM,
+        MODEL_TENSOR.ATTN_KV,
+        MODEL_TENSOR.ATTN_KV_A_NORM,
+        MODEL_TENSOR.ATTN_OUT_A,
+        MODEL_TENSOR.ATTN_OUT_B,
+        MODEL_TENSOR.ATTN_COMPRESSOR_APE,
+        MODEL_TENSOR.ATTN_COMPRESSOR_KV,
+        MODEL_TENSOR.ATTN_COMPRESSOR_GATE,
+        MODEL_TENSOR.ATTN_COMPRESSOR_NORM,
+        MODEL_TENSOR.INDEXER_PROJ,
+        MODEL_TENSOR.INDEXER_ATTN_Q_B,
+        MODEL_TENSOR.INDEXER_COMPRESSOR_APE,
+        MODEL_TENSOR.INDEXER_COMPRESSOR_KV,
+        MODEL_TENSOR.INDEXER_COMPRESSOR_GATE,
+        MODEL_TENSOR.INDEXER_COMPRESSOR_NORM,
+        MODEL_TENSOR.FFN_GATE_INP,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
+        MODEL_TENSOR.FFN_UP_EXP,
+        MODEL_TENSOR.FFN_GATE_SHEXP,
+        MODEL_TENSOR.FFN_DOWN_SHEXP,
+        MODEL_TENSOR.FFN_UP_SHEXP,
+        MODEL_TENSOR.FFN_EXP_PROBS_B,
+        MODEL_TENSOR.FFN_GATE_TID2EID,
+        MODEL_TENSOR.HC_ATTN_BASE,
+        MODEL_TENSOR.HC_ATTN_FN,
+        MODEL_TENSOR.HC_ATTN_SCALE,
+        MODEL_TENSOR.HC_FFN_BASE,
+        MODEL_TENSOR.HC_FFN_FN,
+        MODEL_TENSOR.HC_FFN_SCALE,
+    ],
     MODEL_ARCH.ERNIE4_5_MOE: [
         MODEL_TENSOR.TOKEN_EMBD,
         MODEL_TENSOR.OUTPUT_NORM,
@@ -4147,6 +4242,8 @@ class GGMLQuantizationType(IntEnum):
 class ExpertGatingFuncType(IntEnum):
     SOFTMAX  = 1
     SIGMOID  = 2
+    SOFTMAX_WEIGHT = 3  # NOTE(review): presumably must match the loader-side gating enum value - confirm
+    SQRTSOFTPLUS   = 4  # NOTE(review): presumably sqrt(softplus(x)) router scoring used by deepseek4 - confirm
 
 
 # TODO: add GGMLFileType from ggml_ftype in ggml.h
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index a101382719d0..4e3c94de12b9 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -868,6 +868,18 @@ def add_moe_latent_size(self, value: int) -> None:
     def add_nextn_predict_layers(self, count: int) -> None:
         self.add_uint32(Keys.LLM.NEXTN_PREDICT_LAYERS.format(arch=self.arch), count)
 
+    def add_hash_layer_count(self, count: int) -> None:  # stores count as uint32 under "{arch}.hash_layer_count"
+        self.add_uint32(Keys.LLM.HASH_LAYER_COUNT.format(arch=self.arch), count)
+
+    def add_hyper_connection_count(self, count: int) -> None:  # stores count as uint32 under "{arch}.hyper_connection.count"
+        self.add_uint32(Keys.LLM.HYPER_CONNECTION_COUNT.format(arch=self.arch), count)
+
+    def add_hyper_connection_sinkhorn_iters(self, count: int) -> None:  # uint32 under "{arch}.hyper_connection.sinkhorn_iterations"
+        self.add_uint32(Keys.LLM.HYPER_CONNECTION_SINKHORN_ITERS.format(arch=self.arch), count)
+
+    def add_hyper_connection_eps(self, value: float) -> None:  # stores value as float32 under "{arch}.hyper_connection.epsilon"
+        self.add_float32(Keys.LLM.HYPER_CONNECTION_EPS.format(arch=self.arch), value)
+
     def add_swin_norm(self, value: bool) -> None:
         self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value)
 
@@ -952,6 +964,18 @@ def add_attn_temperature_length(self, value: int) -> None:
     def add_attn_temperature_scale(self, value: float) -> None:
         self.add_float32(Keys.Attention.TEMPERATURE_SCALE.format(arch=self.arch), value)
 
+    def add_attention_compress_ratios(self, values: Sequence[int]) -> None:  # array KV under "{arch}.attention.compress_ratios"
+        self.add_array(Keys.Attention.COMPRESS_RATIOS.format(arch=self.arch), values)
+
+    def add_attention_compress_rope_freq_base(self, value: float) -> None:  # float32 under "{arch}.attention.compress_rope_freq_base"
+        self.add_float32(Keys.Attention.COMPRESS_ROPE_FREQ_BASE.format(arch=self.arch), value)
+
+    def add_attention_output_lora_rank(self, value: int) -> None:  # uint32 under "{arch}.attention.output_lora_rank"
+        self.add_uint32(Keys.Attention.OUTPUT_LORA_RANK.format(arch=self.arch), value)
+
+    def add_attention_output_group_count(self, value: int) -> None:  # uint32 under "{arch}.attention.output_group_count"
+        self.add_uint32(Keys.Attention.OUTPUT_GROUP_COUNT.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
 
diff --git a/models/templates/deepseek-ai-DeepSeek-V4.jinja b/models/templates/deepseek-ai-DeepSeek-V4.jinja
new file mode 100644
index 000000000000..44d5b785ec04
--- /dev/null
+++ b/models/templates/deepseek-ai-DeepSeek-V4.jinja
@@ -0,0 +1,96 @@
+{%- if not add_generation_prompt is defined -%}{#- Flag not supplied by the caller: default to no trailing assistant header. -#}
+  {%- set add_generation_prompt = false -%}
+{%- endif -%}
+{%- if not thinking is defined -%}{#- An explicit thinking value wins; otherwise fall back to enable_thinking, then to false. -#}
+  {%- if enable_thinking is defined -%}
+    {%- set thinking = enable_thinking -%}
+  {%- else -%}
+    {%- set thinking = false -%}
+  {%- endif -%}
+{%- endif -%}
+{%- set dsml_token = '｜DSML｜' -%}{#- Marker spliced into every tool-call tag this template writes. -#}
+{%- set thinking_start_token = '<think>' -%}
+{%- set thinking_end_token = '</think>' -%}
+{%- set tools_header = '## Tools\n\nYou have access to a set of tools to help answer the user question. You can invoke tools by writing a "<' + dsml_token + 'tool_calls>" block like the following:\n\n<' + dsml_token + 'tool_calls>\n<' + dsml_token + 'invoke name="$TOOL_NAME">\n<' + dsml_token + 'parameter name="$PARAMETER_NAME" string="true|false">$PARAMETER_VALUE</' + dsml_token + 'parameter>\n...\n</' + dsml_token + 'invoke>\n<' + dsml_token + 'invoke name="$TOOL_NAME2">\n...\n</' + dsml_token + 'invoke>\n</' + dsml_token + 'tool_calls>\n\nString parameters should be specified as is and set `string="true"`. For all other types (numbers, booleans, arrays, objects), pass the value in JSON format and set `string="false"`.\n\nIf thinking_mode is enabled (triggered by ' + thinking_start_token + '), you MUST output your complete reasoning inside ' + thinking_start_token + '...' + thinking_end_token + ' BEFORE any tool calls or final response.\n\nOtherwise, output directly after ' + thinking_end_token + ' with tool calls or final response.\n\n### Available Tool Schemas\n\n' -%}
+{%- set tools_footer = '\n\nYou MUST strictly follow the above defined tool name and parameter schemas to invoke tool calls.' -%}
+{%- set ns = namespace(system_prompt='', is_first_system=true, pending_assistant=false, pending_tool_result=false) -%}{#- Namespace object so values assigned inside the loops below stay visible after each iteration. -#}
+{%- for message in messages -%}{#- First pass: fold every system message, wherever it appears, into ns.system_prompt. -#}
+  {%- if message['role'] == 'system' -%}
+    {%- if ns.is_first_system -%}
+      {%- set ns.system_prompt = ns.system_prompt + (message['content'] or '') -%}
+      {%- set ns.is_first_system = false -%}
+    {%- else -%}{#- Later system messages are appended after a blank line. -#}
+      {%- set ns.system_prompt = ns.system_prompt + '\n\n' + (message['content'] or '') -%}
+    {%- endif -%}
+  {%- endif -%}
+{%- endfor -%}
+{%- if tools is defined and tools -%}{#- Tool instructions are added only when a non-empty tools list is passed in. -#}
+  {%- set ts = namespace(schemas='') -%}
+  {%- for tool in tools -%}
+    {%- if tool['type'] == 'function' -%}{#- Only entries typed as function contribute; each schema is one JSON object per line. -#}
+      {%- set ts.schemas = ts.schemas + (tool['function'] | tojson) + '\n' -%}
+    {%- endif -%}
+  {%- endfor -%}
+  {%- if ns.system_prompt -%}{#- With an existing system prompt the tool section goes after it, separated by a blank line. -#}
+    {%- set ns.system_prompt = ns.system_prompt + '\n\n' + tools_header + ts.schemas + tools_footer -%}
+  {%- else -%}
+    {%- set ns.system_prompt = tools_header + ts.schemas + tools_footer -%}
+  {%- endif -%}
+{%- endif -%}
+{{- bos_token -}}
+{{- ns.system_prompt -}}
+{%- for message in messages -%}{#- Second pass: this loop has no branch for system messages, they were already written above. -#}
+  {%- if message['role'] == 'user' -%}
+    {{- '<｜User｜>' + (message['content'] or '') -}}
+    {%- set ns.pending_assistant = true -%}
+    {%- set ns.pending_tool_result = true -%}
+  {%- elif message['role'] == 'tool' -%}
+    {%- if not ns.pending_tool_result -%}{#- No user or tool message since the last assistant message: open a user turn first. -#}
+      {{- '<｜User｜>' -}}
+    {%- endif -%}
+    {{- '<tool_result>' + (message['content'] or '') + '</tool_result>' -}}
+    {%- set ns.pending_assistant = true -%}
+    {%- set ns.pending_tool_result = true -%}
+  {%- elif message['role'] == 'assistant' -%}
+    {%- if ns.pending_assistant -%}{#- Header and thinking prefix are written only if a user or tool message came since the last assistant message. -#}
+      {{- '<｜Assistant｜>' -}}
+      {%- if thinking and message['reasoning_content'] is defined and message['reasoning_content'] -%}{#- Stored reasoning is replayed only in thinking mode; otherwise just the closing token is written. -#}
+        {{- thinking_start_token + message['reasoning_content'] + thinking_end_token -}}
+      {%- else -%}
+        {{- thinking_end_token -}}
+      {%- endif -%}
+    {%- endif -%}
+    {{- (message['content'] or '') -}}
+    {%- if message['tool_calls'] -%}
+      {{- '\n\n<' + dsml_token + 'tool_calls>\n' -}}
+      {%- for tool in message['tool_calls'] -%}
+        {%- set func = tool['function'] -%}
+        {{- '<' + dsml_token + 'invoke name="' + func['name'] + '">\n' -}}
+        {%- set args = func['arguments'] -%}
+        {%- if args is string -%}{#- Arguments may arrive JSON-encoded; decode them so the key and value pairs can be iterated. -#}
+          {%- set args = args | from_json -%}
+        {%- endif -%}
+        {%- for key, val in args.items() -%}
+          {%- if val is string -%}{#- String values are written verbatim; every other value is JSON-encoded and flagged as non-string. -#}
+            {{- '<' + dsml_token + 'parameter name="' + key + '" string="true">' + val + '</' + dsml_token + 'parameter>\n' -}}
+          {%- else -%}
+            {{- '<' + dsml_token + 'parameter name="' + key + '" string="false">' + (val | tojson) + '</' + dsml_token + 'parameter>\n' -}}
+          {%- endif -%}
+        {%- endfor -%}
+        {{- '</' + dsml_token + 'invoke>\n' -}}
+      {%- endfor -%}
+      {{- '</' + dsml_token + 'tool_calls>' -}}
+    {%- endif -%}
+    {{- '<｜end▁of▁sentence｜>' -}}
+    {%- set ns.pending_assistant = false -%}
+    {%- set ns.pending_tool_result = false -%}
+  {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt and ns.pending_assistant -%}{#- Open an assistant turn only when a user or tool message is still waiting for a reply. -#}
+  {{- '<｜Assistant｜>' -}}
+  {%- if thinking -%}{#- Thinking mode leaves the reasoning block open for the model; otherwise it is written pre-closed. -#}
+    {{- thinking_start_token -}}
+  {%- else -%}
+    {{- thinking_end_token -}}
+  {%- endif -%}
+{%- endif -%}
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index c9eead18aa39..e789e5a681ae 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_DEEPSEEK,         "deepseek"         },
     { LLM_ARCH_DEEPSEEK2,        "deepseek2"        },
     { LLM_ARCH_DEEPSEEK2OCR,     "deepseek2-ocr"    },
+    { LLM_ARCH_DEEPSEEK4,        "deepseek4"        },
     { LLM_ARCH_CHATGLM,          "chatglm"          },
     { LLM_ARCH_GLM4,             "glm4"             },
     { LLM_ARCH_GLM4_MOE,         "glm4moe"          },
@@ -209,6 +210,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKEN_SHIFT_COUNT,                 "%s.token_shift_count"                 },
     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         "%s.interleave_moe_layer_step"         },
     { LLM_KV_FULL_ATTENTION_INTERVAL,           "%s.full_attention_interval"           },
+    { LLM_KV_HASH_LAYER_COUNT,                  "%s.hash_layer_count"                  },
+    { LLM_KV_HYPER_CONNECTION_COUNT,            "%s.hyper_connection.count"            },
+    { LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS,   "%s.hyper_connection.sinkhorn_iterations" },
+    { LLM_KV_HYPER_CONNECTION_EPS,              "%s.hyper_connection.epsilon"          },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,                   "%s.attention.head_count"                   },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,                "%s.attention.head_count_kv"                },
@@ -243,6 +248,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,           "%s.attention.indexer.key_length"           },
     { LLM_KV_ATTENTION_INDEXER_TOP_K,                "%s.attention.indexer.top_k"                },
     { LLM_KV_ATTENTION_SHARED_KV_LAYERS,             "%s.attention.shared_kv_layers"             },
+    { LLM_KV_ATTENTION_COMPRESS_RATIOS,              "%s.attention.compress_ratios"              },
+    { LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE,      "%s.attention.compress_rope_freq_base"      },
+    { LLM_KV_ATTENTION_OUTPUT_LORA_RANK,             "%s.attention.output_lora_rank"             },
+    { LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT,           "%s.attention.output_group_count"           },
 
     { LLM_KV_ROPE_DIMENSION_COUNT,           "%s.rope.dimension_count"                 },
     { LLM_KV_ROPE_DIMENSION_COUNT_SWA,       "%s.rope.dimension_count_swa"             },
@@ -346,6 +355,9 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_OUTPUT_NORM,                            "output_norm" },
     { LLM_TENSOR_OUTPUT_NORM_LFM2,                       "token_embd_norm" }, // fix for wrong tensor name
     { LLM_TENSOR_OUTPUT,                                 "output" },
+    { LLM_TENSOR_OUTPUT_HC_BASE,                         "output_hc_base" },
+    { LLM_TENSOR_OUTPUT_HC_FN,                           "output_hc_fn" },
+    { LLM_TENSOR_OUTPUT_HC_SCALE,                        "output_hc_scale" },
     { LLM_TENSOR_ROPE_FREQS,                             "rope_freqs" },
     { LLM_TENSOR_ATTN_NORM,                              "blk.%d.attn_norm" },
     { LLM_TENSOR_ATTN_Q,                                 "blk.%d.attn_q" },
@@ -422,8 +434,15 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_ATTN_KV_A_NORM,                         "blk.%d.attn_kv_a_norm" },
     { LLM_TENSOR_ATTN_Q_A,                               "blk.%d.attn_q_a" },
     { LLM_TENSOR_ATTN_Q_B,                               "blk.%d.attn_q_b" },
+    { LLM_TENSOR_ATTN_KV,                                "blk.%d.attn_kv" },
     { LLM_TENSOR_ATTN_KV_A_MQA,                          "blk.%d.attn_kv_a_mqa" },
     { LLM_TENSOR_ATTN_KV_B,                              "blk.%d.attn_kv_b" },
+    { LLM_TENSOR_ATTN_OUT_A,                             "blk.%d.attn_output_a" },
+    { LLM_TENSOR_ATTN_OUT_B,                             "blk.%d.attn_output_b" },
+    { LLM_TENSOR_ATTN_COMPRESSOR_APE,                    "blk.%d.attn_compressor_ape" },
+    { LLM_TENSOR_ATTN_COMPRESSOR_KV,                     "blk.%d.attn_compressor_kv" },
+    { LLM_TENSOR_ATTN_COMPRESSOR_GATE,                   "blk.%d.attn_compressor_gate" },
+    { LLM_TENSOR_ATTN_COMPRESSOR_NORM,                   "blk.%d.attn_compressor_norm" },
     { LLM_TENSOR_PER_LAYER_TOKEN_EMBD,                   "per_layer_token_embd" },
     { LLM_TENSOR_PER_LAYER_MODEL_PROJ,                   "per_layer_model_proj" },
     { LLM_TENSOR_PER_LAYER_PROJ_NORM,                    "per_layer_proj_norm" },
@@ -548,6 +567,17 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_INDEXER_PROJ,                           "blk.%d.indexer.proj" },
     { LLM_TENSOR_INDEXER_ATTN_K,                         "blk.%d.indexer.attn_k" },
     { LLM_TENSOR_INDEXER_ATTN_Q_B,                       "blk.%d.indexer.attn_q_b" },
+    { LLM_TENSOR_INDEXER_COMPRESSOR_APE,                 "blk.%d.indexer_compressor_ape" },
+    { LLM_TENSOR_INDEXER_COMPRESSOR_KV,                  "blk.%d.indexer_compressor_kv" },
+    { LLM_TENSOR_INDEXER_COMPRESSOR_GATE,                "blk.%d.indexer_compressor_gate" },
+    { LLM_TENSOR_INDEXER_COMPRESSOR_NORM,                "blk.%d.indexer_compressor_norm" },
+    { LLM_TENSOR_HC_ATTN_BASE,                           "blk.%d.hc_attn_base" },
+    { LLM_TENSOR_HC_ATTN_FN,                             "blk.%d.hc_attn_fn" },
+    { LLM_TENSOR_HC_ATTN_SCALE,                          "blk.%d.hc_attn_scale" },
+    { LLM_TENSOR_HC_FFN_BASE,                            "blk.%d.hc_ffn_base" },
+    { LLM_TENSOR_HC_FFN_FN,                              "blk.%d.hc_ffn_fn" },
+    { LLM_TENSOR_HC_FFN_SCALE,                           "blk.%d.hc_ffn_scale" },
+    { LLM_TENSOR_FFN_GATE_TID2EID,                       "blk.%d.ffn_gate_tid2eid" },
 };
 
 // declare information about the model weight tensors:
@@ -566,6 +596,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_TOKEN_TYPES,                {LLM_TENSOR_LAYER_INPUT,     GGML_OP_GET_ROWS}},
     {LLM_TENSOR_TOKEN_EMBD_NORM,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},  // do the norms on the first layer (not the input layer)
     {LLM_TENSOR_OUTPUT,                     {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_OUTPUT_HC_BASE,             {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
+    {LLM_TENSOR_OUTPUT_HC_FN,               {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_OUTPUT_HC_SCALE,            {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_SCALE}},
     {LLM_TENSOR_CLS,                        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_OUT,                    {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CLS_NORM,                   {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
@@ -592,10 +625,15 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_UP_SHEXP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_Q_A,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_Q_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_KV,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_A_MQA,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_KV_B,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_K_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_V_B,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_OUT_A,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_OUT_B,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_COMPRESSOR_KV,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_COMPRESSOR_GATE,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ATTN_SINKS,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
     {LLM_TENSOR_DEC_ATTN_Q,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_DEC_ATTN_K,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -757,6 +795,19 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_INDEXER_PROJ,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_INDEXER_ATTN_K,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_INDEXER_ATTN_Q_B,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_INDEXER_COMPRESSOR_KV,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_INDEXER_COMPRESSOR_GATE,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ATTN_COMPRESSOR_NORM,       {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_INDEXER_COMPRESSOR_NORM,    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_HC_ATTN_BASE,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_HC_ATTN_FN,                 {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_HC_ATTN_SCALE,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
+    {LLM_TENSOR_HC_FFN_BASE,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_HC_FFN_FN,                  {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_HC_FFN_SCALE,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SCALE}},
+    {LLM_TENSOR_FFN_GATE_TID2EID,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_GET_ROWS}},
+    {LLM_TENSOR_ATTN_COMPRESSOR_APE,        {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_INDEXER_COMPRESSOR_APE,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     // NextN/MTP tensors are stored per-block (blk.%d.nextn.*) even though only the
     // last nextn_predict_layers blocks carry them. Classify as LAYER_REPEATING so
     // the model loader doesn't fault on the block index.
@@ -902,6 +953,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
         case LLM_ARCH_OLMO2:
         case LLM_ARCH_OLMOE:
         case LLM_ARCH_DEEPSEEK2:
+        case LLM_ARCH_DEEPSEEK4:
         case LLM_ARCH_GLM_DSA:
         case LLM_ARCH_BITNET:
         case LLM_ARCH_T5:
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 89cf16cc37cf..a1dcb037c7a2 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -79,6 +79,7 @@ enum llm_arch {
     LLM_ARCH_DEEPSEEK,
     LLM_ARCH_DEEPSEEK2,
     LLM_ARCH_DEEPSEEK2OCR,
+    LLM_ARCH_DEEPSEEK4,
     LLM_ARCH_CHATGLM,
     LLM_ARCH_GLM4,
     LLM_ARCH_GLM4_MOE,
@@ -213,6 +214,10 @@ enum llm_kv {
     LLM_KV_TOKEN_SHIFT_COUNT,
     LLM_KV_INTERLEAVE_MOE_LAYER_STEP,
     LLM_KV_FULL_ATTENTION_INTERVAL,
+    LLM_KV_HASH_LAYER_COUNT,
+    LLM_KV_HYPER_CONNECTION_COUNT,
+    LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS,
+    LLM_KV_HYPER_CONNECTION_EPS,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -247,6 +252,10 @@ enum llm_kv {
     LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,
     LLM_KV_ATTENTION_INDEXER_TOP_K,
     LLM_KV_ATTENTION_SHARED_KV_LAYERS,
+    LLM_KV_ATTENTION_COMPRESS_RATIOS,
+    LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE,
+    LLM_KV_ATTENTION_OUTPUT_LORA_RANK,
+    LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT,
 
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_COUNT_SWA,
@@ -354,6 +363,9 @@ enum llm_tensor {
     LLM_TENSOR_DENSE_2_OUT,
     LLM_TENSOR_DENSE_3_OUT,
     LLM_TENSOR_OUTPUT,
+    LLM_TENSOR_OUTPUT_HC_BASE,
+    LLM_TENSOR_OUTPUT_HC_FN,
+    LLM_TENSOR_OUTPUT_HC_SCALE,
     LLM_TENSOR_OUTPUT_NORM,
     LLM_TENSOR_OUTPUT_NORM_LFM2, // fix for wrong tensor name
     LLM_TENSOR_ROPE_FREQS,
@@ -482,12 +494,19 @@ enum llm_tensor {
     LLM_TENSOR_CHANNEL_MIX_VALUE,
     LLM_TENSOR_ATTN_Q_A,
     LLM_TENSOR_ATTN_Q_B,
+    LLM_TENSOR_ATTN_KV,
     LLM_TENSOR_ATTN_KV_A_MQA,
     LLM_TENSOR_ATTN_KV_B,
     LLM_TENSOR_ATTN_K_B,
     LLM_TENSOR_ATTN_V_B,
+    LLM_TENSOR_ATTN_OUT_A,
+    LLM_TENSOR_ATTN_OUT_B,
     LLM_TENSOR_ATTN_Q_A_NORM,
     LLM_TENSOR_ATTN_KV_A_NORM,
+    LLM_TENSOR_ATTN_COMPRESSOR_APE,
+    LLM_TENSOR_ATTN_COMPRESSOR_KV,
+    LLM_TENSOR_ATTN_COMPRESSOR_GATE,
+    LLM_TENSOR_ATTN_COMPRESSOR_NORM,
     LLM_TENSOR_ATTN_SUB_NORM,
     LLM_TENSOR_FFN_SUB_NORM,
     LLM_TENSOR_DEC_ATTN_NORM,
@@ -549,6 +568,17 @@ enum llm_tensor {
     LLM_TENSOR_INDEXER_PROJ,
     LLM_TENSOR_INDEXER_ATTN_K,
     LLM_TENSOR_INDEXER_ATTN_Q_B,
+    LLM_TENSOR_INDEXER_COMPRESSOR_APE,
+    LLM_TENSOR_INDEXER_COMPRESSOR_KV,
+    LLM_TENSOR_INDEXER_COMPRESSOR_GATE,
+    LLM_TENSOR_INDEXER_COMPRESSOR_NORM,
+    LLM_TENSOR_HC_ATTN_BASE,
+    LLM_TENSOR_HC_ATTN_FN,
+    LLM_TENSOR_HC_ATTN_SCALE,
+    LLM_TENSOR_HC_FFN_BASE,
+    LLM_TENSOR_HC_FFN_FN,
+    LLM_TENSOR_HC_FFN_SCALE,
+    LLM_TENSOR_FFN_GATE_TID2EID,
     LLM_TENSOR_NEXTN_EH_PROJ,
     LLM_TENSOR_NEXTN_EMBED_TOKENS,
     LLM_TENSOR_NEXTN_ENORM,
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index d62abc4009b8..b1b6aa350735 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -420,7 +420,7 @@ void llama_context::sched_reserve() {
 
     const int64_t t_start_us = ggml_time_us();
 
-    const uint32_t n_seqs = cparams.n_seq_max;
+    const uint32_t n_seqs = model.arch == LLM_ARCH_DEEPSEEK4 ? 1 : cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
 
     const size_t max_nodes = this->graph_max_nodes(n_tokens);
@@ -596,6 +596,22 @@ void llama_context::sched_reserve() {
         n_nodes_pp  = ggml_graph_n_nodes(gf);
     }
 
+    // DeepSeek V4 resumed-prompt chunks use the compressed-attention decode
+    // graph, which is larger than the position-zero prefill graph.
+    if (model.arch == LLM_ARCH_DEEPSEEK4 && n_tokens > 1) {
+        const llama_pos reserve_pos0 = std::min<llama_pos>(
+                cparams.n_ctx > n_tokens ? cparams.n_ctx - n_tokens : n_tokens,
+                std::max<uint32_t>(cparams.n_batch, 8u*n_tokens));
+        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(),
+                model.hparams.no_alloc, nullptr, reserve_pos0);
+        if (!gf) {
+            throw std::runtime_error("failed to allocate DeepSeek V4 resumed pp buffers");
+        }
+
+        n_splits_pp = std::max(n_splits_pp, ggml_backend_sched_get_n_splits(sched.get()));
+        n_nodes_pp  = std::max(n_nodes_pp,  ggml_graph_n_nodes(gf));
+    }
+
     // reserve with tg (token generation) graph to get the number of splits and nodes
     {
         auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc);
@@ -2171,6 +2187,15 @@ uint32_t llama_context::graph_max_nodes(uint32_t n_tokens) const {
     if (model.arch == LLM_ARCH_QWEN3NEXT || model.arch == LLM_ARCH_KIMI_LINEAR || model.arch == LLM_ARCH_QWEN35 || model.arch == LLM_ARCH_QWEN35MOE) {
         return std::max<uint32_t>(n_tokens * 40, 32u * model.n_tensors());
     }
+    if (model.arch == LLM_ARCH_DEEPSEEK4) {
+        // DeepSeek V4 has a position-dependent compressed-attention decode path
+        // that creates many temporary tensor objects, especially when a long
+        // prompt is split into non-prefill ubatches. The visible graph node
+        // count is much smaller than the number of GGML objects allocated while
+        // building those graphs, so reserve a larger metadata arena than the
+        // generic tensor-count heuristic would provide.
+        return std::max<uint32_t>(524288u, n_tokens * 192 + 64u * model.n_tensors());
+    }
     uint32_t res = std::max<uint32_t>(1024u, 8u*model.n_tensors());
     for (const auto & lora : model.loras) {
         res += lora->get_n_nodes();
@@ -2183,7 +2208,7 @@ llm_graph_result * llama_context::get_gf_res_reserve() const {
 }
 
 ggml_cgraph * llama_context::graph_reserve(
-        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) {
+        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes, llama_pos pos0) {
     LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs);
     GGML_ASSERT(n_outputs >= 1);
 
@@ -2207,6 +2232,14 @@ ggml_cgraph * llama_context::graph_reserve(
 
     llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
     llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);
+    if (pos0 != 0 && ubatch.pos != nullptr) {
+        for (uint32_t i = 0; i < ubatch.n_tokens; ++i) {
+            ubatch.pos[i*ubatch.n_pos] = pos0 + i;
+            for (uint32_t j = 1; j < ubatch.n_pos; ++j) {
+                ubatch.pos[i*ubatch.n_pos + j] = 0;
+            }
+        }
+    }
 
     // set one output token per sequence in order to activate all backend samplers
     std::vector<llama_seq_id> seq_ids(n_seqs);
@@ -3357,6 +3390,29 @@ llama_context * llama_init_from_model(
         params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_DISABLED;
     }
 
+    // V4 (DeepSeek4) requires fp16 KV cache: V4's standard SWA K cache,
+    // compressed-attention K cache (cache.attn_k), and indexer K cache
+    // (cache.index_k) all share the same `type_k` and must agree in dtype
+    // because src/models/deepseek4.cpp concatenates the SWA K view with the
+    // compressed K view via ggml_concat (which asserts a->type == b->type).
+    // Furthermore, V4's K activations are post-fp8-quantized
+    // (ggml_dsv4_fp8_kv_quantize), and q8_0's single fp16 scale per 32-element
+    // block cannot faithfully reproduce fp8-quantized value distributions --
+    // pinning to q8_0 corrupts decode silently ("=" loops, "Mirror ..."
+    // garbage). Coerce here, before the SPLIT_MODE_TENSOR / FA / V-quant
+    // shared validations below and before the constructor's flash_attn check,
+    // so those validations see the effective fp16 types and won't reject V4
+    // requests with --cache-type-k|v q8_0. See
+    // docs/plans/v4-port-kv-q8-completion.md.
+    if (model->arch == LLM_ARCH_DEEPSEEK4) {
+        if (params.type_k != GGML_TYPE_F16 || params.type_v != GGML_TYPE_F16) {
+            LLAMA_LOG_WARN("DeepSeek4: forcing fp16 KV cache (--cache-type-k|v are ignored for V4 because compressed/indexer K caches require fp16; "
+                           "see docs/plans/v4-port-kv-q8-completion.md)\n");
+            params.type_k = GGML_TYPE_F16;
+            params.type_v = GGML_TYPE_F16;
+        }
+    }
+
     if (model->split_mode() == LLAMA_SPLIT_MODE_TENSOR) {
         if (params.flash_attn_type == LLAMA_FLASH_ATTN_TYPE_AUTO) {
             LLAMA_LOG_INFO("%s: enabling flash_attn since it is required for SPLIT_MODE_TENSOR\n", __func__);
diff --git a/src/llama-context.h b/src/llama-context.h
index e16ac4c618ba..999ba5a800c5 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -240,7 +240,8 @@ struct llama_context {
 
     // reserve a graph with a dummy ubatch of the specified size
     ggml_cgraph * graph_reserve(
-        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr);
+        uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr,
+        llama_pos pos0 = 0);
 
     bool set_sampler(llama_seq_id seq_id, llama_sampler * sampler);
 
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 858c297dd762..bbb74a0661b4 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -500,29 +500,41 @@ bool llm_graph_input_attn_k::can_reuse(const llm_graph_params & params) {
 }
 
 void llm_graph_input_attn_kv_iswa::set_input(const llama_ubatch * ubatch) {
-    mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
-    mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+    if (self_k_idxs && self_k_idxs->buffer) { // each input is filled only if the tensor exists and has a buffer assigned
+        mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
+    }
+    if (self_v_idxs && self_v_idxs->buffer) { // V indices are guarded independently of the K indices
+        mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
+    }
 
-    mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    if (self_kq_mask && self_kq_mask->buffer) { // base (non-SWA) attention mask
+        mctx->get_base()->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
+    }
 
-    mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
-    mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+    if (self_k_idxs_swa && self_k_idxs_swa->buffer) { // same guards, applied to the SWA cache inputs
+        mctx->get_swa()->set_input_k_idxs(self_k_idxs_swa, ubatch);
+    }
+    if (self_v_idxs_swa && self_v_idxs_swa->buffer) {
+        mctx->get_swa()->set_input_v_idxs(self_v_idxs_swa, ubatch);
+    }
 
-    mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+    if (self_kq_mask_swa && self_kq_mask_swa->buffer) { // SWA attention mask
+        mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
+    }
 
-    if (self_k_rot) {
+    if (self_k_rot && self_k_rot->buffer) { // rotation inputs now also require an assigned buffer, not just a non-null tensor
         mctx->get_base()->set_input_k_rot(self_k_rot);
     }
 
-    if (self_v_rot) {
+    if (self_v_rot && self_v_rot->buffer) {
         mctx->get_base()->set_input_v_rot(self_v_rot);
     }
 
-    if (self_k_rot_swa) {
+    if (self_k_rot_swa && self_k_rot_swa->buffer) {
         mctx->get_swa()->set_input_k_rot(self_k_rot_swa);
     }
 
-    if (self_v_rot_swa) {
+    if (self_v_rot_swa && self_v_rot_swa->buffer) {
         mctx->get_swa()->set_input_v_rot(self_v_rot_swa);
     }
 }
@@ -534,14 +546,19 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
 
     bool res = true;
 
-    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
-  //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+    if (self_k_idxs && self_k_idxs->buffer) {
+        res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
+      //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+
+        res &= can_reuse_kq_mask(self_kq_mask, mctx->get_base(), params.ubatch, params.cparams);
+    }
 
-    res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
-  //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
+    if (self_k_idxs_swa && self_k_idxs_swa->buffer) {
+        res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
+      //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there
 
-    res &= can_reuse_kq_mask(self_kq_mask,     mctx->get_base(), params.ubatch, params.cparams);
-    res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(),  params.ubatch, params.cparams);
+        res &= can_reuse_kq_mask(self_kq_mask_swa, mctx->get_swa(), params.ubatch, params.cparams);
+    }
 
     return res;
 }
@@ -591,7 +608,7 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
 
     const int64_t n_rs = mctx->get_recr()->get_n_rs();
 
-    if (inp_rs->s_copy) {
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
         GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
         int32_t * data = (int32_t *) inp_rs->s_copy->data;
 
@@ -614,10 +631,12 @@ bool llm_graph_input_mem_hybrid::can_reuse(const llm_graph_params & params) {
 
     res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
 
-    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+        res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
 
-    res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
-    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+    }
 
     res &= inp_rs->head == mctx->get_recr()->get_head();
     res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -635,7 +654,7 @@ void llm_graph_input_mem_hybrid_k::set_input(const llama_ubatch * ubatch) {
 
     const int64_t n_rs = mctx->get_recr()->get_n_rs();
 
-    if (inp_rs->s_copy) {
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
         GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
         int32_t * data = (int32_t *) inp_rs->s_copy->data;
 
@@ -657,10 +676,12 @@ bool llm_graph_input_mem_hybrid_k::can_reuse(const llm_graph_params & params) {
 
     res &= can_reuse_kq_mask(inp_attn->self_kq_mask, mctx->get_attn(), params.ubatch, params.cparams);
 
-    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+        res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
 
-    res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
-    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+    }
 
     res &= inp_rs->head == mctx->get_recr()->get_head();
     res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -674,38 +695,46 @@ void llm_graph_input_mem_hybrid_iswa::set_input(const llama_ubatch * ubatch) {
     // base tensors may not be allocated if there are no non-SWA attention layers
     if (inp_attn->self_k_idxs && inp_attn->self_k_idxs->buffer) {
         attn_ctx->get_base()->set_input_k_idxs(inp_attn->self_k_idxs, ubatch);
-        attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+        if (inp_attn->self_v_idxs && inp_attn->self_v_idxs->buffer) {
+            attn_ctx->get_base()->set_input_v_idxs(inp_attn->self_v_idxs, ubatch);
+        }
 
-        attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+        if (inp_attn->self_kq_mask && inp_attn->self_kq_mask->buffer) {
+            attn_ctx->get_base()->set_input_kq_mask(inp_attn->self_kq_mask, ubatch, cparams.causal_attn);
+        }
     }
 
     // swa tensors may not be allocated if there are no SWA attention layers
     if (inp_attn->self_k_idxs_swa && inp_attn->self_k_idxs_swa->buffer) {
         attn_ctx->get_swa()->set_input_k_idxs(inp_attn->self_k_idxs_swa, ubatch);
-        attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+        if (inp_attn->self_v_idxs_swa && inp_attn->self_v_idxs_swa->buffer) {
+            attn_ctx->get_swa()->set_input_v_idxs(inp_attn->self_v_idxs_swa, ubatch);
+        }
 
-        attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+        if (inp_attn->self_kq_mask_swa && inp_attn->self_kq_mask_swa->buffer) {
+            attn_ctx->get_swa()->set_input_kq_mask(inp_attn->self_kq_mask_swa, ubatch, cparams.causal_attn);
+        }
     }
 
-    if (inp_attn->self_k_rot) {
+    if (inp_attn->self_k_rot && inp_attn->self_k_rot->buffer) {
         attn_ctx->get_base()->set_input_k_rot(inp_attn->self_k_rot);
     }
 
-    if (inp_attn->self_v_rot) {
+    if (inp_attn->self_v_rot && inp_attn->self_v_rot->buffer) {
         attn_ctx->get_base()->set_input_v_rot(inp_attn->self_v_rot);
     }
 
-    if (inp_attn->self_k_rot_swa) {
+    if (inp_attn->self_k_rot_swa && inp_attn->self_k_rot_swa->buffer) {
         attn_ctx->get_swa()->set_input_k_rot(inp_attn->self_k_rot_swa);
     }
 
-    if (inp_attn->self_v_rot_swa) {
+    if (inp_attn->self_v_rot_swa && inp_attn->self_v_rot_swa->buffer) {
         attn_ctx->get_swa()->set_input_v_rot(inp_attn->self_v_rot_swa);
     }
 
     const int64_t n_rs = mctx->get_recr()->get_n_rs();
 
-    if (inp_rs->s_copy) {
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
         GGML_ASSERT(ggml_backend_buffer_is_host(inp_rs->s_copy->buffer));
         int32_t * data = (int32_t *) inp_rs->s_copy->data;
 
@@ -741,10 +770,12 @@ bool llm_graph_input_mem_hybrid_iswa::can_reuse(const llm_graph_params & params)
         res &= can_reuse_kq_mask(inp_attn->self_kq_mask_swa, attn_ctx->get_swa(), params.ubatch, params.cparams);
     }
 
-    res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
+    if (inp_rs->s_copy && inp_rs->s_copy->buffer) {
+        res &= inp_rs->s_copy->ne[0] == mctx->get_recr()->get_n_rs();
 
-    res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
-    res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_main->ne[0]  == params.ubatch.n_seqs;
+        res &= inp_rs->s_copy_extra->ne[0] == mctx->get_recr()->get_n_rs() - params.ubatch.n_seqs;
+    }
 
     res &= inp_rs->head == mctx->get_recr()->get_head();
     res &= inp_rs->rs_z == mctx->get_recr()->get_rs_z();
@@ -1325,7 +1356,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
          ggml_tensor * gate_up_exps,
          ggml_tensor * up_exps_s,
          ggml_tensor * gate_exps_s,
-         ggml_tensor * down_exps_s) const {
+         ggml_tensor * down_exps_s,
+         ggml_tensor * selected_experts_in) const {
     return build_moe_ffn(
         cur,
         gate_inp,  /* gate_inp_b  */ nullptr,
@@ -1345,7 +1377,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         /* gate_up_exps_b */ nullptr,
         up_exps_s,
         gate_exps_s,
-        down_exps_s
+        down_exps_s,
+        selected_experts_in
     );
 }
 
@@ -1372,10 +1405,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
          ggml_tensor * gate_up_exps_b,
          ggml_tensor * up_exps_s,
          ggml_tensor * gate_exps_s,
-         ggml_tensor * down_exps_s) const {
+         ggml_tensor * down_exps_s,
+         ggml_tensor * selected_experts_in) const {
     const int64_t n_embd   = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
     const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
+    const bool weight_before_down = arch == LLM_ARCH_DEEPSEEK4; // DeepSeek V4 applies routed weights after SwiGLU and before w2
 
     ggml_tensor * logits = nullptr;
 
@@ -1401,6 +1436,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             {
                 probs = ggml_sigmoid(ctx0, logits); // [n_expert, n_tokens]
             } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS:
+            {
+                probs = ggml_sqrt(ctx0, ggml_softplus(ctx0, logits)); // [n_expert, n_tokens]
+            } break;
         case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT:
             {
                 probs = logits; // [n_expert, n_tokens]
@@ -1455,8 +1494,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // select experts
-    ggml_tensor * selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
-    cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    ggml_tensor * selected_experts = selected_experts_in;
+    if (selected_experts == nullptr) {
+        selected_experts = ggml_argsort_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
+        cb(selected_experts->src[0], "ffn_moe_argsort", il);
+    }
     cb(selected_experts, "ffn_moe_topk", il);
 
     if (arch == LLM_ARCH_GROVEMOE && n_expert != hparams.n_expert) {
@@ -1584,6 +1626,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     switch (type_op) {
         case LLM_FFN_SILU:
             if (gate_exps) {
+                if (arch == LLM_ARCH_DEEPSEEK4 && il >= 0) {
+                    const float limit = hparams.swiglu_clamp_exp[il];
+                    constexpr float eps = 1e-6f;
+                    if (limit > eps) {
+                        cur = ggml_clamp(ctx0, cur, -INFINITY, limit);
+                        cb(cur, "ffn_moe_gate_clamped", il);
+
+                        ggml_tensor * gate_act = ggml_silu(ctx0, cur);
+                        cb(gate_act, "ffn_moe_silu", il);
+
+                        up = ggml_clamp(ctx0, up, -limit, limit);
+                        cb(up, "ffn_moe_up_clamped", il);
+
+                        cur = ggml_mul(ctx0, gate_act, up);
+                        cb(cur, "ffn_moe_swiglu_limited", il);
+                        break;
+                    }
+                }
+
                 // Step35: per-layer clamp for routed experts
                 if (arch == LLM_ARCH_STEP35 && il >= 0) {
                     const float limit = hparams.swiglu_clamp_exp[il];
@@ -1648,6 +1709,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             GGML_ABORT("fatal error");
     }
 
+    if (weight_before_down) {
+        cur = ggml_mul(ctx0, cur, weights);
+        cb(cur, "ffn_moe_weighted_swiglu", il);
+    }
+
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
@@ -1665,7 +1731,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(experts, "ffn_moe_down_scaled", il);
     }
 
-    if (!weight_before_ffn) {
+    if (!weight_before_ffn && !weight_before_down) {
         experts = ggml_mul(ctx0, experts, weights);
         cb(experts, "ffn_moe_weighted", il);
     }
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 9e55d0a675e0..260334f7302f 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -849,7 +849,8 @@ struct llm_graph_context {
              ggml_tensor * gate_up_exps = nullptr,
              ggml_tensor * up_exps_s = nullptr,
              ggml_tensor * gate_exps_s = nullptr,
-             ggml_tensor * down_exps_s = nullptr) const;
+             ggml_tensor * down_exps_s = nullptr,
+             ggml_tensor * selected_experts_in = nullptr) const;
 
     ggml_tensor * build_moe_ffn(
              ggml_tensor * cur,
@@ -874,7 +875,8 @@ struct llm_graph_context {
              ggml_tensor * gate_up_exps_b = nullptr,
              ggml_tensor * up_exps_s = nullptr,
              ggml_tensor * gate_exps_s = nullptr,
-             ggml_tensor * down_exps_s = nullptr) const;
+             ggml_tensor * down_exps_s = nullptr,
+             ggml_tensor * selected_experts_in = nullptr) const;
 
     //
     // inputs
diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp
index 2239309c8fb4..44eaf501f7dc 100644
--- a/src/llama-hparams.cpp
+++ b/src/llama-hparams.cpp
@@ -153,6 +153,10 @@ uint32_t llama_hparams::n_embd_v_gqa_max() const {
 }
 
 uint32_t llama_hparams::n_embd_r() const {
+    if (dsv4_state_size != 0) {
+        return dsv4_state_size;
+    }
+
     if (wkv_head_size != 0) {
         // for RWKV models
         return token_shift_count * n_embd;
@@ -177,6 +181,10 @@ uint32_t llama_hparams::n_embd_r() const {
 }
 
 uint32_t llama_hparams::n_embd_s() const {
+    if (dsv4_state_size != 0) {
+        return dsv4_state_size;
+    }
+
     if (wkv_head_size != 0) {
         // corresponds to RWKV's wkv_states size
         return n_embd * wkv_head_size;
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index e2d051edc6cd..3a0438283e77 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -14,6 +14,7 @@ enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX        = 1,
     LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID        = 2,
     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX_WEIGHT = 3, // applied to the router weights instead of the logits
+    LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS   = 4,
 };
 
 enum llama_swa_type {
@@ -75,6 +76,8 @@ struct llama_hparams {
     uint32_t n_layer_dense_lead = 0;
     uint32_t n_lora_q           = 0;
     uint32_t n_lora_kv          = 0;
+    uint32_t n_lora_o           = 0;
+    uint32_t n_attn_out_groups  = 0;
     uint32_t n_ff_exp           = 0;
     uint32_t n_ff_shexp         = 0;
     uint32_t n_ff_chexp         = 0;
@@ -91,6 +94,7 @@ struct llama_hparams {
     uint32_t moe_every_n_layers   = 0;
     uint32_t moe_latent_size      = 0;
     uint32_t nextn_predict_layers = 0;
+    uint32_t n_hash_layers        = 0;
 
     bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)
 
@@ -211,6 +215,14 @@ struct llama_hparams {
     uint32_t indexer_head_size = 0;
     uint32_t indexer_top_k     = 0;
 
+    // DeepSeek V4 hyper-connections and sparse KV compression
+    uint32_t n_hc                    = 1;
+    uint32_t hc_sinkhorn_iters       = 0;
+    float    hc_eps                  = 0.0f;
+    float    compress_rope_freq_base = 0.0f;
+    uint32_t dsv4_state_size         = 0;
+    std::array<uint32_t, LLAMA_MAX_LAYERS> attn_compress_ratio;
+
     // qwen3vl deepstack
     uint32_t n_deepstack_layers = 0;
 
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
index 26e2cb4270b0..9b9f17903637 100644
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@@ -60,14 +60,14 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
     LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
 
     kv_base = std::make_unique<llama_kv_cache>(
-            model, type_k, type_v,
+            model, hparams, type_k, type_v,
             v_trans, offload, unified, size_base, n_seq_max, n_pad,
             0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);
 
     LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);
 
     kv_swa = std::make_unique<llama_kv_cache>(
-            model, type_k, type_v,
+            model, hparams, type_k, type_v,
             v_trans, offload, unified, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type, filter_swa, reuse);
 }
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index a49a055a6304..92585b671b55 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -79,6 +79,7 @@ static ggml_tensor * ggml_mul_mat_aux(
 
 llama_kv_cache::llama_kv_cache(
         const llama_model & model,
+        const llama_hparams & hparams,
                 ggml_type   type_k,
                 ggml_type   type_v,
                      bool   v_trans,
@@ -91,7 +92,7 @@ llama_kv_cache::llama_kv_cache(
            llama_swa_type   swa_type,
     const layer_filter_cb & filter,
     const  layer_reuse_cb & reuse) :
-    model(model), hparams(model.hparams), v_trans(v_trans),
+    model(model), hparams(hparams), v_trans(v_trans),
     n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad), n_swa(n_swa), swa_type(swa_type) {
 
     GGML_ASSERT(kv_size % n_pad == 0);
@@ -205,7 +206,7 @@ llama_kv_cache::llama_kv_cache(
         }
 
         const bool has_k = true;
-        const bool has_v = !is_mla;
+        const bool has_v = !is_mla && model.arch != LLM_ARCH_DEEPSEEK4;
 
         ggml_tensor * k = has_k ? ggml_new_tensor_3d(ctx, type_k, n_embd_k_gqa, kv_size, n_stream) : nullptr;
         ggml_tensor * v = has_v ? ggml_new_tensor_3d(ctx, type_v, n_embd_v_gqa, kv_size, n_stream) : nullptr;
@@ -253,7 +254,7 @@ llama_kv_cache::llama_kv_cache(
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto & [buft, ctx] : ctx_map) {
         ggml_backend_buffer_t buf;
-        if (model.hparams.no_alloc) {
+        if (hparams.no_alloc) {
             buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer
             for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
                 t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 0b62dc7b2320..0b0a56ce92f4 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -95,6 +95,7 @@ class llama_kv_cache : public llama_memory_i {
 
     llama_kv_cache(
             const llama_model & model,
+            const llama_hparams & hparams,
                     ggml_type   type_k,
                     ggml_type   type_v,
                          bool   v_trans,
diff --git a/src/llama-memory-hybrid-iswa.cpp b/src/llama-memory-hybrid-iswa.cpp
index a59561ea54dd..58dadabc9f62 100644
--- a/src/llama-memory-hybrid-iswa.cpp
+++ b/src/llama-memory-hybrid-iswa.cpp
@@ -1,9 +1,113 @@
 #include "llama-memory-hybrid-iswa.h"
 
+#include "ggml-backend.h"
+
 #include "llama-impl.h"
+#include "llama-io.h"
 #include "llama-model.h"
 #include "llama-context.h"
 
+#include <algorithm>
+#include <cstring>
+#include <limits>
+#include <map>
+#include <stdexcept>
+#include <vector>
+
+namespace {
+
+constexpr uint32_t DSV4_COMPRESSED_KV_STATE_MAGIC   = 0x44535634; // ASCII bytes 'D','S','V','4' (0x44 0x53 0x56 0x34)
+constexpr uint32_t DSV4_COMPRESSED_KV_STATE_VERSION = 1; // NOTE(review): presumably the serialized-state format version - its use is not visible in this hunk, confirm
+constexpr uint32_t DSV4_COMPRESSED_DECODE_UBATCH_MAX = 512; // cap applied to the ubatch size in init_batch() for splits that are not a position-zero first split
+
+struct dsv4_row_range { // range of row indices; size() is end - begin, i.e. `end` is exclusive
+    uint32_t begin = 0; // first row of the range
+    uint32_t end   = 0; // one past the last row of the range
+
+    uint32_t size() const { // number of rows in the range
+        GGML_ASSERT(end >= begin); // guards the unsigned subtraction below against wrap-around
+        return end - begin;
+    }
+};
+
+static dsv4_row_range dsv4_make_row_range(uint32_t n_comp, uint32_t ratio, llama_pos p0, llama_pos p1) { // maps positions [p0, p1) to row indices (position / ratio), clamped to n_comp
+    GGML_ASSERT(ratio > 0); // ratio is the divisor in both row computations below
+
+    if (n_comp == 0) {
+        return {}; // no rows at all -> empty range (begin == end == 0)
+    }
+
+    if (p0 < 0) {
+        p0 = 0; // a negative p0 is treated as "from position 0"
+    }
+    if (p1 < 0) {
+        p1 = std::numeric_limits<llama_pos>::max(); // a negative p1 is treated as "no upper bound"
+    }
+    if (p0 >= p1) {
+        return {}; // empty or inverted position range
+    }
+
+    const uint64_t row_begin = (uint64_t) p0 / ratio; // rounds down: the row that p0 falls into
+    uint64_t row_end;
+    if (p1 == std::numeric_limits<llama_pos>::max()) {
+        row_end = n_comp; // unbounded: extend through the last row
+    } else {
+        row_end = ((uint64_t) p1 + ratio - 1) / ratio; // rounds up, so a row only partially inside [p0, p1) is included
+    }
+
+    return {
+        (uint32_t) std::min<uint64_t>(row_begin, n_comp), // both ends are clamped to n_comp (a uint32_t), so the casts cannot truncate
+        (uint32_t) std::min<uint64_t>(row_end,   n_comp),
+    };
+}
+
+static size_t dsv4_cache_row_size(const ggml_tensor * t) { // returns the byte size of one row (ne[0] elements) of t after checking its strides
+    GGML_ASSERT(t != nullptr);
+
+    const size_t row_size = ggml_row_size(t->type, t->ne[0]);
+    GGML_ASSERT((size_t) t->nb[1] == row_size); // stride between rows must equal the row size (no padding between rows)
+    GGML_ASSERT((size_t) t->nb[2] == row_size*(size_t) t->ne[1]); // stride along dim 2 must equal ne[1] full rows
+
+    return row_size;
+}
+
+static size_t dsv4_cache_offset(const ggml_tensor * t, llama_seq_id seq_id, uint32_t row) { // byte offset of `row` within the dim-2 slice selected by seq_id
+    GGML_ASSERT(seq_id >= 0);
+    GGML_ASSERT(row <= (uint32_t) t->ne[1]); // row == ne[1] is accepted; NOTE(review): seq_id is not checked against ne[2] and t is not null-checked here
+
+    return (size_t) seq_id*(size_t) t->nb[2] + (size_t) row*(size_t) t->nb[1];
+}
+
+static void dsv4_zero_cache_rows(ggml_tensor * t, llama_seq_id seq_id, uint32_t row_start, uint32_t n_rows) { // overwrites n_rows rows of slice seq_id, starting at row_start, with zero bytes
+    if (t == nullptr || n_rows == 0) {
+        return; // no tensor or empty range: nothing to do
+    }
+
+    const size_t row_size = dsv4_cache_row_size(t);
+    const size_t n_bytes  = (size_t) n_rows*row_size;
+    const size_t offset   = dsv4_cache_offset(t, seq_id, row_start);
+
+    std::vector<uint8_t> zeros(n_bytes, 0); // temporary host buffer holding n_bytes zero bytes
+    ggml_backend_tensor_set(t, zeros.data(), offset, n_bytes); // copies the zero bytes into t at `offset`
+}
+
+static void dsv4_copy_cache_rows(ggml_tensor * t, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, uint32_t row_start, uint32_t n_rows) { // copies n_rows rows from slice seq_id_src to the same rows of slice seq_id_dst
+    if (t == nullptr || n_rows == 0 || seq_id_src == seq_id_dst) {
+        return; // no tensor, empty range, or source and destination are the same slice
+    }
+
+    const size_t row_size   = dsv4_cache_row_size(t);
+    const size_t n_bytes    = (size_t) n_rows*row_size;
+    const size_t src_offset = dsv4_cache_offset(t, seq_id_src, row_start);
+    const size_t dst_offset = dsv4_cache_offset(t, seq_id_dst, row_start); // same row_start in both slices
+
+    std::vector<uint8_t> tmp(n_bytes); // the copy is staged through a temporary host buffer
+    ggml_backend_tensor_get(t, tmp.data(), src_offset, n_bytes);
+    ggml_backend_tensor_set(t, tmp.data(), dst_offset, n_bytes);
+}
+
+} // namespace
+
 //
 // llama_memory_hybrid_iswa
 //
@@ -59,9 +163,103 @@ llama_memory_hybrid_iswa::llama_memory_hybrid_iswa(
         filter_recr == nullptr ?
             [&](int32_t il) { return hparams.is_recurrent(il); }
             : filter_recr
-    )) {}
+    )) {
+    if (model.arch != LLM_ARCH_DEEPSEEK4) {
+        return; // everything below is set up only for the DeepSeek4 architecture
+    }
+
+    dsv4_n_seq_max = n_seq_max;
+    dsv4_cache_layers.resize(hparams.n_layer); // one entry per layer; entries of layers with ratio 0 are left untouched by the loop below
+
+    struct ggml_backend_buft_comparator { // orders buffer types by their name string
+        bool operator()(const ggml_backend_buffer_type_t & lhs, const ggml_backend_buffer_type_t & rhs) const {
+            return strcmp(ggml_backend_buft_name(lhs), ggml_backend_buft_name(rhs)) < 0;
+        }
+    };
+    std::map<ggml_backend_buffer_type_t, ggml_context_ptr, ggml_backend_buft_comparator> ctx_map; // one ggml context per distinct buffer-type name
+
+    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * { // returns the context for buft, creating it on first use; nullptr if ggml_init fails
+        auto it = ctx_map.find(buft);
+        if (it == ctx_map.end()) {
+            ggml_init_params params = {
+                /*.mem_size   =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()), // room for two tensors per layer (the loop below creates at most attn_k and index_k)
+                /*.mem_buffer =*/ nullptr,
+                /*.no_alloc   =*/ true, // metadata only; tensor data is allocated per buffer type further down
+            };
+
+            ggml_context * ctx = ggml_init(params);
+            if (!ctx) {
+                return nullptr;
+            }
+
+            ctx_map.emplace(buft, ctx); // the map's ggml_context_ptr takes ownership of ctx
+            return ctx;
+        }
+
+        return it->second.get();
+    };
+
+    for (uint32_t il = 0; il < hparams.n_layer; ++il) {
+        const uint32_t ratio = hparams.attn_compress_ratio[il];
+        if (ratio == 0) {
+            continue; // ratio 0: no compressed cache tensors are created for this layer
+        }
+
+        const uint32_t n_comp = std::max<uint32_t>(1, (kv_size + ratio - 1) / ratio); // ceil(kv_size / ratio) rows, at least 1
+
+        const char * dev_name = "CPU";
+        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type(); // default placement when offload is false
+
+        if (offload) {
+            auto * dev = model.dev_layer(il); // use the buffer type of the device assigned to this layer
+            buft = ggml_backend_dev_buffer_type(dev);
+            dev_name = ggml_backend_dev_name(dev);
+        }
+
+        LLAMA_LOG_DEBUG("%s: DeepSeek4 compressed KV layer %3d: dev = %s, ratio = %u, rows = %u\n",
+                __func__, il, dev_name, ratio, n_comp);
+
+        ggml_context * ctx = ctx_for_buft(buft);
+        if (!ctx) {
+            throw std::runtime_error("failed to create ggml context for DeepSeek4 compressed KV cache");
+        }
+
+        auto & cache = dsv4_cache_layers[il];
+        cache.n_comp = n_comp;
+        cache.attn_k = ggml_new_tensor_3d(ctx, type_k, hparams.n_embd_head_k(il), n_comp, dsv4_n_seq_max); // dims: [n_embd_head_k(il), n_comp, dsv4_n_seq_max]
+        ggml_format_name(cache.attn_k, "cache_dsv4_attn_k_l%d", il);
+
+        if (ratio == 4) { // index_k is created only for layers whose ratio is exactly 4
+            cache.index_k = ggml_new_tensor_3d(ctx, type_k, hparams.indexer_head_size, n_comp, dsv4_n_seq_max); // dims: [indexer_head_size, n_comp, dsv4_n_seq_max]
+            ggml_format_name(cache.index_k, "cache_dsv4_index_k_l%d", il);
+        }
+    }
+
+    for (auto & [buft, ctx] : ctx_map) { // allocate one backend buffer per context created above
+        ggml_backend_buffer_t buf;
+        if (model.hparams.no_alloc) {
+            buf = ggml_backend_buft_alloc_buffer(buft, 0); // zero-size buffer: tensors get a buffer pointer but no data is allocated for them
+            for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) {
+                t->buffer = buf;
+            }
+        } else {
+            buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft);
+        }
+        if (!buf) {
+            throw std::runtime_error("failed to allocate DeepSeek4 compressed KV cache buffer");
+        }
+
+        LLAMA_LOG_INFO("%s: %10s DeepSeek4 compressed KV buffer size = %8.2f MiB\n", __func__,
+                ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+
+        ggml_backend_buffer_clear(buf, 0); // fill the new buffer with zero bytes
+        dsv4_ctxs_bufs.emplace_back(std::move(ctx), buf); // context and buffer ownership move into dsv4_ctxs_bufs
+    }
+}
 
 llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+    const bool dsv4_compressed = has_dsv4_compressed_kv();
+
     do {
         balloc.split_reset();
 
@@ -71,7 +269,23 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_batch(llama_batch_allocr
         while (true) {
             llama_ubatch ubatch;
 
-            if (embd_all) {
+            if (dsv4_compressed) {
+                // DeepSeek V4 compressed attention keeps sequence-local compressor
+                // state and compressed cache rows. Process one sequence set per
+                // ubatch while still allowing multi-sequence batches at the API
+                // level.
+                uint32_t n_ubatch_dsv4 = n_ubatch;
+                const auto & batch = balloc.get_batch();
+                const bool first_split = balloc.get_n_used() == 0;
+                const bool starts_at_zero = batch.pos == nullptr || batch.pos[0] == 0;
+                if (!first_split || !starts_at_zero) {
+                    // Non-prefill compressed-attention chunks build one
+                    // compressor update per token and can otherwise exhaust the
+                    // graph metadata arena on long contexts.
+                    n_ubatch_dsv4 = std::min<uint32_t>(n_ubatch_dsv4, DSV4_COMPRESSED_DECODE_UBATCH_MAX);
+                }
+                ubatch = balloc.split_seq(n_ubatch_dsv4);
+            } else if (embd_all) {
                 // if all tokens are output, split by sequence
                 ubatch = balloc.split_seq(n_ubatch);
             } else {
@@ -128,6 +342,10 @@ llama_memory_context_ptr llama_memory_hybrid_iswa::init_update(llama_context * l
 }
 
 bool llama_memory_hybrid_iswa::get_can_shift() const {
+    if (has_dsv4_compressed_kv()) { // shifting is reported as unsupported whenever compressed rows exist; seq_add() aborts on a non-zero shift in that case
+        return false;
+    }
+
     // Shifting is trivially supported for recurrent
     return mem_attn->get_can_shift();
 }
@@ -135,6 +353,12 @@ bool llama_memory_hybrid_iswa::get_can_shift() const {
 void llama_memory_hybrid_iswa::clear(bool data) {
     mem_attn->clear(data);
     mem_recr->clear(data);
+
+    if (data) { // the DSV4 compressed KV buffers are only zeroed when the caller asks for the data to be cleared
+        for (auto & [_, buf] : dsv4_ctxs_bufs) {
+            ggml_backend_buffer_clear(buf.get(), 0);
+        }
+    }
 }
 
 bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
@@ -143,25 +367,39 @@ bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_p
     if (!mem_recr->seq_rm(seq_id, p0, p1)) {
         return false;
     }
-    return mem_attn->seq_rm(seq_id, p0, p1);
+    if (!mem_attn->seq_rm(seq_id, p0, p1)) {
+        return false;
+    }
+    dsv4_seq_rm(seq_id, p0, p1);
+    return true;
 }
 
 void llama_memory_hybrid_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
     mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
     mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+    dsv4_seq_cp(seq_id_src, seq_id_dst, p0, p1); // returns immediately when there is no DSV4 compressed KV cache or src == dst
 }
 
 void llama_memory_hybrid_iswa::seq_keep(llama_seq_id seq_id) {
     mem_attn->seq_keep(seq_id);
     mem_recr->seq_keep(seq_id);
+    dsv4_seq_keep(seq_id); // returns immediately without a DSV4 compressed KV cache; otherwise clears every other sequence slot
 }
 
 void llama_memory_hybrid_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+    if (has_dsv4_compressed_kv() && shift != 0) { // a real shift is fatal with compressed rows; shift == 0 is still forwarded below
+        GGML_ABORT("DeepSeek V4 compressed KV cache does not support K-shift");
+    }
+
     mem_attn->seq_add(seq_id, p0, p1, shift);
     mem_recr->seq_add(seq_id, p0, p1, shift);
 }
 
 void llama_memory_hybrid_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (has_dsv4_compressed_kv() && d != 1) { // any divisor other than 1 is fatal with compressed rows; d == 1 is still forwarded below
+        GGML_ABORT("DeepSeek V4 compressed KV cache does not support position division");
+    }
+
     mem_attn->seq_div(seq_id, p0, p1, d);
     mem_recr->seq_div(seq_id, p0, p1, d);
 }
@@ -181,17 +419,383 @@ std::map<ggml_backend_buffer_type_t, size_t> llama_memory_hybrid_iswa::memory_br
     for (const auto & buft_size : mem_recr->memory_breakdown()) {
         mb[buft_size.first] += buft_size.second;
     }
+    for (const auto & [_, buf] : dsv4_ctxs_bufs) {
+        mb[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get());
+    }
     return mb;
 }
 
 void llama_memory_hybrid_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) const {
     mem_attn->state_write(io, seq_id, flags);
     mem_recr->state_write(io, seq_id, flags);
+    dsv4_state_write(io, seq_id); // DSV4 section goes last and is not passed flags; state_read() consumes the sections in the same order
 }
 
 void llama_memory_hybrid_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id, llama_state_seq_flags flags) {
     mem_attn->state_read(io, seq_id, flags);
     mem_recr->state_read(io, seq_id, flags);
+    dsv4_state_read(io, seq_id); // read last, mirroring the section order produced by state_write()
+}
+
+void llama_memory_hybrid_iswa::dsv4_seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    if (!has_dsv4_compressed_kv()) { // no compressed rows exist, so there is nothing to remove
+        return;
+    }
+
+    const bool single_seq = seq_id >= 0; // a negative seq_id addresses every sequence slot
+    if (single_seq) {
+        GGML_ASSERT((uint32_t) seq_id < dsv4_n_seq_max);
+    }
+    const uint32_t seq_begin = single_seq ? (uint32_t) seq_id     : 0;
+    const uint32_t seq_end   = single_seq ? (uint32_t) seq_id + 1 : dsv4_n_seq_max;
+    const int32_t  n_layers  = (int32_t) dsv4_cache_layers.size();
+
+    for (uint32_t cur = seq_begin; cur < seq_end; ++cur) {
+        for (int32_t layer_idx = 0; layer_idx < n_layers; ++layer_idx) {
+            dsv4_clear_rows((llama_seq_id) cur, layer_idx, p0, p1);
+        }
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    if (seq_id_src == seq_id_dst || !has_dsv4_compressed_kv()) { // self-copy or no compressed rows: nothing to do
+        return;
+    }
+
+    GGML_ASSERT(seq_id_src >= 0 && (uint32_t) seq_id_src < dsv4_n_seq_max);
+    GGML_ASSERT(seq_id_dst >= 0 && (uint32_t) seq_id_dst < dsv4_n_seq_max);
+    const int32_t n_layers = (int32_t) dsv4_cache_layers.size();
+    for (int32_t layer_idx = 0; layer_idx < n_layers; ++layer_idx) {
+        dsv4_copy_rows(seq_id_src, seq_id_dst, layer_idx, p0, p1);
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_seq_keep(llama_seq_id seq_id) {
+    if (!has_dsv4_compressed_kv()) {
+        return; // no compressed rows to prune
+    }
+
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+
+    // clear the compressed rows of every sequence slot except the one being kept
+    const uint32_t keep = (uint32_t) seq_id;
+    for (uint32_t other = 0; other < dsv4_n_seq_max; ++other) {
+        if (other != keep) {
+            dsv4_clear_seq(other);
+        }
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_clear_seq(llama_seq_id seq_id) {
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+    for (size_t i = 0; i < dsv4_cache_layers.size(); ++i) { // zero rows [0, n_comp) of both caches in every layer
+        const auto & entry = dsv4_cache_layers[i];
+        dsv4_zero_cache_rows(entry.attn_k,  seq_id, 0, entry.n_comp);
+        dsv4_zero_cache_rows(entry.index_k, seq_id, 0, entry.n_comp);
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_clear_rows(llama_seq_id seq_id, int32_t il, llama_pos p0, llama_pos p1) {
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+
+    // a zero compression ratio means there is nothing to clear for this layer
+    const uint32_t compress_ratio = hparams.attn_compress_ratio[il];
+    if (compress_ratio != 0) {
+        const auto & entry = dsv4_cache_layers[il];
+        const auto   rows  = dsv4_make_row_range(entry.n_comp, compress_ratio, p0, p1);
+
+        for (ggml_tensor * cache : { entry.attn_k, entry.index_k }) {
+            dsv4_zero_cache_rows(cache, seq_id, rows.begin, rows.size());
+        }
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_copy_rows(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, int32_t il, llama_pos p0, llama_pos p1) {
+    GGML_ASSERT(seq_id_src >= 0 && (uint32_t) seq_id_src < dsv4_n_seq_max);
+    GGML_ASSERT(seq_id_dst >= 0 && (uint32_t) seq_id_dst < dsv4_n_seq_max);
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+
+    // a zero compression ratio means there is nothing to copy for this layer
+    const uint32_t compress_ratio = hparams.attn_compress_ratio[il];
+    if (compress_ratio != 0) {
+        const auto & entry = dsv4_cache_layers[il];
+        const auto   rows  = dsv4_make_row_range(entry.n_comp, compress_ratio, p0, p1);
+
+        // the same row range is copied in both caches, from the source slot to the destination slot
+        dsv4_copy_cache_rows(entry.attn_k,  seq_id_src, seq_id_dst, rows.begin, rows.size());
+        dsv4_copy_cache_rows(entry.index_k, seq_id_src, seq_id_dst, rows.begin, rows.size());
+    }
+}
+
+uint32_t llama_memory_hybrid_iswa::dsv4_n_state_rows(int32_t il, llama_seq_id seq_id) const {
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+
+    // result is floor((pos_max + 1) / ratio) capped at the layer's n_comp; 0 for ratio == 0 or an empty sequence
+    uint32_t n_state_rows = 0;
+
+    const uint32_t compress_ratio = hparams.attn_compress_ratio[il];
+    if (compress_ratio != 0) {
+        const llama_pos last_pos = mem_attn->seq_pos_max(seq_id);
+        if (last_pos >= 0) {
+            const uint64_t n_tokens = (uint64_t) last_pos + 1;
+            n_state_rows = (uint32_t) std::min<uint64_t>(n_tokens / compress_ratio, dsv4_cache_layers[il].n_comp);
+        }
+    }
+    return n_state_rows;
+}
+
+void llama_memory_hybrid_iswa::dsv4_state_write(llama_io_write_i & io, llama_seq_id seq_id) const { // serialize the compressed KV rows of one sequence, or of every sequence when seq_id == -1
+    if (!has_dsv4_compressed_kv()) {
+        return; // nothing at all is written here; dsv4_state_read() returns early under the same condition
+    }
+
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max));
+
+    std::vector<llama_seq_id> seq_ids; // sequences that will get a data block
+    auto seq_has_rows = [&](llama_seq_id seq) { // true when at least one layer reports a non-zero state row count for seq
+        for (int32_t il = 0; il < (int32_t) dsv4_cache_layers.size(); ++il) {
+            if (dsv4_n_state_rows(il, seq) > 0) {
+                return true;
+            }
+        }
+        return false;
+    };
+
+    if (seq_id >= 0) {
+        if (seq_has_rows(seq_id)) { // a sequence without rows yields n_seq == 0 rather than an empty block
+            seq_ids.push_back(seq_id);
+        }
+    } else {
+        for (uint32_t seq = 0; seq < dsv4_n_seq_max; ++seq) {
+            if (seq_has_rows(seq)) {
+                seq_ids.push_back(seq);
+            }
+        }
+    }
+
+    const uint32_t magic   = DSV4_COMPRESSED_KV_STATE_MAGIC;
+    const uint32_t version = DSV4_COMPRESSED_KV_STATE_VERSION;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_seq   = seq_ids.size();
+
+    io.write(&magic,   sizeof(magic)); // section header: magic, version, n_layer, n_seq (all uint32_t, in this order)
+    io.write(&version, sizeof(version));
+    io.write(&n_layer, sizeof(n_layer));
+    io.write(&n_seq,   sizeof(n_seq));
+
+    for (uint32_t il = 0; il < n_layer; ++il) { // per-layer layout metadata, validated against the local layout by dsv4_state_read()
+        const auto & layer = dsv4_cache_layers[il]; // NOTE(review): assumes dsv4_cache_layers holds hparams.n_layer entries - confirm in the constructor
+
+        const uint32_t n_comp = layer.n_comp;
+        io.write(&n_comp, sizeof(n_comp));
+
+        const uint32_t has_attn = layer.attn_k != nullptr;
+        io.write(&has_attn, sizeof(has_attn));
+        if (has_attn) { // tensor type and row size follow only when the cache tensor exists
+            const int32_t  type_i   = (int32_t) layer.attn_k->type;
+            const uint64_t row_size = dsv4_cache_row_size(layer.attn_k);
+            io.write(&type_i,   sizeof(type_i));
+            io.write(&row_size, sizeof(row_size));
+        }
+
+        const uint32_t has_index = layer.index_k != nullptr;
+        io.write(&has_index, sizeof(has_index));
+        if (has_index) { // same optional pair for the index cache
+            const int32_t  type_i   = (int32_t) layer.index_k->type;
+            const uint64_t row_size = dsv4_cache_row_size(layer.index_k);
+            io.write(&type_i,   sizeof(type_i));
+            io.write(&row_size, sizeof(row_size));
+        }
+    }
+
+    for (llama_seq_id seq : seq_ids) { // one data block per sequence: the sequence id, then the per-layer rows
+        io.write(&seq, sizeof(seq));
+
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const auto & layer = dsv4_cache_layers[il];
+            const uint32_t n_rows = dsv4_n_state_rows(il, seq); // the same row count is used for both caches of the layer
+
+            if (layer.attn_k != nullptr) {
+                const uint64_t row_size = dsv4_cache_row_size(layer.attn_k);
+                io.write(&n_rows, sizeof(n_rows)); // the count is written even when it is 0; the payload only when n_rows > 0
+                if (n_rows > 0) {
+                    io.write_tensor(layer.attn_k, dsv4_cache_offset(layer.attn_k, seq, 0), (size_t) n_rows*row_size); // n_rows rows starting at row 0 of this sequence
+                }
+            }
+
+            if (layer.index_k != nullptr) {
+                const uint64_t row_size = dsv4_cache_row_size(layer.index_k);
+                io.write(&n_rows, sizeof(n_rows));
+                if (n_rows > 0) {
+                    io.write_tensor(layer.index_k, dsv4_cache_offset(layer.index_k, seq, 0), (size_t) n_rows*row_size);
+                }
+            }
+        }
+    }
+}
+
+void llama_memory_hybrid_iswa::dsv4_state_read(llama_io_read_i & io, llama_seq_id seq_id) { // restore the section produced by dsv4_state_write(); throws std::runtime_error on any mismatch
+    if (!has_dsv4_compressed_kv()) {
+        return; // dsv4_state_write() emits nothing in this case, so nothing is consumed
+    }
+
+    GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max));
+
+    uint32_t magic;
+    uint32_t version;
+    uint32_t n_layer;
+    uint32_t n_seq;
+
+    io.read(&magic,   sizeof(magic)); // section header, same field order as the writer
+    io.read(&version, sizeof(version));
+    io.read(&n_layer, sizeof(n_layer));
+    io.read(&n_seq,   sizeof(n_seq));
+
+    if (magic != DSV4_COMPRESSED_KV_STATE_MAGIC) {
+        throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: bad magic");
+    }
+    if (version != DSV4_COMPRESSED_KV_STATE_VERSION) {
+        throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: bad version");
+    }
+    if (n_layer != hparams.n_layer || n_layer != dsv4_cache_layers.size()) { // checked before n_layer is used to index dsv4_cache_layers
+        throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched layer count");
+    }
+
+    struct layer_meta { // per-layer layout as recorded in the serialized state
+        uint32_t n_comp = 0;
+        bool has_attn = false;
+        int32_t attn_type = -1;
+        uint64_t attn_row_size = 0;
+        bool has_index = false;
+        int32_t index_type = -1;
+        uint64_t index_row_size = 0;
+    };
+
+    std::vector<layer_meta> meta(n_layer);
+    for (uint32_t il = 0; il < n_layer; ++il) { // read each layer's metadata and require it to match the local cache layout
+        auto & m = meta[il];
+        const auto & layer = dsv4_cache_layers[il];
+
+        io.read(&m.n_comp, sizeof(m.n_comp));
+
+        uint32_t has_attn;
+        io.read(&has_attn, sizeof(has_attn));
+        m.has_attn = has_attn != 0;
+        if (m.has_attn) { // type and row size are only present when the writer had the tensor
+            io.read(&m.attn_type,     sizeof(m.attn_type));
+            io.read(&m.attn_row_size, sizeof(m.attn_row_size));
+        }
+
+        uint32_t has_index;
+        io.read(&has_index, sizeof(has_index));
+        m.has_index = has_index != 0;
+        if (m.has_index) {
+            io.read(&m.index_type,     sizeof(m.index_type));
+            io.read(&m.index_row_size, sizeof(m.index_row_size));
+        }
+
+        const bool local_has_attn  = layer.attn_k  != nullptr;
+        const bool local_has_index = layer.index_k != nullptr;
+
+        if (m.n_comp != layer.n_comp || m.has_attn != local_has_attn || m.has_index != local_has_index) {
+            throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched layer layout");
+        }
+        if (local_has_attn) {
+            const int32_t  type_i   = (int32_t) layer.attn_k->type;
+            const uint64_t row_size = dsv4_cache_row_size(layer.attn_k);
+            if (m.attn_type != type_i || m.attn_row_size != row_size) {
+                throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched attention cache type");
+            }
+        }
+        if (local_has_index) {
+            const int32_t  type_i   = (int32_t) layer.index_k->type;
+            const uint64_t row_size = dsv4_cache_row_size(layer.index_k);
+            if (m.index_type != type_i || m.index_row_size != row_size) {
+                throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: mismatched index cache type");
+            }
+        }
+    }
+
+    if (seq_id == -1) { // existing contents are dropped only after the header and layer metadata have validated
+        for (auto & [_, buf] : dsv4_ctxs_bufs) {
+            ggml_backend_buffer_clear(buf.get(), 0);
+        }
+    } else {
+        dsv4_clear_seq(seq_id);
+    }
+
+    // scratch buffer for skipping additional blocks in targeted-restore mode
+    std::vector<uint8_t> skip_buf;
+
+    // For targeted restore (seq_id != -1) we follow the public API contract
+    // of llama_state_seq_set_data (declared in llama.h) as exercised by
+    // examples/save-load-state/save-load-state.cpp: the first encountered
+    // serialized block is REMAPPED into the requested destination seq_id.
+    // Subsequent blocks (atypical multi-seq payloads) are skipped to avoid
+    // silently merging multiple source sequences into one destination.
+    bool restored_one = false;
+
+    for (uint32_t is = 0; is < n_seq; ++is) {
+        llama_seq_id src_seq_id;
+        io.read(&src_seq_id, sizeof(src_seq_id));
+
+        const bool skip_block = (seq_id != -1 && restored_one);
+
+        const llama_seq_id dst_seq_id = (seq_id == -1) ? src_seq_id : seq_id; // full restore keeps the stored id, targeted restore remaps to seq_id
+        if (!skip_block && (dst_seq_id < 0 || (uint32_t) dst_seq_id >= dsv4_n_seq_max)) {
+            throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: invalid sequence id");
+        }
+
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            const auto & layer = dsv4_cache_layers[il];
+
+            if (layer.attn_k != nullptr) {
+                uint32_t n_rows;
+                io.read(&n_rows, sizeof(n_rows));
+                if (n_rows > layer.n_comp) { // the bound is enforced for skipped blocks as well
+                    throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: too many attention rows");
+                }
+                if (n_rows > 0) {
+                    const size_t row_size = dsv4_cache_row_size(layer.attn_k);
+                    const size_t nbytes   = (size_t) n_rows * row_size;
+                    if (skip_block) {
+                        // advance io past this block's bytes without restoring
+                        skip_buf.resize(nbytes);
+                        io.read(skip_buf.data(), nbytes);
+                    } else {
+                        io.read_tensor(layer.attn_k,
+                                dsv4_cache_offset(layer.attn_k, dst_seq_id, 0), nbytes);
+                    }
+                }
+            }
+
+            if (layer.index_k != nullptr) {
+                uint32_t n_rows;
+                io.read(&n_rows, sizeof(n_rows));
+                if (n_rows > layer.n_comp) {
+                    throw std::runtime_error("failed to restore DeepSeek V4 compressed KV cache: too many index rows");
+                }
+                if (n_rows > 0) {
+                    const size_t row_size = dsv4_cache_row_size(layer.index_k);
+                    const size_t nbytes   = (size_t) n_rows * row_size;
+                    if (skip_block) {
+                        // advance io past this block's bytes without restoring
+                        skip_buf.resize(nbytes);
+                        io.read(skip_buf.data(), nbytes);
+                    } else {
+                        io.read_tensor(layer.index_k,
+                                dsv4_cache_offset(layer.index_k, dst_seq_id, 0), nbytes);
+                    }
+                }
+            }
+        }
+
+        if (!skip_block && seq_id != -1) { // from now on every further block of a targeted restore is skipped
+            restored_one = true;
+        }
+    }
 }
 
 llama_kv_cache_iswa * llama_memory_hybrid_iswa::get_mem_attn() const {
@@ -202,6 +806,41 @@ llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
     return mem_recr.get();
 }
 
+bool llama_memory_hybrid_iswa::has_dsv4_compressed_kv() const {
+    // the compressed cache counts as present once any layer has a non-zero n_comp
+    bool present = false;
+    for (size_t i = 0; i < dsv4_cache_layers.size() && !present; ++i) {
+        present = dsv4_cache_layers[i].n_comp != 0;
+    }
+
+    return present;
+}
+
+uint32_t llama_memory_hybrid_iswa::get_dsv4_n_comp(int32_t il) const { // n_comp stored for layer il
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size()); // an out-of-range layer index is a programming error
+    return dsv4_cache_layers[il].n_comp;
+}
+
+ggml_tensor * llama_memory_hybrid_iswa::get_dsv4_attn_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const {
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+
+    ggml_tensor * const t = dsv4_cache_layers[il].attn_k;
+    GGML_ASSERT(t != nullptr);
+    const size_t seq_offset = seq_id*t->nb[2]; // offset of this sequence's [ne0, ne1] slice inside the cache tensor
+    return ggml_view_2d(ctx, t, t->ne[0], t->ne[1], t->nb[1], seq_offset);
+}
+
+ggml_tensor * llama_memory_hybrid_iswa::get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const {
+    GGML_ASSERT(il >= 0 && il < (int32_t) dsv4_cache_layers.size());
+    GGML_ASSERT(seq_id >= 0 && (uint32_t) seq_id < dsv4_n_seq_max);
+
+    ggml_tensor * const t = dsv4_cache_layers[il].index_k;
+    GGML_ASSERT(t != nullptr);
+    const size_t seq_offset = seq_id*t->nb[2]; // offset of this sequence's [ne0, ne1] slice inside the cache tensor
+    return ggml_view_2d(ctx, t, t->ne[0], t->ne[1], t->nb[1], seq_offset);
+}
+
 //
 // llama_memory_hybrid_iswa_context
 //
@@ -209,6 +848,7 @@ llama_memory_recurrent * llama_memory_hybrid_iswa::get_mem_recr() const {
 llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_status status) : status(status) {}
 
 llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(llama_memory_hybrid_iswa * mem) :
+    mem(mem),
     ctx_attn(mem->get_mem_attn()->init_full()),
     ctx_recr(mem->get_mem_recr()->init_full()),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
@@ -218,6 +858,7 @@ llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
         llama_memory_hybrid_iswa * mem,
                    llama_context * lctx,
                             bool   optimize) :
+    mem(mem),
     ctx_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
     ctx_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
     status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
@@ -228,6 +869,7 @@ llama_memory_hybrid_iswa_context::llama_memory_hybrid_iswa_context(
                     slot_info_vec_t   sinfos_base,
                     slot_info_vec_t   sinfos_swa,
           std::vector<llama_ubatch>   ubatches) :
+    mem(mem),
     ubatches(std::move(ubatches)),
     // note: here we copy the ubatches. not sure if this is ideal
     ctx_attn(new llama_kv_cache_iswa_context(mem->get_mem_attn(), std::move(sinfos_base), std::move(sinfos_swa), this->ubatches)),
@@ -275,3 +917,22 @@ const llama_kv_cache_iswa_context * llama_memory_hybrid_iswa_context::get_attn()
 const llama_memory_recurrent_context * llama_memory_hybrid_iswa_context::get_recr() const {
     return static_cast<const llama_memory_recurrent_context *>(ctx_recr.get());
 }
+
+bool llama_memory_hybrid_iswa_context::has_dsv4_compressed_kv() const {
+    return mem ? mem->has_dsv4_compressed_kv() : false; // a context without a memory object reports no compressed cache
+}
+
+uint32_t llama_memory_hybrid_iswa_context::get_dsv4_n_comp(int32_t il) const { // forwards to the owning memory object
+    GGML_ASSERT(mem != nullptr); // mem stays null for contexts constructed from a status only
+    return mem->get_dsv4_n_comp(il);
+}
+
+ggml_tensor * llama_memory_hybrid_iswa_context::get_dsv4_attn_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const { // forwards to the owning memory object
+    GGML_ASSERT(mem != nullptr); // mem stays null for contexts constructed from a status only
+    return mem->get_dsv4_attn_k(ctx, il, seq_id);
+}
+
+ggml_tensor * llama_memory_hybrid_iswa_context::get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const { // forwards to the owning memory object
+    GGML_ASSERT(mem != nullptr); // mem stays null for contexts constructed from a status only
+    return mem->get_dsv4_index_k(ctx, il, seq_id);
+}
diff --git a/src/llama-memory-hybrid-iswa.h b/src/llama-memory-hybrid-iswa.h
index c9d3f9f57c50..8d11f081615c 100644
--- a/src/llama-memory-hybrid-iswa.h
+++ b/src/llama-memory-hybrid-iswa.h
@@ -83,11 +83,39 @@ class llama_memory_hybrid_iswa : public llama_memory_i {
     llama_kv_cache_iswa * get_mem_attn() const;
     llama_memory_recurrent * get_mem_recr() const;
 
+    bool has_dsv4_compressed_kv() const;
+    uint32_t get_dsv4_n_comp(int32_t il) const;
+    ggml_tensor * get_dsv4_attn_k (ggml_context * ctx, int32_t il, llama_seq_id seq_id) const;
+    ggml_tensor * get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const;
+
 private:
     const llama_hparams & hparams;
 
     const std::unique_ptr<llama_kv_cache_iswa> mem_attn;
     const std::unique_ptr<llama_memory_recurrent> mem_recr;
+
+    struct dsv4_cache_layer { // per-layer DeepSeek V4 compressed KV cache tensors
+        uint32_t n_comp = 0; // row count of the cache tensors (their dim 1); has_dsv4_compressed_kv() treats any non-zero value as "present"
+        ggml_tensor * attn_k  = nullptr; // created as [n_embd_head_k(il), n_comp, dsv4_n_seq_max]; state read/write treat nullptr as "absent"
+        ggml_tensor * index_k = nullptr; // created as [indexer_head_size, n_comp, dsv4_n_seq_max], only for layers whose compress ratio is 4
+    };
+
+    uint32_t dsv4_n_seq_max = 0; // number of sequence slots (dim 2 of the cache tensors); valid seq ids are [0, dsv4_n_seq_max)
+    std::vector<dsv4_cache_layer> dsv4_cache_layers; // indexed by layer id
+    std::vector<std::pair<ggml_context_ptr, ggml_backend_buffer_ptr>> dsv4_ctxs_bufs; // ggml contexts paired with the backend buffers allocated for them
+
+    void dsv4_seq_rm  (llama_seq_id seq_id, llama_pos p0, llama_pos p1); // seq_id < 0 applies to every sequence slot
+    void dsv4_seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); // no-op when src == dst
+    void dsv4_seq_keep(llama_seq_id seq_id); // clears every sequence slot except seq_id
+
+    void dsv4_clear_seq(llama_seq_id seq_id); // zeroes rows [0, n_comp) of one sequence in every layer
+    void dsv4_clear_rows(llama_seq_id seq_id, int32_t il, llama_pos p0, llama_pos p1); // zeroes the rows that dsv4_make_row_range() selects for [p0, p1)
+    void dsv4_copy_rows (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, int32_t il, llama_pos p0, llama_pos p1); // copies the same row selection between two slots
+
+    uint32_t dsv4_n_state_rows(int32_t il, llama_seq_id seq_id) const; // rows to serialize: floor((seq_pos_max + 1) / ratio), capped at n_comp
+
+    void dsv4_state_write(llama_io_write_i & io, llama_seq_id seq_id) const; // seq_id == -1 writes every sequence that has rows
+    void dsv4_state_read (llama_io_read_i  & io, llama_seq_id seq_id); // throws std::runtime_error when the stored layout does not match
 };
 
 class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
@@ -128,7 +156,14 @@ class llama_memory_hybrid_iswa_context : public llama_memory_context_i {
     const llama_kv_cache_iswa_context * get_attn() const;
     const llama_memory_recurrent_context * get_recr() const;
 
+    bool has_dsv4_compressed_kv() const;
+    uint32_t get_dsv4_n_comp(int32_t il) const;
+    ggml_tensor * get_dsv4_attn_k (ggml_context * ctx, int32_t il, llama_seq_id seq_id) const;
+    ggml_tensor * get_dsv4_index_k(ggml_context * ctx, int32_t il, llama_seq_id seq_id) const;
+
 private:
+    llama_memory_hybrid_iswa * mem = nullptr;
+
     // the index of the next ubatch to process
     size_t i_next = 0;
 
diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
index fd305cab79c2..529022ded18d 100644
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -33,6 +33,7 @@ llama_memory_hybrid::llama_memory_hybrid(
     hparams(model.hparams),
     mem_attn(new llama_kv_cache(
         model,
+        model.hparams,
         type_k,
         type_v,
         v_trans,
diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
index c645d0785ab7..3343790070c4 100644
--- a/src/llama-model-loader.cpp
+++ b/src/llama-model-loader.cpp
@@ -392,6 +392,7 @@ namespace GGUFMeta {
         return get_arr(llm_kv(kid), result, required);
     }
 
+    template bool llama_model_loader::get_arr<std::vector<uint32_t>>(enum llm_kv kid, std::vector<uint32_t> & result, bool required);
     template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
 
     template<typename T>
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 8bf20a716eba..d35fb4db23a2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -170,6 +170,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_deepseek2(params);
         case LLM_ARCH_DEEPSEEK2OCR:
             return new llama_model_deepseek2ocr(params);
+        case LLM_ARCH_DEEPSEEK4:
+            return new llama_model_deepseek4(params);
         case LLM_ARCH_GLM_DSA:
             return new llama_model_glm_dsa(params);
         case LLM_ARCH_MISTRAL4:
@@ -777,6 +779,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_310B_A15B:     return "310B.A15B";
         case LLM_TYPE_355B_A32B:     return "355B.A32B";
         case LLM_TYPE_397B_A17B:     return "397B.A17B";
+        case LLM_TYPE_685B_A37B:     return "685B.A37B";
         case LLM_TYPE_744B_A40B:     return "744B.A40B";
         case LLM_TYPE_E2B:           return "E2B";
         case LLM_TYPE_E4B:           return "E4B";
@@ -1768,6 +1771,27 @@ void llama_model::print_info() const {
             LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
         }
 
+        if (arch == LLM_ARCH_DEEPSEEK4) {
+            LLAMA_LOG_INFO("%s: n_lora_q              = %d\n",     __func__, hparams.n_lora_q);
+            LLAMA_LOG_INFO("%s: n_lora_o              = %d\n",     __func__, hparams.n_lora_o);
+            LLAMA_LOG_INFO("%s: n_attn_out_groups     = %d\n",     __func__, hparams.n_attn_out_groups);
+            LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
+            LLAMA_LOG_INFO("%s: n_expert_shared       = %d\n",     __func__, hparams.n_expert_shared);
+            LLAMA_LOG_INFO("%s: n_swa                 = %d\n",     __func__, hparams.n_swa);
+            LLAMA_LOG_INFO("%s: compress_rope_freq_base = %.1f\n", __func__, hparams.compress_rope_freq_base);
+            LLAMA_LOG_INFO("%s: indexer_n_head        = %d\n",     __func__, hparams.indexer_n_head);
+            LLAMA_LOG_INFO("%s: indexer_head_size     = %d\n",     __func__, hparams.indexer_head_size);
+            LLAMA_LOG_INFO("%s: indexer_top_k         = %d\n",     __func__, hparams.indexer_top_k);
+            LLAMA_LOG_INFO("%s: n_hash_layers         = %d\n",     __func__, hparams.n_hash_layers);
+            LLAMA_LOG_INFO("%s: n_hc                  = %d\n",     __func__, hparams.n_hc);
+            LLAMA_LOG_INFO("%s: hc_sinkhorn_iters     = %d\n",     __func__, hparams.hc_sinkhorn_iters);
+            LLAMA_LOG_INFO("%s: hc_eps                = %.1e\n",   __func__, hparams.hc_eps);
+            LLAMA_LOG_INFO("%s: nextn_predict_layers  = %d\n",     __func__, hparams.nextn_predict_layers);
+            LLAMA_LOG_INFO("%s: expert_weights_scale  = %.1f\n",   __func__, hparams.expert_weights_scale);
+            LLAMA_LOG_INFO("%s: expert_weights_norm   = %d\n",     __func__, hparams.expert_weights_norm);
+            LLAMA_LOG_INFO("%s: expert_gating_func    = %s\n",     __func__, llama_expert_gating_func_name((llama_expert_gating_func_type) hparams.expert_gating_func));
+        }
+
         if (arch == LLM_ARCH_QWEN2MOE) {
             LLAMA_LOG_INFO("%s: n_ff_exp              = %d\n",     __func__, hparams.n_ff_exp);
             LLAMA_LOG_INFO("%s: n_ff_shexp            = %d\n",     __func__, hparams.n_ff_shexp);
@@ -1943,6 +1967,58 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
             {
                 res = nullptr;
             } break;
+        case LLM_ARCH_DEEPSEEK4:
+            {
+                llama_memory_i::layer_filter_cb filter_attn = [&](int32_t) {
+                    return true;
+                };
+                llama_memory_i::layer_filter_cb filter_recr = [&](int32_t il) {
+                    return hparams.attn_compress_ratio[il] != 0;
+                };
+
+                // V4's standard SWA K cache, compressed-attention K cache
+                // (cache.attn_k), and indexer K cache (cache.index_k) all
+                // share the same `type_k` and must agree in dtype because
+                // src/models/deepseek4.cpp concatenates the SWA K view with
+                // the compressed K view via ggml_concat (which asserts
+                // a->type == b->type). Furthermore, V4's K activations are
+                // post-fp8-quantized (ggml_dsv4_fp8_kv_quantize), and q8_0's
+                // single fp16 scale per 32-element block cannot faithfully
+                // reproduce fp8-quantized value distributions -- pinning to
+                // q8_0 corrupts decode silently ("=" loops, "Mirror ..."
+                // garbage). Force fp16 unconditionally for V4 KV caches.
+                //
+                // NOTE: the user-facing WARN and the params.type_k/type_v
+                // coercion already happen earlier in llama_init_from_model
+                // (src/llama-context.cpp), BEFORE the shared
+                // SPLIT_MODE_TENSOR / V-quant-requires-FA validations run,
+                // so users requesting q8_0 KV with V4 don't trip those
+                // checks. The fp16 pin here is a defense-in-depth safety
+                // net for any direct callers of create_memory() that
+                // bypass llama_init_from_model. See
+                // docs/plans/v4-port-kv-q8-completion.md.
+                ggml_type v4_type_k = GGML_TYPE_F16;
+                ggml_type v4_type_v = GGML_TYPE_F16;
+
+                res = new llama_memory_hybrid_iswa(
+                        /* model             */ *this,
+                        /* attn_type_k       */ v4_type_k,
+                        /* attn_type_v       */ v4_type_v,
+                        /* attn_v_trans      */ !cparams.flash_attn,
+                        /* attn_swa_full     */ params.swa_full,
+                        /* attn_kv_size      */ cparams.n_ctx_seq,
+                        /* attn_n_ubatch     */ cparams.n_ubatch,
+                        /* attn_n_pad        */ 1,
+                        /* recurrent_type_r  */ GGML_TYPE_F32,
+                        /* recurrent_type_s  */ GGML_TYPE_F32,
+                        /* recurrent_rs_size */ std::max((uint32_t) 1, cparams.n_seq_max),
+                        /* n_seq_max         */ cparams.n_seq_max,
+                        /* n_rs_seq          */ cparams.n_rs_seq,
+                        /* offload           */ cparams.offload_kqv,
+                        /* unified           */ cparams.kv_unified,
+                        /* filter_attn       */ std::move(filter_attn),
+                        /* filter_recr       */ std::move(filter_recr));
+            } break;
         // Models that need standard caching should rely on recurrent/hybrid
         // checks
         default:
@@ -2069,6 +2145,7 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
 
                         res = new llama_kv_cache(
                                 *this,
+                                hparams,
                                 params.type_k,
                                 params.type_v,
                                 !cparams.flash_attn,
@@ -2258,6 +2335,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DEEPSEEK:
         case LLM_ARCH_DEEPSEEK2:
         case LLM_ARCH_DEEPSEEK2OCR:
+        case LLM_ARCH_DEEPSEEK4:
         case LLM_ARCH_PLM:
         case LLM_ARCH_CHATGLM:
         case LLM_ARCH_GRANITE:
diff --git a/src/llama-model.h b/src/llama-model.h
index 01c87a75271f..fe1ea971a16c 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -137,6 +137,7 @@ enum llm_type {
     LLM_TYPE_310B_A15B, // /MiMo-V2-Flash
     LLM_TYPE_355B_A32B, // GLM-4.5
     LLM_TYPE_397B_A17B, // Qwen3.5
+    LLM_TYPE_685B_A37B, // DeepSeek V4-Flash
     LLM_TYPE_744B_A40B, // GLM-5
     LLM_TYPE_E2B,
     LLM_TYPE_E4B,
@@ -257,6 +258,15 @@ struct llama_layer {
     struct ggml_tensor * wv_enc    = nullptr;
     struct ggml_tensor * wo_enc    = nullptr;
     struct ggml_tensor * wqkv_gate = nullptr;
+    struct ggml_tensor * attn_kv   = nullptr;
+    struct ggml_tensor * attn_wo_a = nullptr;
+    struct ggml_tensor * attn_wo_b = nullptr;
+
+    // DeepSeek V4 KV compressors
+    struct ggml_tensor * attn_compressor_ape  = nullptr;
+    struct ggml_tensor * attn_compressor_kv   = nullptr;
+    struct ggml_tensor * attn_compressor_gate = nullptr;
+    struct ggml_tensor * attn_compressor_norm = nullptr;
 
     // relative position bias
     struct ggml_tensor * attn_rel_b       = nullptr;
@@ -322,6 +332,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_up_b   = nullptr; // b3
     struct ggml_tensor * ffn_act    = nullptr;
     struct ggml_tensor * ffn_exp_probs_b = nullptr;
+    struct ggml_tensor * ffn_gate_tid2eid = nullptr;
 
     // mamba proj
     struct ggml_tensor * ssm_in  = nullptr;
@@ -483,6 +494,18 @@ struct llama_layer {
     struct ggml_tensor * indexer_proj     = nullptr;
     struct ggml_tensor * indexer_attn_k   = nullptr;
     struct ggml_tensor * indexer_attn_q_b = nullptr; // note: for lora a/b, not bias
+    struct ggml_tensor * indexer_compressor_ape  = nullptr;
+    struct ggml_tensor * indexer_compressor_kv   = nullptr;
+    struct ggml_tensor * indexer_compressor_gate = nullptr;
+    struct ggml_tensor * indexer_compressor_norm = nullptr;
+
+    // DeepSeek V4 hyper-connection weights
+    struct ggml_tensor * hc_attn_base  = nullptr;
+    struct ggml_tensor * hc_attn_fn    = nullptr;
+    struct ggml_tensor * hc_attn_scale = nullptr;
+    struct ggml_tensor * hc_ffn_base   = nullptr;
+    struct ggml_tensor * hc_ffn_fn     = nullptr;
+    struct ggml_tensor * hc_ffn_scale  = nullptr;
 
     // gemma4 layer output scale
     struct ggml_tensor * out_scale = nullptr;
@@ -531,6 +554,9 @@ struct llama_model {
     struct ggml_tensor * output_norm_b   = nullptr;
     struct ggml_tensor * output          = nullptr;
     struct ggml_tensor * output_b        = nullptr;
+    struct ggml_tensor * output_hc_base  = nullptr;
+    struct ggml_tensor * output_hc_fn    = nullptr;
+    struct ggml_tensor * output_hc_scale = nullptr;
     struct ggml_tensor * output_norm_enc = nullptr;
 
 
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 43e05c3d56fe..6914df768c1e 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -292,6 +292,14 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
     // quantize only 2D and 3D tensors (experts)
     if (ggml_n_dims(tensor) < 2) return false;
 
+    // do not quantize integer tensors (e.g. DeepSeek V4 ffn_gate_tid2eid which
+    // stores expert-id indices as I32). Quantization makes no sense for non
+    // floating-point data; the dequantize path also explicitly rejects them.
+    if (tensor->type == GGML_TYPE_I8  ||
+        tensor->type == GGML_TYPE_I16 ||
+        tensor->type == GGML_TYPE_I32 ||
+        tensor->type == GGML_TYPE_I64) return false;
+
     const std::string name = ggml_get_name(tensor);
 
     // This used to be a regex, but <regex> has an extreme cost to compile times.
diff --git a/src/models/deepseek4.cpp b/src/models/deepseek4.cpp
new file mode 100644
index 000000000000..71d803014fd6
--- /dev/null
+++ b/src/models/deepseek4.cpp
@@ -0,0 +1,1583 @@
+#include "models.h"
+
+#include "ggml-backend.h"
+#include "llama-kv-cache-iswa.h"
+#include "llama-memory-hybrid-iswa.h"
+#include "llama-memory-recurrent.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+void llama_model_deepseek4::load_arch_hparams(llama_model_loader & ml) { // fills hparams with the DeepSeek V4 values read through ml
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK,       hparams.n_lora_q);
+    ml.get_key(LLM_KV_ATTENTION_OUTPUT_LORA_RANK,  hparams.n_lora_o);
+    ml.get_key(LLM_KV_ATTENTION_OUTPUT_GROUP_COUNT,hparams.n_attn_out_groups);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH,  hparams.n_ff_exp);
+    ml.get_key(LLM_KV_EXPERT_SHARED_COUNT,         hparams.n_expert_shared);
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE,        hparams.expert_weights_scale, false); // NOTE(review): the trailing `false` presumably marks a key as optional - confirm against llama_model_loader::get_key
+    ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM,         hparams.expert_weights_norm, false);
+    ml.get_key(LLM_KV_EXPERT_GATING_FUNC,          hparams.expert_gating_func, false);
+    if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { // no gating function set after the read above
+        hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SQRTSOFTPLUS; // fall back to sqrt-softplus gating
+    }
+
+    ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW,          hparams.n_swa, false);
+    if (hparams.n_swa > 0) { // a sliding window size is set
+        hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
+        hparams.set_swa_pattern(0, false);
+        hparams.rope_freq_base_train_swa  = hparams.rope_freq_base_train; // SWA RoPE parameters mirror the training RoPE parameters
+        hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train;
+    }
+    ml.get_key(LLM_KV_ATTENTION_COMPRESS_ROPE_FREQ_BASE, hparams.compress_rope_freq_base, false);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_HEAD_COUNT,      hparams.indexer_n_head, false);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_KEY_LENGTH,      hparams.indexer_head_size, false);
+    ml.get_key(LLM_KV_ATTENTION_INDEXER_TOP_K,           hparams.indexer_top_k, false);
+    ml.get_key(LLM_KV_HASH_LAYER_COUNT,                  hparams.n_hash_layers);
+    ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS,              hparams.nextn_predict_layers, false);
+    ml.get_key(LLM_KV_HYPER_CONNECTION_COUNT,            hparams.n_hc);
+    ml.get_key(LLM_KV_HYPER_CONNECTION_SINKHORN_ITERS,   hparams.hc_sinkhorn_iters);
+    ml.get_key(LLM_KV_HYPER_CONNECTION_EPS,              hparams.hc_eps);
+    ml.get_key_or_arr(LLM_KV_SWIGLU_CLAMP_EXP,           hparams.swiglu_clamp_exp, hparams.n_layer, false);
+
+    std::vector<uint32_t> compress_ratios;
+    ml.get_arr(LLM_KV_ATTENTION_COMPRESS_RATIOS, compress_ratios);
+    if (compress_ratios.size() < hparams.n_layer) { // only too few entries are rejected; surplus entries are ignored by copy_n below
+        throw std::runtime_error(format("DeepSeek V4 compress ratio count mismatch: got %zu, expected %u",
+                    compress_ratios.size(), hparams.n_layer));
+    }
+    std::copy_n(compress_ratios.begin(), hparams.n_layer, hparams.attn_compress_ratio.begin());
+
+    for (uint32_t il = 0; il < hparams.n_layer; ++il) { // dsv4_state_size becomes the largest per-layer state size
+        const uint32_t ratio = hparams.attn_compress_ratio[il];
+        if (ratio == 0) { // ratio 0 contributes nothing to the state size
+            continue;
+        }
+
+        const uint32_t coff = ratio == 4 ? 2 : 1; // ratio 4 doubles both state dimensions
+        uint32_t state_size = coff * ratio * coff * hparams.n_embd_head_k(il); // (coff*ratio) * (coff*head_dim), the same product dsv4_make_state_layout stores in elems
+        if (ratio == 4) {
+            state_size += coff * ratio * coff * hparams.indexer_head_size; // ratio-4 layers add the same layout sized by indexer_head_size
+        }
+        hparams.dsv4_state_size = std::max(hparams.dsv4_state_size, state_size);
+    }
+
+    type = LLM_TYPE_UNKNOWN; // the model size type is not derived from the hparams here
+}
+
+void llama_model_deepseek4::load_arch_tensors(llama_model_loader &) { // creates all DeepSeek V4 weight tensors through create_tensor
+    LLAMA_LOAD_LOCALS;
+
+    const int64_t q_lora_rank       = hparams.n_lora_q;
+    const int64_t o_lora_rank       = hparams.n_lora_o;
+    const int64_t n_out_groups      = hparams.n_attn_out_groups;
+    const int64_t n_ff_exp          = hparams.n_ff_exp;
+    const int64_t n_expert_shared   = hparams.n_expert_shared;
+    const int64_t n_hc              = hparams.n_hc;
+    const int64_t hc_dim            = n_hc * n_embd; // ne[0] of every *_hc_fn / hc_*_fn weight below
+    const int64_t hc_mix            = (2 + n_hc) * n_hc; // n_hc + n_hc + n_hc*n_hc, the pre/post/comb sizes viewed in dsv4_hc_pre
+
+    if (n_out_groups == 0) { // n_out_groups is used as a divisor in the attn_wo_a shape below
+        throw std::runtime_error("DeepSeek V4 requires attention output groups");
+    }
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    output_norm     = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM,     "weight"), {n_embd}, 0);
+    output          = create_tensor(tn(LLM_TENSOR_OUTPUT,          "weight"), {n_embd, n_vocab}, 0);
+    output_hc_base  = create_tensor(tn(LLM_TENSOR_OUTPUT_HC_BASE,  "weight"), {n_hc}, 0);
+    output_hc_fn    = create_tensor(tn(LLM_TENSOR_OUTPUT_HC_FN,    "weight"), {hc_dim, n_hc}, 0);
+    output_hc_scale = create_tensor(tn(LLM_TENSOR_OUTPUT_HC_SCALE, "weight"), {1}, 0);
+
+    auto create_deepseek4_compressor = [&](llama_layer & layer, int bid, int64_t compress_ratio, int64_t head_size, bool indexer) { // creates the four compressor tensors of one layer
+        const int64_t coff = compress_ratio == 4 ? 2 : 1; // ratio 4 doubles the ape/kv/gate width
+        ggml_tensor *& ape  = indexer ? layer.indexer_compressor_ape  : layer.attn_compressor_ape; // `indexer` selects the indexer_* or the attn_* slots
+        ggml_tensor *& kv   = indexer ? layer.indexer_compressor_kv   : layer.attn_compressor_kv;
+        ggml_tensor *& gate = indexer ? layer.indexer_compressor_gate : layer.attn_compressor_gate;
+        ggml_tensor *& norm = indexer ? layer.indexer_compressor_norm : layer.attn_compressor_norm;
+
+        ape  = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_APE  : LLM_TENSOR_ATTN_COMPRESSOR_APE,  "weight", bid), {coff * head_size, compress_ratio}, 0);
+        kv   = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_KV   : LLM_TENSOR_ATTN_COMPRESSOR_KV,   "weight", bid), {n_embd, coff * head_size}, 0);
+        gate = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_GATE : LLM_TENSOR_ATTN_COMPRESSOR_GATE, "weight", bid), {n_embd, coff * head_size}, 0);
+        norm = create_tensor(tn(indexer ? LLM_TENSOR_INDEXER_COMPRESSOR_NORM : LLM_TENSOR_ATTN_COMPRESSOR_NORM, "weight", bid), {head_size}, 0);
+    };
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        const int64_t compress_ratio = hparams.attn_compress_ratio[i];
+
+        layer.hc_attn_base  = create_tensor(tn(LLM_TENSOR_HC_ATTN_BASE,  "weight", i), {hc_mix}, 0);
+        layer.hc_attn_fn    = create_tensor(tn(LLM_TENSOR_HC_ATTN_FN,    "weight", i), {hc_dim, hc_mix}, 0);
+        layer.hc_attn_scale = create_tensor(tn(LLM_TENSOR_HC_ATTN_SCALE, "weight", i), {3}, 0);
+        layer.hc_ffn_base   = create_tensor(tn(LLM_TENSOR_HC_FFN_BASE,   "weight", i), {hc_mix}, 0);
+        layer.hc_ffn_fn     = create_tensor(tn(LLM_TENSOR_HC_FFN_FN,     "weight", i), {hc_dim, hc_mix}, 0);
+        layer.hc_ffn_scale  = create_tensor(tn(LLM_TENSOR_HC_FFN_SCALE,  "weight", i), {3}, 0);
+
+        layer.attn_norm      = create_tensor(tn(LLM_TENSOR_ATTN_NORM,      "weight", i), {n_embd}, 0);
+        layer.ffn_norm       = create_tensor(tn(LLM_TENSOR_FFN_NORM,       "weight", i), {n_embd}, 0);
+        layer.attn_sinks     = create_tensor(tn(LLM_TENSOR_ATTN_SINKS,     "weight", i), {n_head}, 0);
+        layer.attn_q_a_norm  = create_tensor(tn(LLM_TENSOR_ATTN_Q_A_NORM,  "weight", i), {q_lora_rank}, 0);
+        layer.attn_kv_a_norm = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_NORM, "weight", i), {n_embd_head_k}, 0);
+
+        layer.wq_a    = create_tensor(tn(LLM_TENSOR_ATTN_Q_A,    "weight", i), {n_embd, q_lora_rank}, 0);
+        layer.wq_b    = create_tensor(tn(LLM_TENSOR_ATTN_Q_B,    "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0);
+        layer.attn_kv = create_tensor(tn(LLM_TENSOR_ATTN_KV,     "weight", i), {n_embd, n_embd_head_k}, 0);
+        layer.attn_wo_a = create_tensor(tn(LLM_TENSOR_ATTN_OUT_A, "weight", i), {n_head * n_embd_head_v / n_out_groups, n_out_groups * o_lora_rank}, 0);
+        layer.attn_wo_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_B, "weight", i), {n_out_groups * o_lora_rank, n_embd}, 0);
+
+        if (compress_ratio > 0) { // only layers with a non-zero ratio get an attention compressor
+            create_deepseek4_compressor(layer, i, compress_ratio, n_embd_head_k, false);
+        }
+        if (compress_ratio == 4) { // ratio-4 layers additionally get the indexer weights and an indexer compressor
+            layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, 0);
+            layer.indexer_proj     = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ,     "weight", i), {n_embd, hparams.indexer_n_head}, 0);
+            create_deepseek4_compressor(layer, i, compress_ratio, hparams.indexer_head_size, true);
+        }
+
+        layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
+        if (static_cast<uint32_t>(i) < hparams.n_hash_layers) { // first n_hash_layers layers: tid2eid required, exp_probs bias optional
+            layer.ffn_gate_tid2eid = create_tensor(tn(LLM_TENSOR_FFN_GATE_TID2EID, "weight", i), {n_expert_used, n_vocab}, 0);
+            layer.ffn_exp_probs_b  = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B,  "bias",   i), {n_expert}, TENSOR_NOT_REQUIRED);
+        } else { // remaining layers: exp_probs bias required, tid2eid optional
+            layer.ffn_exp_probs_b  = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B,  "bias",   i), {n_expert}, 0);
+            layer.ffn_gate_tid2eid = create_tensor(tn(LLM_TENSOR_FFN_GATE_TID2EID, "weight", i), {n_expert_used, n_vocab}, TENSOR_NOT_REQUIRED);
+        }
+
+        layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+        layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd,   n_expert}, 0);
+        layer.ffn_up_exps   = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS,   "weight", i), {n_embd,   n_ff_exp, n_expert}, 0);
+
+        layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd,   n_ff_exp * n_expert_shared}, 0); // shared-expert width is n_ff_exp * n_expert_shared
+        layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_exp * n_expert_shared, n_embd}, 0);
+        layer.ffn_up_shexp   = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP,   "weight", i), {n_embd,   n_ff_exp * n_expert_shared}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_deepseek4::build_arch_graph(const llm_graph_params & params) const { // graph-builder factory for this architecture
+    return std::make_unique<graph>(*this, params); // NOTE(review): `graph` is declared outside this chunk - presumably the DeepSeek V4 graph builder type; confirm in models.h
+}
+
+namespace {
+
+struct dsv4_hc_mix { // result bundle returned by dsv4_hc_pre
+    ggml_tensor * x;     // ggml_dsv4_hc_weighted_sum of the input with `pre`
+    ggml_tensor * mixes; // hc_fn applied to the RMS-normalised flattened input
+    ggml_tensor * pre;   // first n_hc entries of every split row
+    ggml_tensor * post;  // next n_hc entries of every split row
+    ggml_tensor * comb;  // remaining n_hc*n_hc entries, reshaped to [n_hc, n_hc, n_tokens]
+};
+
+struct dsv4_state_pair { // NOTE(review): not used in this part of the file - presumably a compressor's kv state together with its score state; confirm at the use sites
+    ggml_tensor * kv;
+    ggml_tensor * score;
+};
+
+struct dsv4_decode_compressor { // NOTE(review): not used in this part of the file - presumably the outputs of a decode-time compressor step; confirm at the use sites
+    ggml_tensor * kv_state;
+    ggml_tensor * score_state;
+    ggml_tensor * kv_comp;
+};
+
+struct dsv4_state_layout { // built by dsv4_make_state_layout (coff is 2 for ratio 4, else 1)
+    int64_t width; // coff * head_dim
+    int64_t rows;  // coff * compress_ratio
+    int64_t elems; // width * rows
+};
+
+enum class dsv4_mask_kind { // selects how dsv4_graph_inputs::set_input fills a mask
+    RAW_WINDOW,      // fill_raw_window only
+    COMPRESS_CAUSAL, // fill_compress_causal at column offset 0
+    ATTN_STATIC,     // fill_raw_window, then fill_compress_causal at column offset n_raw
+};
+
+struct dsv4_mask_entry { // one mask registered through dsv4_graph_inputs::add_mask
+    ggml_tensor   * tensor = nullptr;                    // input tensor that receives the mask data
+    dsv4_mask_kind  kind   = dsv4_mask_kind::RAW_WINDOW; // fill strategy; defaulted so that no member is left uninitialized
+    int64_t         n_raw  = 0;                          // column offset of the compressed part for ATTN_STATIC
+    int64_t         n_comp = 0;                          // stored by add_mask; not read by set_input
+    int64_t         window = 0;                          // window for fill_raw_window (<= 0 disables the window limit)
+    int64_t         ratio  = 0;                          // ratio for fill_compress_causal (must be > 0 when used)
+};
+
+// Graph input owning the DeepSeek V4 attention masks: add_mask() registers an
+// F32 input tensor plus the recipe for filling it, set_input() recomputes the
+// mask contents on the host from the ubatch positions and uploads them.
+class dsv4_graph_inputs : public llm_graph_input_i {
+public:
+    // Creates the [n0, n1, 1, 1] mask tensor `name`, remembers its fill
+    // parameters and returns it.
+    ggml_tensor * add_mask(
+            ggml_context * ctx, dsv4_mask_kind kind,
+            int64_t n0, int64_t n1, int64_t n_raw, int64_t n_comp,
+            int64_t window, int64_t ratio, const char * name) {
+        ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n0, n1, 1, 1);
+        ggml_set_input(mask);
+        ggml_set_name(mask, name);
+
+        masks.push_back({ mask, kind, n_raw, n_comp, window, ratio });
+        return mask;
+    }
+
+    // Every mask starts fully blocked (-INFINITY); the fill helpers then open
+    // the visible entries with 0.0f. Masks without a backend buffer are skipped.
+    void set_input(const llama_ubatch * ubatch) override {
+        for (const dsv4_mask_entry & mask : masks) {
+            GGML_ASSERT(mask.tensor != nullptr);
+            if (mask.tensor->buffer == nullptr) {
+                continue;
+            }
+
+            const int64_t n_keys    = mask.tensor->ne[0];
+            const int64_t n_queries = mask.tensor->ne[1];
+
+            std::vector<float> host(n_keys*n_queries, -INFINITY);
+
+            switch (mask.kind) {
+                case dsv4_mask_kind::COMPRESS_CAUSAL:
+                    fill_compress_causal(host, n_keys, n_queries, mask.ratio, 0, ubatch);
+                    break;
+                case dsv4_mask_kind::RAW_WINDOW:
+                    fill_raw_window(host, n_keys, n_queries, mask.window, ubatch);
+                    break;
+                case dsv4_mask_kind::ATTN_STATIC:
+                    // raw window columns first, compressed columns from n_raw on
+                    fill_raw_window(host, n_keys, n_queries, mask.window, ubatch);
+                    fill_compress_causal(host, n_keys, n_queries, mask.ratio, mask.n_raw, ubatch);
+                    break;
+            }
+
+            ggml_backend_tensor_set(mask.tensor, host.data(), 0, host.size()*sizeof(float));
+        }
+    }
+
+private:
+    // Opens entry (iq, ik) when key position p0 is not after query position p1
+    // and, for window > 0, lies less than `window` positions behind it. Only
+    // the first min(n0, n_tokens) columns are touched; positions default to
+    // the token index when the ubatch carries no pos array.
+    static void fill_raw_window(
+            std::vector<float> & data, int64_t n0, int64_t n1, int64_t window,
+            const llama_ubatch * ubatch) {
+        GGML_ASSERT((int64_t) ubatch->n_tokens == n1);
+
+        const int64_t n_cols = std::min<int64_t>(n0, ubatch->n_tokens);
+        const auto pos_of = [ubatch](int64_t i) {
+            return ubatch->pos ? ubatch->pos[i] : (llama_pos) i;
+        };
+
+        for (int64_t iq = 0; iq < n1; ++iq) {
+            const llama_pos p1 = pos_of(iq);
+
+            for (int64_t ik = 0; ik < n_cols; ++ik) {
+                const llama_pos p0 = pos_of(ik);
+
+                const bool visible = p0 <= p1 && !(window > 0 && p1 - p0 >= window);
+                if (visible) {
+                    data[iq*n0 + ik] = 0.0f;
+                }
+            }
+        }
+    }
+
+    // Opens the first min(n0 - offset, (p1 + 1) / ratio) columns after `offset`
+    // in every query row, p1 being the query position.
+    static void fill_compress_causal(
+            std::vector<float> & data, int64_t n0, int64_t n1, int64_t ratio, int64_t offset,
+            const llama_ubatch * ubatch) {
+        GGML_ASSERT(ratio > 0);
+
+        for (int64_t iq = 0; iq < n1; ++iq) {
+            const llama_pos p1 = ubatch->pos ? ubatch->pos[iq] : (llama_pos) iq;
+            const int64_t n_open = std::min<int64_t>(n0 - offset, (p1 + 1) / ratio);
+            if (n_open > 0) {
+                std::fill_n(data.begin() + (iq*n0 + offset), n_open, 0.0f);
+            }
+        }
+    }
+
+    std::vector<dsv4_mask_entry> masks;
+};
+
+struct dsv4_rope_cfg { // RoPE settings produced by dsv4_make_rope_cfg
+    int32_t n_ctx_orig;  // 0 for compress ratio 0, otherwise cparams.n_ctx_orig_yarn
+    float   freq_base;   // training base for ratio 0, otherwise the compress base or the context base
+    float   freq_scale;  // 1.0f for ratio 0, otherwise cparams.rope_freq_scale
+    float   ext_factor;  // 0.0f for ratio 0, otherwise cparams.yarn_ext_factor
+    float   attn_factor; // 1.0f unless dsv4_make_rope_cfg divides it down for YaRN
+    float   beta_fast;   // cparams.yarn_beta_fast
+    float   beta_slow;   // cparams.yarn_beta_slow
+};
+
+static ggml_tensor * dsv4_view_scale(ggml_context * ctx, ggml_tensor * scale, int64_t idx) { // 1x1 view of element idx of scale
+    return ggml_view_2d(ctx, scale, 1, 1, scale->nb[0], idx * scale->nb[0]); // byte offset = idx elements
+}
+
+static ggml_tensor * dsv4_add_scalar(ggml_context * ctx, ggml_tensor * x, float value) {
+    // Element-wise x + value, computed on a flattened contiguous copy and
+    // reshaped back to the shape of the input tensor.
+    ggml_tensor * flat    = ggml_reshape_1d(ctx, ggml_cont(ctx, x), ggml_nelements(x));
+    ggml_tensor * shifted = ggml_scale_bias(ctx, flat, 1.0f, value);
+    return ggml_reshape(ctx, shifted, x);
+}
+
+static ggml_tensor * dsv4_mul_scalar(ggml_context * ctx, ggml_tensor * x, float value) {
+    // Element-wise x * value, computed on a flattened contiguous copy and
+    // reshaped back to the shape of the input tensor.
+    ggml_tensor * flat   = ggml_reshape_1d(ctx, ggml_cont(ctx, x), ggml_nelements(x));
+    ggml_tensor * scaled = ggml_scale(ctx, flat, value);
+    return ggml_reshape(ctx, scaled, x);
+}
+
+static ggml_tensor * dsv4_arange_i32(ggml_context * ctx, int64_t begin, int64_t end) {
+    // Step-1 range [begin, end) generated in float by ggml_arange, then cast to I32.
+    return ggml_cast(ctx, ggml_arange(ctx, (float) begin, (float) end, 1.0f), GGML_TYPE_I32);
+}
+
+static ggml_tensor * dsv4_new_filled_2d(ggml_context * ctx, int64_t n0, int64_t n1, float value) { // new F32 [n0, n1] tensor passed through ggml_fill with `value`
+    return ggml_fill(ctx, ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n0, n1), value);
+}
+
+static ggml_tensor * dsv4_new_filled_3d(ggml_context * ctx, int64_t n0, int64_t n1, int64_t n2, float value) { // new F32 [n0, n1, n2] tensor passed through ggml_fill with `value`
+    return ggml_fill(ctx, ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n0, n1, n2), value);
+}
+
+static dsv4_state_layout dsv4_make_state_layout(int64_t compress_ratio, int64_t head_dim) {
+    const int64_t mult    = compress_ratio == 4 ? 2 : 1; // ratio 4 doubles both dimensions
+    const int64_t n_width = mult * head_dim;
+    const int64_t n_rows  = mult * compress_ratio;
+    return dsv4_state_layout{ n_width, n_rows, n_width * n_rows };
+}
+
+// 2D view of n0 x n1 elements of x that starts at element off0 of row off1
+// and keeps the row stride of x.
+static ggml_tensor * dsv4_view_cols(
+        ggml_context * ctx, ggml_tensor * x,
+        int64_t n0, int64_t n1, int64_t off0, int64_t off1) {
+    const size_t row_nb   = x->nb[1];
+    const size_t byte_off = off1*row_nb + off0*x->nb[0];
+    return ggml_view_2d(ctx, x, n0, n1, row_nb, byte_off);
+}
+
+// Views `rows` back-to-back runs of `width` elements of `state`, starting at
+// element `offset`, as a [width, rows] matrix (row stride = width elements).
+static ggml_tensor * dsv4_view_state_segment(
+        ggml_context * ctx, ggml_tensor * state,
+        int64_t offset, int64_t width, int64_t rows) {
+    const size_t elem_nb = state->nb[0];
+    return ggml_view_2d(ctx, state, width, rows, width*elem_nb, offset*elem_nb);
+}
+
+// Schedules a copy of all elements of `src` into `dst`, starting at element
+// head*state_size + offset of `dst`.
+static void dsv4_store_state_segment(
+        ggml_context * ctx, ggml_cgraph * gf,
+        ggml_tensor  * src, ggml_tensor * dst,
+        int64_t state_size, int64_t head, int64_t offset) {
+    const int64_t n_elems  = ggml_nelements(src);
+    const size_t  byte_off = (head*state_size + offset)*ggml_element_size(dst);
+
+    // flatten a contiguous copy of the source and copy it into a 1-D window of dst
+    ggml_tensor * flat   = ggml_reshape_1d(ctx, ggml_cont(ctx, src), n_elems);
+    ggml_tensor * target = ggml_view_1d(ctx, dst, n_elems, byte_off);
+
+    ggml_build_forward_expand(gf, ggml_cpy(ctx, flat, target));
+}
+
+// Schedules a copy of `src` into rows [row_start, row_start + n_rows) of
+// `cache`. Nothing is added to the graph when n_rows <= 0.
+static void dsv4_store_cache_rows(
+        ggml_context * ctx, ggml_cgraph * gf,
+        ggml_tensor  * cache, ggml_tensor * src,
+        int64_t row_start, int64_t n_rows) {
+    if (n_rows > 0) {
+        const int64_t row_len = cache->ne[0];
+        const size_t  row_nb  = cache->nb[1];
+
+        // the source is reinterpreted as n_rows rows of the cache row length
+        ggml_tensor * rows = ggml_reshape_2d(ctx, ggml_cont(ctx, src), row_len, n_rows);
+
+        // ggml_set_rows is deliberately not used here: on multi-GPU the
+        // scheduler routes set_rows by the SOURCE device while the cache
+        // destination has its own device affinity, which leads to an illegal
+        // memory access when the two differ. A ggml_cpy into a contiguous
+        // view of the cache is routed by the destination affinity instead
+        // (the pattern dsv4_store_state_segment also uses).
+        ggml_tensor * dst_rows = ggml_view_2d(ctx, cache, row_len, n_rows,
+                row_nb, row_start * row_nb);
+
+        ggml_build_forward_expand(gf, ggml_cpy(ctx, rows, dst_rows));
+    }
+}
+
+// Builds the RoPE parameters for a layer with the given compress ratio.
+// Ratio 0 uses the training frequency base with unit scale and no YaRN
+// extension; any other ratio uses the context's YaRN settings.
+static dsv4_rope_cfg dsv4_make_rope_cfg(
+        const llama_hparams & hparams,
+        const llama_cparams  & cparams,
+        uint32_t              compress_ratio) {
+    dsv4_rope_cfg cfg;
+    cfg.beta_fast = cparams.yarn_beta_fast;
+    cfg.beta_slow = cparams.yarn_beta_slow;
+
+    if (compress_ratio == 0) {
+        cfg.n_ctx_orig  = 0;
+        cfg.freq_base   = hparams.rope_freq_base_train;
+        cfg.freq_scale  = 1.0f;
+        cfg.ext_factor  = 0.0f;
+        cfg.attn_factor = 1.0f;
+        return cfg;
+    }
+
+    const bool yarn_on  = cparams.yarn_ext_factor != 0.0f && cparams.rope_freq_scale > 0.0f;
+    const bool own_base = hparams.compress_rope_freq_base > 0.0f;
+
+    cfg.n_ctx_orig  = (int32_t) cparams.n_ctx_orig_yarn;
+    cfg.freq_base   = own_base ? hparams.compress_rope_freq_base : cparams.rope_freq_base;
+    cfg.freq_scale  = cparams.rope_freq_scale;
+    cfg.ext_factor  = cparams.yarn_ext_factor;
+
+    // DeepSeek V4 interpolates the compressed-RoPE frequencies YaRN-style, but
+    // the reference implementation does not apply YaRN's magnitude scale.
+    cfg.attn_factor = yarn_on ? 1.0f / (1.0f + 0.1f * std::log(1.0f / cparams.rope_freq_scale)) : 1.0f;
+    return cfg;
+}
+
+static ggml_tensor * dsv4_view_base(ggml_context * ctx, ggml_tensor * base, int64_t n, int64_t off) { // [n, 1] view of n elements of base starting at element off
+    return ggml_view_2d(ctx, base, n, 1, base->nb[0], off * base->nb[0]);
+}
+
+// Applies RoPE to x, or the inverse rotation when `inverse` is set.
+// x must be [n_embd_head, n_head, n_tokens]. When n_rot covers the whole
+// head the stock ggml_rope_ext / ggml_rope_ext_back ops are used; otherwise
+// n_rot has to be smaller than the head size and the call is forwarded to
+// ggml_dsv4_rope_tail.
+static ggml_tensor * dsv4_apply_rope_tail(
+        ggml_context * ctx, ggml_tensor * x, ggml_tensor * inp_pos,
+        int64_t n_embd_head, int64_t n_head, int64_t n_tokens, int64_t n_rot,
+        int rope_type, int32_t n_ctx_orig,
+        float freq_base, float freq_scale, float ext_factor, float attn_factor,
+        float beta_fast, float beta_slow, bool inverse) {
+    GGML_ASSERT(x->ne[0] == n_embd_head);
+    GGML_ASSERT(x->ne[1] == n_head);
+    GGML_ASSERT(x->ne[2] == n_tokens);
+
+    if (n_rot != n_embd_head) {
+        // partial rotation: n_rot must leave at least one head dimension over
+        const int64_t n_nope = n_embd_head - n_rot;
+        GGML_ASSERT(n_nope > 0);
+
+        return ggml_dsv4_rope_tail(ctx, x, inp_pos, nullptr, n_rot, rope_type,
+                n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor,
+                beta_fast, beta_slow, inverse);
+    }
+
+    // the rotation spans the whole head
+    if (inverse) {
+        return ggml_rope_ext_back(ctx, x, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+                freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    }
+
+    return ggml_rope_ext(ctx, x, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig,
+            freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+}
+
+// Hyper-connection pre step. The input is flattened to [n_embd*n_hc, n_tokens],
+// RMS-normalised and projected with hc_fn; ggml_dsv4_hc_split_sinkhorn turns
+// the projection into the pre / post / comb parts. Returns the pre-weighted
+// sum of x together with the mixes and the three parts.
+static dsv4_hc_mix dsv4_hc_pre(
+        ggml_context * ctx, ggml_tensor * x,
+        ggml_tensor * hc_fn, ggml_tensor * hc_scale, ggml_tensor * hc_base,
+        int64_t n_embd, int64_t n_hc, int64_t n_tokens,
+        float norm_eps, int sinkhorn_iters, float hc_eps) {
+    ggml_tensor * flat   = ggml_cont(ctx, ggml_reshape_2d(ctx, x, n_embd * n_hc, n_tokens));
+    ggml_tensor * normed = ggml_rms_norm(ctx, flat, norm_eps);
+    ggml_tensor * mixes  = ggml_mul_mat(ctx, hc_fn, normed); // [mix_hc, n_tokens]
+    ggml_tensor * split  = ggml_dsv4_hc_split_sinkhorn(ctx, mixes, hc_scale, hc_base, n_hc, sinkhorn_iters, hc_eps);
+
+    // every split row is read as n_hc pre, n_hc post and n_hc*n_hc comb entries
+    const size_t row_nb  = split->nb[1];
+    const size_t elem_nb = split->nb[0];
+    ggml_tensor * pre  = ggml_view_2d(ctx, split, n_hc,        n_tokens, row_nb, 0);
+    ggml_tensor * post = ggml_view_2d(ctx, split, n_hc,        n_tokens, row_nb, n_hc * elem_nb);
+    ggml_tensor * comb = ggml_view_2d(ctx, split, n_hc * n_hc, n_tokens, row_nb, 2 * n_hc * elem_nb);
+    if (n_tokens != 1) {
+        // with several tokens the views are copied into contiguous tensors
+        pre  = ggml_cont(ctx, pre);
+        post = ggml_cont(ctx, post);
+        comb = ggml_cont(ctx, comb);
+    }
+    comb = ggml_reshape_3d(ctx, comb, n_hc, n_hc, n_tokens); // [src_hc, dst_hc, n_tokens]
+    return { ggml_dsv4_hc_weighted_sum(ctx, x, pre), mixes, pre, post, comb };
+}
+
+// Hyper-connection post step: checks the operand shapes and forwards to
+// ggml_dsv4_hc_expand. Expected shapes: x [n_embd, n_tokens], residual
+// [n_embd, n_hc, n_tokens], post [n_hc, n_tokens], comb [n_hc, n_hc, n_tokens].
+static ggml_tensor * dsv4_hc_post(
+        ggml_context * ctx, ggml_tensor * x, ggml_tensor * residual,
+        ggml_tensor * post, ggml_tensor * comb,
+        int64_t n_embd, int64_t n_hc, int64_t n_tokens) {
+    GGML_ASSERT(x->ne[0] == n_embd);
+    GGML_ASSERT(x->ne[1] == n_tokens);
+
+    GGML_ASSERT(residual->ne[0] == n_embd);
+    GGML_ASSERT(residual->ne[1] == n_hc);
+    GGML_ASSERT(residual->ne[2] == n_tokens);
+
+    GGML_ASSERT(post->ne[0] == n_hc);
+    GGML_ASSERT(post->ne[1] == n_tokens);
+    GGML_ASSERT(comb->ne[0] == n_hc);
+    GGML_ASSERT(comb->ne[1] == n_hc);
+    GGML_ASSERT(comb->ne[2] == n_tokens);
+
+    return ggml_dsv4_hc_expand(ctx, x, residual, post, comb);
+}
+
+// Hyper-connection head. The input is flattened to [n_embd*n_hc, n_tokens]
+// and RMS-normalised, the hc_fn projection is turned into weights as
+// sigmoid(proj * hc_scale[0] + hc_base[0..n_hc)) + hc_eps, and the weights
+// are applied to x with ggml_dsv4_hc_weighted_sum.
+static ggml_tensor * dsv4_hc_head(
+        ggml_context * ctx, ggml_tensor * x,
+        ggml_tensor * hc_fn, ggml_tensor * hc_scale, ggml_tensor * hc_base,
+        int64_t n_embd, int64_t n_hc, int64_t n_tokens,
+        float norm_eps, float hc_eps) {
+    ggml_tensor * flat   = ggml_cont(ctx, ggml_reshape_2d(ctx, x, n_embd * n_hc, n_tokens));
+    ggml_tensor * normed = ggml_rms_norm(ctx, flat, norm_eps);
+
+    // scale by the first hc_scale element, then add the first n_hc base values
+    ggml_tensor * logits = ggml_mul_mat(ctx, hc_fn, normed); // [hc, n_tokens]
+    logits = ggml_mul(ctx, logits, dsv4_view_scale(ctx, hc_scale, 0));
+    logits = ggml_add(ctx, logits, dsv4_view_base(ctx, hc_base, n_hc, 0));
+
+    // hc_eps is added after the sigmoid
+    ggml_tensor * weights = dsv4_add_scalar(ctx, ggml_sigmoid(ctx, logits), hc_eps);
+
+    return ggml_dsv4_hc_weighted_sum(ctx, x, weights);
+}
+
+
+// Grouped low-rank attention output projection: the heads of `o` are split
+// into n_groups groups, every group is projected with its own slice of wo_a
+// (selected through ggml_mul_mat_id) and wo_b is applied to the per-group
+// results laid out as [o_lora_rank * n_groups, n_tokens].
+static ggml_tensor * dsv4_grouped_out(
+        ggml_context * ctx, ggml_tensor * o, ggml_tensor * wo_a, ggml_tensor * wo_b,
+        int64_t n_embd_head, int64_t n_head, int64_t n_groups,
+        int64_t o_lora_rank, int64_t n_tokens) {
+    GGML_ASSERT(n_groups > 0); // guards the modulo and the division below
+    GGML_ASSERT(n_head % n_groups == 0);
+
+    const int64_t group_heads = n_head / n_groups;
+    const int64_t group_dim   = n_embd_head * group_heads;
+
+    o = ggml_cont(ctx, o);
+    o = ggml_reshape_3d(ctx, o, group_dim, n_groups, n_tokens);
+
+    ggml_tensor * wo_a_g = ggml_reshape_3d(ctx, wo_a, group_dim, o_lora_rank, n_groups);
+
+    // group g always uses slice g of wo_a: ids is 0..n_groups-1 for every token
+    ggml_tensor * ids = dsv4_arange_i32(ctx, 0, n_groups);
+    ids = ggml_repeat_4d(ctx, ids, n_groups, n_tokens, 1, 1);
+
+    ggml_tensor * low = ggml_mul_mat_id(ctx, wo_a_g, o, ids); // [o_lora_rank, n_groups, n_tokens]
+    low = ggml_reshape_2d(ctx, low, o_lora_rank * n_groups, n_tokens);
+
+    return ggml_mul_mat(ctx, wo_b, low);
+}
+
+// Softmax-weighted pooling: `kv` is multiplied by ggml_soft_max(score),
+// reduced with ggml_sum_rows and returned as a [kv->ne[1], kv->ne[2]] matrix.
+static ggml_tensor * dsv4_softmax_pool_ratio(ggml_context * ctx, ggml_tensor * kv, ggml_tensor * score) {
+    ggml_tensor * weights  = ggml_soft_max(ctx, score);
+    ggml_tensor * weighted = ggml_mul(ctx, kv, weights);
+    ggml_tensor * summed   = ggml_sum_rows(ctx, weighted);
+
+    return ggml_reshape_2d(ctx, summed, kv->ne[1], kv->ne[2]);
+}
+
+// Shifts x by one slice along dim 2: the result starts with a slice filled
+// with pad_value, followed by the first ne[2] - 1 slices of x.
+static ggml_tensor * dsv4_shift_overlap_state(
+        ggml_context * ctx, ggml_tensor * x, float pad_value) {
+    const int64_t n_slices = x->ne[2];
+
+    // view of the first `count` slices of x
+    const auto head_slices = [&](int64_t count) {
+        return ggml_view_3d(ctx, x, x->ne[0], x->ne[1], count, x->nb[1], x->nb[2], 0);
+    };
+
+    // the pad slice takes its shape from slice 0 and is then overwritten
+    ggml_tensor * pad = ggml_fill(ctx, ggml_cont(ctx, head_slices(1)), pad_value);
+    if (n_slices == 1) {
+        return pad;
+    }
+
+    ggml_tensor * prev = head_slices(n_slices - 1);
+    return ggml_concat(ctx, pad, prev, 2);
+}
+
+static ggml_tensor * dsv4_build_compressor_prefill( // Pool every full window of compress_ratio tokens of x into one normed, roped row; returns [n_embd_head, 1, n_comp].
+        ggml_context       * ctx,
+        ggml_tensor        * x,     // activations that wkv and wgate are applied to
+        ggml_tensor        * wkv,   // projection producing the values to be pooled
+        ggml_tensor        * wgate, // projection producing the pooling scores
+        ggml_tensor        * ape,   // added to the scores per window; cast to F32 when stored in another type
+        ggml_tensor        * norm,  // weight multiplied in after the RMS norm
+        ggml_tensor        * pos,   // positions forwarded to dsv4_apply_rope_tail, one per compressed row
+        int64_t              n_embd_head,
+        int64_t              n_rot,
+        int64_t              n_tokens,
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps) {
+    GGML_ASSERT(compress_ratio > 0);
+    const int64_t n_comp = n_tokens / compress_ratio; // number of complete windows; tokens past the last full window are ignored here
+    GGML_ASSERT(n_comp > 0);
+
+    const int64_t coff = compress_ratio == 4 ? 2 : 1; // ratio 4 uses a projection twice as wide (two halves, see the else branch)
+    const int64_t n_kv = coff * n_embd_head;
+    const int64_t cutoff = n_comp * compress_ratio; // token count covered by the windowed views below
+
+    ggml_tensor * kv = ggml_mul_mat(ctx, wkv, x);       // [coff*head_dim, n_tokens]
+    ggml_tensor * score = ggml_mul_mat(ctx, wgate, x);  // [coff*head_dim, n_tokens]
+
+    kv = ggml_view_3d(ctx, kv, n_kv, compress_ratio, n_comp, // regroup tokens as [n_kv, ratio, n_comp] without copying
+            kv->nb[1],
+            kv->nb[1] * compress_ratio,
+            0);
+    score = ggml_view_3d(ctx, score, n_kv, compress_ratio, n_comp, // same windowed layout for the scores
+            score->nb[1],
+            score->nb[1] * compress_ratio,
+            0);
+    GGML_ASSERT(cutoff <= n_tokens); // the views above span exactly cutoff tokens
+
+    ggml_tensor * ape_f = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+    score = ggml_add(ctx, score, ggml_repeat(ctx, ape_f, score)); // ape is repeated to the score shape, i.e. reused for every window
+
+    if (coff == 1) {
+        kv = ggml_cont(ctx, ggml_permute(ctx, kv, 1, 0, 2, 3));       // [ratio, head_dim, n_comp]
+        score = ggml_cont(ctx, ggml_permute(ctx, score, 1, 0, 2, 3)); // [ratio, head_dim, n_comp]
+        kv = dsv4_softmax_pool_ratio(ctx, kv, score);                // [head_dim, n_comp]
+    } else {
+        ggml_tensor * kv_prev = ggml_view_3d(ctx, kv, n_embd_head, compress_ratio, n_comp, // first n_embd_head columns of each token
+                kv->nb[1], kv->nb[2], 0);
+        ggml_tensor * kv_curr = ggml_view_3d(ctx, kv, n_embd_head, compress_ratio, n_comp, // second n_embd_head columns (byte offset n_embd_head*nb[0])
+                kv->nb[1], kv->nb[2], n_embd_head * kv->nb[0]);
+        ggml_tensor * score_prev = ggml_view_3d(ctx, score, n_embd_head, compress_ratio, n_comp,
+                score->nb[1], score->nb[2], 0);
+        ggml_tensor * score_curr = ggml_view_3d(ctx, score, n_embd_head, compress_ratio, n_comp,
+                score->nb[1], score->nb[2], n_embd_head * score->nb[0]);
+
+        kv_prev    = dsv4_shift_overlap_state(ctx, kv_prev,    0.0f);      // window i now sees window i-1's first half; window 0 sees zeros
+        score_prev = dsv4_shift_overlap_state(ctx, score_prev, -INFINITY); // -inf scores for the padded window 0 entries
+
+        kv_prev    = ggml_cont(ctx, ggml_permute(ctx, kv_prev,    1, 0, 2, 3)); // [ratio, head_dim, n_comp]
+        kv_curr    = ggml_cont(ctx, ggml_permute(ctx, kv_curr,    1, 0, 2, 3));
+        score_prev = ggml_cont(ctx, ggml_permute(ctx, score_prev, 1, 0, 2, 3));
+        score_curr = ggml_cont(ctx, ggml_permute(ctx, score_curr, 1, 0, 2, 3));
+
+        kv    = ggml_concat(ctx, kv_prev,    kv_curr,    0); // [2*ratio, head_dim, n_comp]
+        score = ggml_concat(ctx, score_prev, score_curr, 0);
+        kv = dsv4_softmax_pool_ratio(ctx, kv, score);        // [head_dim, n_comp]
+    }
+
+    kv = ggml_rms_norm(ctx, kv, norm_eps);
+    kv = ggml_mul(ctx, kv, norm);
+    kv = ggml_reshape_3d(ctx, kv, n_embd_head, 1, n_comp); // one "head" per compressed row, the layout passed to the rope helper
+
+    kv = dsv4_apply_rope_tail(ctx, kv, pos,
+            n_embd_head, 1, n_comp, n_rot, rope_type,
+            rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+            rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+
+    return kv;
+}
+
+static dsv4_state_pair dsv4_build_compressor_prefill_state( // Build the {kv, score} state tensors left over after a prefill of n_tokens tokens.
+        ggml_context * ctx,
+        ggml_tensor  * x,     // activations that wkv and wgate are applied to
+        ggml_tensor  * wkv,
+        ggml_tensor  * wgate,
+        ggml_tensor  * ape,   // added to the scores; cast to F32 when stored in another type
+        int64_t        head_dim, // only used to derive the state layout
+        int64_t        n_tokens,
+        int64_t        compress_ratio) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+
+    const int64_t cutoff    = (n_tokens / compress_ratio) * compress_ratio; // tokens covered by complete windows
+    const int64_t remainder = n_tokens - cutoff;                            // tokens of the trailing partial window, always < compress_ratio
+
+    ggml_tensor * kv    = ggml_mul_mat(ctx, wkv,    x); // [width, n_tokens]
+    ggml_tensor * score = ggml_mul_mat(ctx, wgate,  x);
+    ggml_tensor * ape_f = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+
+    if (compress_ratio == 4) { // ratio 4 state = [previous window | current window], each compress_ratio rows
+        ggml_tensor * kv_prev    = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, 0.0f);      // default when no full window exists yet
+        ggml_tensor * score_prev = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, -INFINITY);
+
+        if (cutoff >= compress_ratio) { // at least one full window: its last instance becomes the "previous" half
+            kv_prev = ggml_view_2d(ctx, kv, layout.width, compress_ratio, kv->nb[1], (cutoff - compress_ratio)*kv->nb[1]);
+            score_prev = ggml_view_2d(ctx, score, layout.width, compress_ratio, score->nb[1], (cutoff - compress_ratio)*score->nb[1]);
+            score_prev = ggml_add(ctx, score_prev, ape_f); // NOTE(review): assumes ape_f is [width, compress_ratio] (or broadcastable) - confirm
+        }
+
+        ggml_tensor * kv_curr    = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, 0.0f);      // default when there is no partial window
+        ggml_tensor * score_curr = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, -INFINITY);
+
+        if (remainder > 0) {
+            ggml_tensor * kv_rem = ggml_view_2d(ctx, kv, layout.width, remainder, kv->nb[1], cutoff*kv->nb[1]); // tokens after the last full window
+            ggml_tensor * sc_rem = ggml_view_2d(ctx, score, layout.width, remainder, score->nb[1], cutoff*score->nb[1]);
+            sc_rem = ggml_add(ctx, sc_rem, ggml_view_2d(ctx, ape_f, layout.width, remainder, ape_f->nb[1], 0)); // first `remainder` rows of ape
+
+            if (remainder == compress_ratio) { // NOTE(review): cannot be true, remainder is n_tokens modulo compress_ratio
+                kv_curr = kv_rem;
+                score_curr = sc_rem;
+            } else {
+                kv_curr = ggml_concat(ctx, kv_rem, // pad the partial window up to compress_ratio rows
+                        dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, 0.0f), 1);
+                score_curr = ggml_concat(ctx, sc_rem,
+                        dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, -INFINITY), 1);
+            }
+        }
+
+        return {
+            ggml_concat(ctx, kv_prev,    kv_curr,    1), // [width, 2*compress_ratio]
+            ggml_concat(ctx, score_prev, score_curr, 1),
+        };
+    }
+
+    ggml_tensor * kv_state    = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, 0.0f);      // other ratios keep only the current window
+    ggml_tensor * score_state = dsv4_new_filled_2d(ctx, layout.width, compress_ratio, -INFINITY);
+
+    if (remainder > 0) {
+        ggml_tensor * kv_rem = ggml_view_2d(ctx, kv, layout.width, remainder, kv->nb[1], cutoff*kv->nb[1]); // tokens after the last full window
+        ggml_tensor * sc_rem = ggml_view_2d(ctx, score, layout.width, remainder, score->nb[1], cutoff*score->nb[1]);
+        sc_rem = ggml_add(ctx, sc_rem, ggml_view_2d(ctx, ape_f, layout.width, remainder, ape_f->nb[1], 0)); // first `remainder` rows of ape
+
+        if (remainder == compress_ratio) { // NOTE(review): cannot be true, same reasoning as in the ratio 4 branch
+            kv_state = kv_rem;
+            score_state = sc_rem;
+        } else {
+            kv_state = ggml_concat(ctx, kv_rem, // pad the partial window up to compress_ratio rows
+                    dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, 0.0f), 1);
+            score_state = ggml_concat(ctx, sc_rem,
+                    dsv4_new_filled_2d(ctx, layout.width, compress_ratio - remainder, -INFINITY), 1);
+        }
+    }
+
+    return { kv_state, score_state }; // [width, compress_ratio] each
+}
+
+static ggml_tensor * dsv4_pool_decode_state( // Pool a 2-D kv/score state into one row, then RMS-norm, scale by norm and rope it; returns [head_dim, 1, 1].
+        ggml_context * ctx,
+        ggml_tensor  * kv,    // 2-D state; must hold head_dim * ne[1] elements for the reshape below to be valid
+        ggml_tensor  * score, // scores, reshaped the same way as kv
+        ggml_tensor  * norm,  // weight multiplied in after the RMS norm
+        ggml_tensor  * pos,   // position forwarded to dsv4_apply_rope_tail
+        int64_t        head_dim,
+        int64_t        n_rot,
+        int            rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float          norm_eps) {
+    const int64_t n_rows = kv->ne[1];
+    kv    = ggml_reshape_3d(ctx, ggml_cont(ctx, ggml_transpose(ctx, kv)),    n_rows, head_dim, 1); // rows become dim 0, as a single window
+    score = ggml_reshape_3d(ctx, ggml_cont(ctx, ggml_transpose(ctx, score)), n_rows, head_dim, 1);
+
+    ggml_tensor * pooled = dsv4_softmax_pool_ratio(ctx, kv, score);
+    pooled = ggml_rms_norm(ctx, pooled, norm_eps);
+    pooled = ggml_mul(ctx, pooled, norm);
+    pooled = ggml_reshape_3d(ctx, pooled, head_dim, 1, 1); // one head, one token: the layout passed to the rope helper
+
+    return dsv4_apply_rope_tail(ctx, pooled, pos,
+            head_dim, 1, 1, n_rot, rope_type,
+            rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+            rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+}
+
+static dsv4_decode_compressor dsv4_build_compressor_decode_projected( // Forward declaration; the definition follows dsv4_build_compressor_decode, which calls it.
+        ggml_context       * ctx,
+        ggml_tensor        * kv_cur,           // already-projected kv row of the current token
+        ggml_tensor        * sc_cur,           // already-projected score row of the current token
+        ggml_tensor        * prev_kv_state,
+        ggml_tensor        * prev_score_state,
+        ggml_tensor        * norm,
+        int64_t              head_dim,
+        int64_t              n_rot,
+        int64_t              pos,
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps);
+
+static dsv4_decode_compressor dsv4_build_compressor_decode( // Single-token decode step: project x, add the ape row for this position, then update the state.
+        ggml_context       * ctx,
+        ggml_tensor        * x,                // activations of the current token
+        ggml_tensor        * prev_kv_state,    // state tensors forwarded unchanged to the projected variant
+        ggml_tensor        * prev_score_state,
+        ggml_tensor        * wkv,
+        ggml_tensor        * wgate,
+        ggml_tensor        * ape,              // cast to F32 when stored in another type
+        ggml_tensor        * norm,
+        int64_t              head_dim,
+        int64_t              n_rot,
+        int64_t              pos,              // position of the current token
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+    const int64_t pos_mod = pos % compress_ratio; // index of the token inside its window; selects the ape row
+
+    ggml_tensor * kv_cur = ggml_mul_mat(ctx, wkv, x);       // [width, 1]
+    ggml_tensor * sc_cur = ggml_mul_mat(ctx, wgate, x);
+    ggml_tensor * ape_f  = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+    sc_cur = ggml_add(ctx, sc_cur, ggml_view_2d(ctx, ape_f, layout.width, 1, ape_f->nb[1], pos_mod*ape_f->nb[1])); // add row pos_mod of ape
+
+    return dsv4_build_compressor_decode_projected(ctx, // state update and optional compression happen there
+            kv_cur, sc_cur,
+            prev_kv_state, prev_score_state,
+            norm,
+            head_dim, n_rot, pos, compress_ratio,
+            rope_type, rope_cfg, norm_eps);
+}
+
+static dsv4_decode_compressor dsv4_build_compressor_decode_projected( // Write one projected token into the state and, on the last position of a window, emit a compressed row.
+        ggml_context       * ctx,
+        ggml_tensor        * kv_cur,           // projected kv row of the current token
+        ggml_tensor        * sc_cur,           // projected score row (ape already added by the callers in this file)
+        ggml_tensor        * prev_kv_state,    // written in place through a row view
+        ggml_tensor        * prev_score_state, // written in place through a row view
+        ggml_tensor        * norm,
+        int64_t              head_dim,
+        int64_t              n_rot,
+        int64_t              pos,              // position of the current token
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+    const int64_t pos_mod = pos % compress_ratio;
+    const int64_t row = compress_ratio == 4 ? compress_ratio + pos_mod : pos_mod; // ratio 4 writes into the second group of compress_ratio rows
+    const bool should_compress = (pos + 1) % compress_ratio == 0;                 // true when this token completes a window
+
+    // Single-row write via cpy-into-view. ggml_set_rows would crash on
+    // multi-GPU (sched routes by src device while dst is on a different
+    // device; see dsv4_store_cache_rows for the same problem and fix).
+    //
+    // We need to return a FULL-shape view of dst (downstream code at
+    // dsv4_view_cols slices the full state by columns/rows) AND establish
+    // a data dependency on the cpy. We mimic ggml_set_rows's internal
+    // construction: create a view_tensor of dst (which inherits dst's full
+    // shape), then manually set src[0] to the cpy result so sched orders
+    // the cpy before any consumer reading from this view.
+    auto cpy_into_row = [&](ggml_tensor * dst, ggml_tensor * row_src) -> ggml_tensor * {
+        ggml_tensor * row_view = ggml_view_2d(ctx, dst, // the single destination row selected by `row`
+                dst->ne[0], 1,
+                dst->nb[1],
+                row * dst->nb[1]);
+        ggml_tensor * cpy = ggml_cpy(ctx, row_src, row_view);
+        ggml_tensor * full_state = ggml_view_tensor(ctx, dst);
+        full_state->src[0] = cpy;  // dependency: full_state's consumers wait for cpy
+        return full_state;
+    };
+    ggml_tensor * kv_state    = cpy_into_row(prev_kv_state,    kv_cur);
+    ggml_tensor * score_state = cpy_into_row(prev_score_state, sc_cur);
+    ggml_tensor * kv_comp = nullptr; // stays null unless this token completes a window
+
+    if (should_compress) {
+        ggml_tensor * kv_pool;
+        ggml_tensor * score_pool;
+
+        if (compress_ratio == 4) { // NOTE(review): dsv4_view_cols args look like (n_cols, n_rows, col_offset, row_offset) - confirm
+            ggml_tensor * kv_prev = dsv4_view_cols(ctx, kv_state,    head_dim, compress_ratio, 0,        0);
+            ggml_tensor * kv_curr = dsv4_view_cols(ctx, kv_state,    head_dim, compress_ratio, head_dim, compress_ratio);
+            ggml_tensor * sc_prev = dsv4_view_cols(ctx, score_state, head_dim, compress_ratio, 0,        0);
+            ggml_tensor * sc_curr = dsv4_view_cols(ctx, score_state, head_dim, compress_ratio, head_dim, compress_ratio);
+
+            kv_pool    = ggml_concat(ctx, kv_prev, kv_curr, 1); // pool over the prev and curr slices together
+            score_pool = ggml_concat(ctx, sc_prev, sc_curr, 1);
+
+            ggml_tensor * shifted_kv    = dsv4_view_cols(ctx, kv_state,    layout.width, compress_ratio, 0, compress_ratio);
+            ggml_tensor * shifted_score = dsv4_view_cols(ctx, score_state, layout.width, compress_ratio, 0, compress_ratio);
+            kv_state    = ggml_concat(ctx, shifted_kv,    shifted_kv,    1); // new state holds the same shifted block twice along dim 1
+            score_state = ggml_concat(ctx, shifted_score, shifted_score, 1);
+        } else {
+            kv_pool    = kv_state; // other ratios pool the whole state and return it unchanged
+            score_pool = score_state;
+        }
+
+        ggml_tensor * comp_pos = dsv4_arange_i32(ctx, pos + 1 - compress_ratio, pos + 2 - compress_ratio); // presumably the single value pos+1-compress_ratio (window start) - confirm
+        kv_comp = dsv4_pool_decode_state(ctx, kv_pool, score_pool, norm, comp_pos,
+                head_dim, n_rot, rope_type, rope_cfg, norm_eps);
+    }
+
+    return { kv_state, score_state, kv_comp };
+}
+
+static dsv4_decode_compressor dsv4_build_compressor_decode_chunk( // Multi-token decode: apply the single-token state update to each token of the ubatch in order.
+        ggml_context       * ctx,
+        ggml_tensor        * x,                // activations of all n_tokens tokens
+        ggml_tensor        * prev_kv_state,    // state before the first token of the chunk
+        ggml_tensor        * prev_score_state,
+        ggml_tensor        * wkv,
+        ggml_tensor        * wgate,
+        ggml_tensor        * ape,              // cast to F32 when stored in another type
+        ggml_tensor        * norm,
+        const llama_ubatch & ubatch,           // supplies per-token positions when ubatch.pos is set
+        int64_t              head_dim,
+        int64_t              n_rot,
+        int64_t              n_tokens,
+        int64_t              compress_ratio,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg,
+        float                norm_eps) {
+    const dsv4_state_layout layout = dsv4_make_state_layout(compress_ratio, head_dim);
+
+    ggml_tensor * kv_all = ggml_mul_mat(ctx, wkv,   x); // [width, n_tokens]
+    ggml_tensor * sc_all = ggml_mul_mat(ctx, wgate, x);
+    ggml_tensor * ape_f  = ape->type == GGML_TYPE_F32 ? ape : ggml_cast(ctx, ape, GGML_TYPE_F32);
+
+    ggml_tensor * kv_state    = prev_kv_state;    // threaded through the loop: each step consumes the previous step's state
+    ggml_tensor * score_state = prev_score_state;
+    ggml_tensor * kv_comp     = nullptr;          // accumulates compressed rows; stays null if no window completes
+
+    for (int64_t i = 0; i < n_tokens; ++i) {
+        const llama_pos pos = ubatch.pos ? ubatch.pos[i] : (llama_pos) i; // falls back to the token index when no positions are given
+        const int64_t pos_mod = pos % compress_ratio;
+
+        ggml_tensor * kv_cur = ggml_view_2d(ctx, kv_all, layout.width, 1, kv_all->nb[1], i*kv_all->nb[1]); // column i of the projections
+        ggml_tensor * sc_cur = ggml_view_2d(ctx, sc_all, layout.width, 1, sc_all->nb[1], i*sc_all->nb[1]);
+        sc_cur = ggml_add(ctx, sc_cur, ggml_view_2d(ctx, ape_f, layout.width, 1, ape_f->nb[1], pos_mod*ape_f->nb[1])); // add row pos_mod of ape
+
+        dsv4_decode_compressor dec = dsv4_build_compressor_decode_projected(ctx,
+                kv_cur,
+                sc_cur,
+                kv_state,
+                score_state,
+                norm,
+                head_dim,
+                n_rot,
+                pos,
+                compress_ratio,
+                rope_type,
+                rope_cfg,
+                norm_eps);
+
+        kv_state    = dec.kv_state;
+        score_state = dec.score_state;
+        if (dec.kv_comp != nullptr) { // this token completed a window
+            kv_comp = kv_comp == nullptr ? dec.kv_comp : ggml_concat(ctx, kv_comp, dec.kv_comp, 2); // append along dim 2 in token order
+        }
+    }
+
+    return { kv_state, score_state, kv_comp };
+}
+
+static ggml_tensor * dsv4_build_indexer_scores_prefill( // Score every compressed row against every token: head-weighted sum of relu(k.q) plus causal_mask; returns [n_comp, n_tokens].
+        ggml_context       * ctx,
+        ggml_tensor        * x,           // activations fed to wproj for the per-head weights
+        ggml_tensor        * qr,          // input that wq_b projects into the indexer queries
+        ggml_tensor        * index_kv,    // compressed indexer keys; ne[2] is used as n_comp
+        ggml_tensor        * wq_b,
+        ggml_tensor        * wproj,
+        ggml_tensor        * pos,         // token positions forwarded to the rope helper
+        ggml_tensor        * causal_mask, // added to the final scores
+        int64_t              n_index_head,
+        int64_t              n_index_head_size,
+        int64_t              n_tokens,
+        int64_t              n_rot,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg) {
+    ggml_tensor * q = ggml_mul_mat(ctx, wq_b, qr);
+    q = ggml_reshape_3d(ctx, q, n_index_head_size, n_index_head, n_tokens);
+    q = dsv4_apply_rope_tail(ctx, q, pos,
+            n_index_head_size, n_index_head, n_tokens, n_rot, rope_type,
+            rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+            rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+
+    ggml_tensor * k = ggml_permute(ctx, index_kv, 0, 2, 1, 3); // [head_dim, n_comp, 1]
+    q = ggml_permute(ctx, q, 0, 2, 1, 3);                     // [head_dim, n_tokens, n_heads]
+
+    ggml_tensor * score = ggml_mul_mat(ctx, k, q);            // [n_comp, n_tokens, n_heads]
+    score = ggml_relu(ctx, score);
+
+    ggml_tensor * weights = ggml_mul_mat(ctx, wproj, x);      // [n_heads, n_tokens]
+    const float scale = 1.0f / std::sqrt(float(n_index_head_size) * float(n_index_head)); // 1/sqrt(head_size * n_heads)
+    weights = dsv4_mul_scalar(ctx, weights, scale);
+    weights = ggml_reshape_3d(ctx, weights, 1, n_index_head, n_tokens);
+    weights = ggml_permute(ctx, weights, 0, 2, 1, 3);         // [1, n_tokens, n_heads]
+
+    score = ggml_mul(ctx, score, weights); // weights broadcast over the n_comp dimension
+    score = ggml_cont(ctx, ggml_permute(ctx, score, 1, 2, 0, 3)); // [n_heads, n_comp, n_tokens]
+    score = ggml_sum_rows(ctx, score);                            // [1, n_comp, n_tokens]
+    score = ggml_reshape_2d(ctx, score, index_kv->ne[2], n_tokens);
+
+    return ggml_add(ctx, score, causal_mask);
+}
+
+static ggml_tensor * dsv4_build_indexer_scores_decode( // Single-token variant of the indexer scores; returns [n_comp, 1] with no mask added.
+        ggml_context       * ctx,
+        ggml_tensor        * x,        // activations fed to wproj for the per-head weights
+        ggml_tensor        * qr,       // input that wq_b projects into the indexer query
+        ggml_tensor        * index_kv, // compressed indexer keys; reshaped to [n_index_head_size, 1, n_comp]
+        ggml_tensor        * wq_b,
+        ggml_tensor        * wproj,
+        ggml_tensor        * pos,      // token position forwarded to the rope helper
+        int64_t              n_index_head,
+        int64_t              n_index_head_size,
+        int64_t              n_comp,
+        int64_t              n_rot,
+        int                  rope_type,
+        const dsv4_rope_cfg & rope_cfg) {
+    ggml_tensor * q = ggml_mul_mat(ctx, wq_b, qr);
+    q = ggml_reshape_3d(ctx, q, n_index_head_size, n_index_head, 1);
+    q = dsv4_apply_rope_tail(ctx, q, pos,
+            n_index_head_size, n_index_head, 1, n_rot, rope_type,
+            rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+            rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+
+    ggml_tensor * k = ggml_reshape_3d(ctx, index_kv, n_index_head_size, 1, n_comp);
+    k = ggml_permute(ctx, k, 0, 2, 1, 3); // [head_dim, n_comp, 1]
+    q = ggml_permute(ctx, q, 0, 2, 1, 3); // [head_dim, 1, n_heads]
+
+    ggml_tensor * score = ggml_mul_mat(ctx, k, q); // [n_comp, 1, n_heads]
+    score = ggml_relu(ctx, score);
+
+    ggml_tensor * weights = ggml_mul_mat(ctx, wproj, x); // [n_heads, 1]
+    const float scale = 1.0f / std::sqrt(float(n_index_head_size) * float(n_index_head)); // 1/sqrt(head_size * n_heads)
+    weights = dsv4_mul_scalar(ctx, weights, scale);
+    weights = ggml_reshape_3d(ctx, weights, 1, n_index_head, 1);
+    weights = ggml_permute(ctx, weights, 0, 2, 1, 3); // [1, 1, n_heads]
+
+    score = ggml_mul(ctx, score, weights); // weights broadcast over the n_comp dimension
+    score = ggml_cont(ctx, ggml_permute(ctx, score, 1, 2, 0, 3)); // [n_heads, n_comp, 1]
+    score = ggml_sum_rows(ctx, score); // sum over heads -> [1, n_comp, 1]
+    return ggml_reshape_2d(ctx, score, n_comp, 1);
+}
+
+static ggml_tensor * dsv4_build_compressed_mask_from_topk( // Build a [n_comp, n_tokens] additive mask: -inf everywhere except the top-k entries of each token.
+        ggml_context * ctx,
+        ggml_tensor  * scores, // read as [n_comp, n_tokens]
+        ggml_tensor  * topk) { // indices into the n_comp dimension, used for both the gather and the scatter
+    const int64_t n_comp   = scores->ne[0];
+    const int64_t n_tokens = scores->ne[1];
+
+    ggml_tensor * scores_rows = ggml_reshape_3d(ctx, scores, 1, scores->ne[0], scores->ne[1]); // each score becomes a 1-element row so it can be gathered
+    ggml_tensor * selected_scores = ggml_get_rows(ctx, scores_rows, topk); // [1, top_k, n_tokens]
+    ggml_tensor * valid = ggml_step(ctx, dsv4_add_scalar(ctx, selected_scores, 1.0e30f)); // 1 where score + 1e30 > 0, else 0 (assuming add_scalar is elementwise)
+    ggml_tensor * values = dsv4_mul_scalar(ctx, dsv4_add_scalar(ctx, valid, -1.0f), 1.0e9f); // (valid - 1) * 1e9: 0 for valid picks, -1e9 otherwise
+
+    ggml_tensor * mask = dsv4_new_filled_3d(ctx, 1, n_comp, n_tokens, -INFINITY); // unselected entries keep this fill value
+    mask = ggml_set_rows(ctx, mask, values, topk); // scatter the per-pick values at the top-k indices
+    return ggml_reshape_2d(ctx, mask, n_comp, n_tokens);
+}
+
+static ggml_tensor * dsv4_cache_view_3d(ggml_context * ctx, ggml_tensor * cache, int64_t n_rows) { // View the first n_rows rows of cache as [ne0, 1, n_rows].
+    ggml_tensor * view = ggml_view_2d(ctx, cache, cache->ne[0], n_rows, cache->nb[1], 0); // rows 0 .. n_rows-1, no copy
+    return ggml_reshape_3d(ctx, view, cache->ne[0], 1, n_rows); // insert a singleton middle dimension
+}
+
+} // namespace
+
+llama_model_deepseek4::graph::graph(const llama_model & model, const llm_graph_params & params) :
+	llm_graph_context(params) {
+
+    const int64_t n_hc        = hparams.n_hc;
+    const int64_t n_lora_q    = hparams.n_lora_q;
+    const int64_t n_lora_o    = hparams.n_lora_o;
+    const int64_t n_out_group = hparams.n_attn_out_groups;
+
+    GGML_ASSERT(n_hc > 0);
+    GGML_ASSERT(n_lora_q > 0);
+    GGML_ASSERT(n_lora_o > 0);
+    GGML_ASSERT(n_out_group > 0);
+    GGML_ASSERT(n_embd_head_k == n_embd_head_v);
+    ggml_tensor * inpL = build_inp_embd(model.tok_embd);
+    ggml_tensor * inp_tokens = res->t_inp_tokens;
+    ggml_tensor * inp_pos = build_inp_pos();
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    auto * inp_mem  = build_inp_mem_hybrid_iswa();
+    auto * inp_attn = inp_mem->get_attn();
+    auto * inp_rs   = inp_mem->get_recr();
+    const auto * mctx_dsv4 = inp_mem->mctx;
+    dsv4_graph_inputs * inp_dsv4 = nullptr;
+    auto get_dsv4_inputs = [&]() {
+        if (inp_dsv4 == nullptr) {
+            auto inputs = std::make_unique<dsv4_graph_inputs>();
+            inp_dsv4 = inputs.get();
+            res->add_input(std::move(inputs));
+        }
+        return inp_dsv4;
+    };
+
+    inpL = ggml_reshape_3d(ctx0, inpL, n_embd, 1, n_tokens);
+    inpL = ggml_repeat_4d(ctx0, inpL, n_embd, n_hc, n_tokens, 1);
+    inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_hc, n_tokens);
+
+    const float kq_scale = 1.0f / std::sqrt(float(n_embd_head_k));
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+        const uint32_t compress_ratio = hparams.attn_compress_ratio[il];
+        const dsv4_rope_cfg rope_cfg = dsv4_make_rope_cfg(hparams, cparams, compress_ratio);
+        const bool is_prefill = ubatch.pos == nullptr || ubatch.pos[0] == 0;
+
+        if (compress_ratio != 0) {
+            if (compress_ratio != 4 && compress_ratio != 128) {
+                throw std::runtime_error("DeepSeek V4 unsupported attention compression ratio " + std::to_string(compress_ratio));
+            }
+            // The hybrid memory splitter emits one sequence set per ubatch
+            // for compressed DeepSeek V4 attention.
+            GGML_ASSERT(ubatch.n_seqs == 1);
+        }
+
+        ggml_tensor * residual = inpL;
+        dsv4_hc_mix mix = dsv4_hc_pre(ctx0, inpL,
+                layer.hc_attn_fn, layer.hc_attn_scale, layer.hc_attn_base,
+                n_embd, n_hc, n_tokens, norm_rms_eps, hparams.hc_sinkhorn_iters, hparams.hc_eps);
+        ggml_tensor * cur = mix.x;
+        cb(cur, "hc_attn_pre", il);
+        cb(mix.mixes, "hc_attn_pre_mixes", il);
+        cb(mix.pre, "hc_attn_pre_weights", il);
+        cb(mix.post, "hc_attn_pre_post_weights", il);
+        cb(mix.comb, "hc_attn_pre_comb", il);
+        cur = build_norm(cur, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+        ggml_tensor * qr = ggml_mul_mat(ctx0, layer.wq_a, cur);
+        cb(qr, "q_lora", il);
+        qr = build_norm(qr, layer.attn_q_a_norm, nullptr, LLM_NORM_RMS, il);
+        cb(qr, "q_lora_norm", il);
+
+        ggml_tensor * q = ggml_mul_mat(ctx0, layer.wq_b, qr);
+        q = ggml_reshape_3d(ctx0, q, n_embd_head_k, n_head, n_tokens);
+        q = ggml_rms_norm(ctx0, q, norm_rms_eps);
+        cb(q, "Qnorm", il);
+        q = dsv4_apply_rope_tail(ctx0, q, inp_pos,
+                n_embd_head_k, n_head, n_tokens, n_rot, rope_type,
+                rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+                rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+        cb(q, "Qcur", il);
+        ggml_tensor * kv = ggml_mul_mat(ctx0, layer.attn_kv, cur);
+        kv = build_norm(kv, layer.attn_kv_a_norm, nullptr, LLM_NORM_RMS, il);
+        kv = ggml_reshape_3d(ctx0, kv, n_embd_head_k, 1, n_tokens);
+        cb(kv, "KVnorm", il);
+        kv = dsv4_apply_rope_tail(ctx0, kv, inp_pos,
+                n_embd_head_k, 1, n_tokens, n_rot, rope_type,
+                rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+                rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, false);
+        cb(kv, "KVrope", il);
+        kv = ggml_dsv4_fp8_kv_quantize(ctx0, kv, n_rot);
+        cb(kv, "KVcur", il);
+
+        const auto * mctx_swa = inp_attn->mctx->get_swa();
+        ggml_build_forward_expand(gf, q);
+        ggml_build_forward_expand(gf, kv);
+        ggml_build_forward_expand(gf, mctx_swa->cpy_k(ctx0, kv, inp_attn->get_k_idxs_swa(), il));
+
+        if (compress_ratio == 0) {
+            ggml_tensor * k_cache = mctx_swa->get_k(ctx0, il);
+            k_cache = ggml_reshape_3d(ctx0, k_cache, n_embd_head_k, 1, k_cache->ne[2]);
+            cur = build_attn_mha(q, k_cache, k_cache, nullptr, inp_attn->get_kq_mask_swa(),
+                    layer.attn_sinks, nullptr, kq_scale, il);
+            cb(cur, "kqv_out", il);
+        } else {
+            ggml_tensor * k_all = kv;
+            ggml_tensor * v_all = kv;
+            ggml_tensor * attn_mask = nullptr;
+            const llama_seq_id seq_id = ubatch.seq_id[0][0];
+            auto store_attn_cache_rows = [&](ggml_tensor * src, int64_t row_start, int64_t n_rows) {
+                for (int32_t is = 0; is < ubatch.n_seq_id[0]; ++is) {
+                    const llama_seq_id dst_seq_id = ubatch.seq_id[0][is];
+                    dsv4_store_cache_rows(ctx0, gf, mctx_dsv4->get_dsv4_attn_k(ctx0, il, dst_seq_id), src, row_start, n_rows);
+                }
+            };
+            auto store_index_cache_rows = [&](ggml_tensor * src, int64_t row_start, int64_t n_rows) {
+                for (int32_t is = 0; is < ubatch.n_seq_id[0]; ++is) {
+                    const llama_seq_id dst_seq_id = ubatch.seq_id[0][is];
+                    dsv4_store_cache_rows(ctx0, gf, mctx_dsv4->get_dsv4_index_k(ctx0, il, dst_seq_id), src, row_start, n_rows);
+                }
+            };
+            const int64_t state_size = hparams.n_embd_r();
+            const dsv4_state_layout attn_state_layout = dsv4_make_state_layout(compress_ratio, n_embd_head_k);
+
+            ggml_tensor * prev_kv_state_all = build_rs(inp_rs, inp_rs->mctx->get_r_l(il), state_size, ubatch.n_seqs);
+            ggml_tensor * prev_sc_state_all = build_rs(inp_rs, inp_rs->mctx->get_s_l(il), state_size, ubatch.n_seqs);
+            ggml_tensor * prev_attn_kv_state = dsv4_view_state_segment(ctx0, prev_kv_state_all, 0, attn_state_layout.width, attn_state_layout.rows);
+            ggml_tensor * prev_attn_sc_state = dsv4_view_state_segment(ctx0, prev_sc_state_all, 0, attn_state_layout.width, attn_state_layout.rows);
+
+            const int64_t n_comp = n_tokens / compress_ratio;
+            if (is_prefill) {
+                dsv4_state_pair state = dsv4_build_compressor_prefill_state(ctx0, cur,
+                        layer.attn_compressor_kv,
+                        layer.attn_compressor_gate,
+                        layer.attn_compressor_ape,
+                        n_embd_head_k,
+                        n_tokens,
+                        compress_ratio);
+                dsv4_store_state_segment(ctx0, gf, state.kv,    inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, 0);
+                dsv4_store_state_segment(ctx0, gf, state.score, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, 0);
+
+                if (compress_ratio == 4) {
+                    const dsv4_state_layout index_state_layout = dsv4_make_state_layout(compress_ratio, hparams.indexer_head_size);
+                    dsv4_state_pair index_state = dsv4_build_compressor_prefill_state(ctx0, cur,
+                            layer.indexer_compressor_kv,
+                            layer.indexer_compressor_gate,
+                            layer.indexer_compressor_ape,
+                            hparams.indexer_head_size,
+                            n_tokens,
+                            compress_ratio);
+                    dsv4_store_state_segment(ctx0, gf, index_state.kv,    inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, attn_state_layout.elems);
+                    dsv4_store_state_segment(ctx0, gf, index_state.score, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, attn_state_layout.elems);
+                    GGML_ASSERT(attn_state_layout.elems + index_state_layout.elems <= state_size);
+                }
+            }
+
+            if (is_prefill && n_comp > 0) {
+                ggml_tensor * comp_pos = ggml_arange(ctx0, 0.0f, float(n_comp * compress_ratio), float(compress_ratio));
+                comp_pos = ggml_cast(ctx0, comp_pos, GGML_TYPE_I32);
+
+                ggml_tensor * kv_comp = dsv4_build_compressor_prefill(ctx0, cur,
+                        layer.attn_compressor_kv,
+                        layer.attn_compressor_gate,
+                        layer.attn_compressor_ape,
+                        layer.attn_compressor_norm,
+                        comp_pos,
+                        n_embd_head_k, n_rot, n_tokens, compress_ratio, rope_type, rope_cfg, norm_rms_eps);
+                kv_comp = ggml_dsv4_fp8_kv_quantize(ctx0, kv_comp, n_rot);
+                cb(kv_comp, "KVcompress", il);
+
+                store_attn_cache_rows(kv_comp, 0, n_comp);
+
+                k_all = ggml_concat(ctx0, kv, kv_comp, 2);
+                v_all = k_all;
+
+                if (compress_ratio == 4) {
+                    ggml_tensor * raw_mask = get_dsv4_inputs()->add_mask(ctx0,
+                            dsv4_mask_kind::RAW_WINDOW,
+                            n_tokens, n_tokens,
+                            n_tokens, n_comp, hparams.n_swa, compress_ratio,
+                            "dsv4_attn_raw_window_mask");
+                    ggml_tensor * index_mask = get_dsv4_inputs()->add_mask(ctx0,
+                            dsv4_mask_kind::COMPRESS_CAUSAL,
+                            n_comp, n_tokens,
+                            0, n_comp, 0, compress_ratio,
+                            "dsv4_indexer_causal_mask");
+
+                    ggml_tensor * index_kv = dsv4_build_compressor_prefill(ctx0, cur,
+                            layer.indexer_compressor_kv,
+                            layer.indexer_compressor_gate,
+                            layer.indexer_compressor_ape,
+                            layer.indexer_compressor_norm,
+                            comp_pos,
+                            hparams.indexer_head_size, n_rot, n_tokens, compress_ratio, rope_type, rope_cfg, norm_rms_eps);
+                    cb(index_kv, "indexer_KVcompress", il);
+
+                    store_index_cache_rows(index_kv, 0, n_comp);
+
+                    ggml_tensor * index_scores = dsv4_build_indexer_scores_prefill(ctx0,
+                            cur, qr, index_kv,
+                            layer.indexer_attn_q_b,
+                            layer.indexer_proj,
+                            inp_pos,
+                            index_mask,
+                            hparams.indexer_n_head,
+                            hparams.indexer_head_size,
+                            n_tokens,
+                            n_rot,
+                            rope_type,
+                            rope_cfg);
+                    cb(index_scores, "indexer_scores", il);
+
+                    const int top_k = std::min<int64_t>(hparams.indexer_top_k, n_comp);
+                    ggml_tensor * topk = ggml_argsort_top_k(ctx0, index_scores, top_k);
+                    cb(topk, "indexer_topk", il);
+
+                    ggml_tensor * comp_mask = dsv4_build_compressed_mask_from_topk(ctx0, index_scores, topk);
+                    cb(comp_mask, "dsv4_attn_compress_mask", il);
+
+                    attn_mask = ggml_concat(ctx0, raw_mask, comp_mask, 0);
+                } else {
+                    attn_mask = get_dsv4_inputs()->add_mask(ctx0,
+                            dsv4_mask_kind::ATTN_STATIC,
+                            n_tokens + n_comp, n_tokens,
+                            n_tokens, n_comp, hparams.n_swa, compress_ratio,
+                            "dsv4_attn_static_mask");
+                }
+            } else {
+                attn_mask = get_dsv4_inputs()->add_mask(ctx0,
+                        dsv4_mask_kind::RAW_WINDOW,
+                        n_tokens, n_tokens,
+                        n_tokens, 0, hparams.n_swa, compress_ratio,
+                        "dsv4_attn_raw_window_mask");
+            }
+
+            if (!is_prefill) {
+                const llama_pos first_pos = ubatch.pos ? ubatch.pos[0] : 0;
+                const llama_pos last_pos  = ubatch.pos ? ubatch.pos[n_tokens - 1] : n_tokens - 1;
+                const int64_t n_comp_before  = first_pos / compress_ratio;
+                const int64_t n_comp_visible = (last_pos + 1) / compress_ratio;
+                const int64_t n_comp_cache = mctx_dsv4->get_dsv4_n_comp(il);
+                GGML_ASSERT(n_comp_visible <= n_comp_cache);
+
+                dsv4_decode_compressor dec = n_tokens == 1
+                    ? dsv4_build_compressor_decode(ctx0, cur,
+                            prev_attn_kv_state,
+                            prev_attn_sc_state,
+                            layer.attn_compressor_kv,
+                            layer.attn_compressor_gate,
+                            layer.attn_compressor_ape,
+                            layer.attn_compressor_norm,
+                            n_embd_head_k,
+                            n_rot,
+                            first_pos,
+                            compress_ratio,
+                            rope_type,
+                            rope_cfg,
+                            norm_rms_eps)
+                    : dsv4_build_compressor_decode_chunk(ctx0, cur,
+                            prev_attn_kv_state,
+                            prev_attn_sc_state,
+                            layer.attn_compressor_kv,
+                            layer.attn_compressor_gate,
+                            layer.attn_compressor_ape,
+                            layer.attn_compressor_norm,
+                            ubatch,
+                            n_embd_head_k,
+                            n_rot,
+                            n_tokens,
+                            compress_ratio,
+                            rope_type,
+                            rope_cfg,
+                            norm_rms_eps);
+
+                dsv4_store_state_segment(ctx0, gf, dec.kv_state,    inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, 0);
+                dsv4_store_state_segment(ctx0, gf, dec.score_state, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, 0);
+
+                if (dec.kv_comp != nullptr) {
+                    dec.kv_comp = ggml_dsv4_fp8_kv_quantize(ctx0, dec.kv_comp, n_rot);
+                    store_attn_cache_rows(dec.kv_comp, n_comp_before, n_comp_visible - n_comp_before);
+                }
+
+                ggml_tensor * k_raw = mctx_swa->get_k(ctx0, il);
+                k_raw = ggml_reshape_3d(ctx0, k_raw, n_embd_head_k, 1, k_raw->ne[2]);
+                k_all = k_raw;
+                v_all = k_raw;
+                attn_mask = inp_attn->self_kq_mask_swa;
+
+                if (n_comp_visible > 0) {
+                    ggml_tensor * kv_comp_cache = dsv4_cache_view_3d(ctx0, mctx_dsv4->get_dsv4_attn_k(ctx0, il, seq_id), n_comp_visible);
+                    // V4's KV cache is F16 (forced via llama-model.cpp); CUDA's
+                    // ggml_concat asserts F32 (ggml-cuda/concat.cu) — every other
+                    // architecture's concat takes F32 inputs from mul_mat/norm/rope,
+                    // so the assertion is correct. Cast both F16 inputs to F32 for
+                    // the concat, then cast the result back to F16 to preserve the
+                    // f16-KV-pin invariant for downstream attention. Metal's concat
+                    // is type-agnostic; on Metal these casts are accepted but the
+                    // intermediate F32 round-trip is wasted work. CPU concat handles
+                    // both types so it's also a no-op cost there.
+                    ggml_tensor * k_raw_f32   = ggml_cast(ctx0, k_raw,   GGML_TYPE_F32);
+                    ggml_tensor * comp_f32    = ggml_cast(ctx0, kv_comp_cache, GGML_TYPE_F32);
+                    ggml_tensor * concat_f32  = ggml_concat(ctx0, k_raw_f32, comp_f32, 2);
+                    k_all = ggml_cast(ctx0, concat_f32, GGML_TYPE_F16);
+                    v_all = k_all;
+
+                    ggml_tensor * comp_mask = nullptr;
+                    if (compress_ratio == 4) {
+                        const dsv4_state_layout index_state_layout = dsv4_make_state_layout(compress_ratio, hparams.indexer_head_size);
+                        ggml_tensor * prev_index_kv_state = dsv4_view_state_segment(ctx0, prev_kv_state_all,
+                                attn_state_layout.elems, index_state_layout.width, index_state_layout.rows);
+                        ggml_tensor * prev_index_sc_state = dsv4_view_state_segment(ctx0, prev_sc_state_all,
+                                attn_state_layout.elems, index_state_layout.width, index_state_layout.rows);
+
+                        dsv4_decode_compressor index_dec = n_tokens == 1
+                            ? dsv4_build_compressor_decode(ctx0, cur,
+                                    prev_index_kv_state,
+                                    prev_index_sc_state,
+                                    layer.indexer_compressor_kv,
+                                    layer.indexer_compressor_gate,
+                                    layer.indexer_compressor_ape,
+                                    layer.indexer_compressor_norm,
+                                    hparams.indexer_head_size,
+                                    n_rot,
+                                    first_pos,
+                                    compress_ratio,
+                                    rope_type,
+                                    rope_cfg,
+                                    norm_rms_eps)
+                            : dsv4_build_compressor_decode_chunk(ctx0, cur,
+                                    prev_index_kv_state,
+                                    prev_index_sc_state,
+                                    layer.indexer_compressor_kv,
+                                    layer.indexer_compressor_gate,
+                                    layer.indexer_compressor_ape,
+                                    layer.indexer_compressor_norm,
+                                    ubatch,
+                                    hparams.indexer_head_size,
+                                    n_rot,
+                                    n_tokens,
+                                    compress_ratio,
+                                    rope_type,
+                                    rope_cfg,
+                                    norm_rms_eps);
+
+                        dsv4_store_state_segment(ctx0, gf, index_dec.kv_state,    inp_rs->mctx->get_r_l(il), state_size, inp_rs->head, attn_state_layout.elems);
+                        dsv4_store_state_segment(ctx0, gf, index_dec.score_state, inp_rs->mctx->get_s_l(il), state_size, inp_rs->head, attn_state_layout.elems);
+
+                        if (index_dec.kv_comp != nullptr) {
+                            store_index_cache_rows(index_dec.kv_comp, n_comp_before, n_comp_visible - n_comp_before);
+                        }
+
+                        if (n_tokens == 1 && n_comp_visible <= hparams.indexer_top_k) {
+                            comp_mask = get_dsv4_inputs()->add_mask(ctx0,
+                                    dsv4_mask_kind::COMPRESS_CAUSAL,
+                                    n_comp_visible, n_tokens,
+                                    0, n_comp_visible, 0, compress_ratio,
+                                    "dsv4_attn_compress_mask");
+                        } else {
+                            ggml_tensor * index_cache = dsv4_cache_view_3d(ctx0, mctx_dsv4->get_dsv4_index_k(ctx0, il, seq_id), n_comp_visible);
+                            index_cache = ggml_reshape_2d(ctx0, index_cache, hparams.indexer_head_size, n_comp_visible);
+                            ggml_tensor * index_scores = n_tokens == 1
+                                ? dsv4_build_indexer_scores_decode(ctx0,
+                                        cur, qr, index_cache,
+                                        layer.indexer_attn_q_b,
+                                        layer.indexer_proj,
+                                        inp_pos,
+                                        hparams.indexer_n_head,
+                                        hparams.indexer_head_size,
+                                        n_comp_visible,
+                                        n_rot,
+                                        rope_type,
+                                        rope_cfg)
+                                : dsv4_build_indexer_scores_prefill(ctx0,
+                                        cur, qr, dsv4_cache_view_3d(ctx0, mctx_dsv4->get_dsv4_index_k(ctx0, il, seq_id), n_comp_visible),
+                                        layer.indexer_attn_q_b,
+                                        layer.indexer_proj,
+                                        inp_pos,
+                                        get_dsv4_inputs()->add_mask(ctx0,
+                                                dsv4_mask_kind::COMPRESS_CAUSAL,
+                                                n_comp_visible, n_tokens,
+                                                0, n_comp_visible, 0, compress_ratio,
+                                                "dsv4_indexer_decode_causal_mask"),
+                                        hparams.indexer_n_head,
+                                        hparams.indexer_head_size,
+                                        n_tokens,
+                                        n_rot,
+                                        rope_type,
+                                        rope_cfg);
+                            cb(index_scores, "indexer_scores", il);
+
+                            const int top_k = std::min<int64_t>(hparams.indexer_top_k, n_comp_visible);
+                            ggml_tensor * topk = ggml_argsort_top_k(ctx0, index_scores, top_k);
+                            cb(topk, "indexer_topk", il);
+
+                            comp_mask = dsv4_build_compressed_mask_from_topk(ctx0, index_scores, topk);
+                        }
+                    } else {
+                        comp_mask = get_dsv4_inputs()->add_mask(ctx0,
+                                dsv4_mask_kind::COMPRESS_CAUSAL,
+                                n_comp_visible, n_tokens,
+                                0, n_comp_visible, 0, compress_ratio,
+                                "dsv4_attn_compress_mask");
+                    }
+
+                    attn_mask = ggml_concat(ctx0, attn_mask, comp_mask, 0);
+                }
+            }
+
+            ggml_tensor * attn_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, attn_mask, GGML_TYPE_F16) : attn_mask;
+            cur = build_attn_mha(q, k_all, v_all, nullptr, attn_mask_cnv, layer.attn_sinks, nullptr, kq_scale, il);
+            cb(cur, "kqv_out", il);
+        }
+        cur = ggml_reshape_3d(ctx0, cur, n_embd_head_v, n_head, n_tokens);
+        cur = dsv4_apply_rope_tail(ctx0, cur, inp_pos,
+                n_embd_head_v, n_head, n_tokens, n_rot, rope_type,
+                rope_cfg.n_ctx_orig, rope_cfg.freq_base, rope_cfg.freq_scale,
+                rope_cfg.ext_factor, rope_cfg.attn_factor, rope_cfg.beta_fast, rope_cfg.beta_slow, true);
+        cur = dsv4_grouped_out(ctx0, cur, layer.attn_wo_a, layer.attn_wo_b,
+                n_embd_head_v, n_head, n_out_group, n_lora_o, n_tokens);
+        cb(cur, "attn_out", il);
+        inpL = dsv4_hc_post(ctx0, cur, residual, mix.post, mix.comb, n_embd, n_hc, n_tokens);
+        cb(inpL, "hc_attn_post", il);
+
+        residual = inpL;
+        mix = dsv4_hc_pre(ctx0, inpL,
+                layer.hc_ffn_fn, layer.hc_ffn_scale, layer.hc_ffn_base,
+                n_embd, n_hc, n_tokens, norm_rms_eps, hparams.hc_sinkhorn_iters, hparams.hc_eps);
+        cur = mix.x;
+        cb(cur, "hc_ffn_pre", il);
+        cb(mix.mixes, "hc_ffn_pre_mixes", il);
+        cb(mix.pre, "hc_ffn_pre_weights", il);
+        cb(mix.post, "hc_ffn_pre_post_weights", il);
+        cb(mix.comb, "hc_ffn_pre_comb", il);
+        cur = build_norm(cur, layer.ffn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+        ggml_tensor * selected = nullptr;
+        if ((uint32_t) il < hparams.n_hash_layers && !cparams.warmup) {
+            GGML_ASSERT(inp_tokens != nullptr &&
+                "DeepSeek V4 hash routing requires token-id input; embedding-only / multimodal input not supported");
+            selected = ggml_get_rows(ctx0, layer.ffn_gate_tid2eid, inp_tokens);
+            cb(selected, "ffn_moe_hash_topk", il);
+        }
+
+        ggml_tensor * moe_out = build_moe_ffn(cur,
+                layer.ffn_gate_inp,
+                layer.ffn_up_exps,
+                layer.ffn_gate_exps,
+                layer.ffn_down_exps,
+                layer.ffn_exp_probs_b,
+                n_expert, n_expert_used,
+                LLM_FFN_SILU, hparams.expert_weights_norm,
+                hparams.expert_weights_scale,
+                (llama_expert_gating_func_type) hparams.expert_gating_func,
+                il,
+                nullptr,
+                nullptr,
+                nullptr,
+                nullptr,
+                nullptr,
+                selected);
+        cb(moe_out, "ffn_moe_out", il);
+        ggml_tensor * ffn_shexp = build_ffn(cur,
+                layer.ffn_up_shexp,   nullptr, nullptr,
+                layer.ffn_gate_shexp, nullptr, nullptr,
+                layer.ffn_down_shexp, nullptr, nullptr,
+                nullptr,
+                LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(ffn_shexp, "ffn_shexp", il);
+
+        cur = ggml_add(ctx0, moe_out, ffn_shexp);
+        cb(cur, "ffn_out", il);
+        inpL = dsv4_hc_post(ctx0, cur, residual, mix.post, mix.comb, n_embd, n_hc, n_tokens);
+        cb(inpL, "hc_ffn_post", il);
+    }
+    if (inp_out_ids) {
+        inpL = ggml_reshape_2d(ctx0, inpL, n_embd * n_hc, n_tokens);
+        inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+        inpL = ggml_reshape_3d(ctx0, inpL, n_embd, n_hc, n_outputs);
+    }
+
+    ggml_tensor * cur = dsv4_hc_head(ctx0, inpL,
+            model.output_hc_fn, model.output_hc_scale, model.output_hc_base,
+            n_embd, n_hc, inp_out_ids ? n_outputs : n_tokens,
+            norm_rms_eps, hparams.hc_eps);
+    cb(cur, "result_hc", -1);
+
+    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+    ggml_build_forward_expand(gf, cur);
+}
diff --git a/src/models/models.h b/src/models/models.h
index 4e40536a5ea3..2ebf8666dcbd 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -1020,6 +1020,19 @@ struct llama_model_deepseek2 : public llama_model_base {
 };
 
 
+struct llama_model_deepseek4 : public llama_model_base { // DeepSeek V4: overrides the llama_model_base hparam/tensor loaders and graph hook
+    llama_model_deepseek4(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context { // graph context for this architecture; constructor is defined out of line
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override; // presumably instantiates `graph` - confirm in the definition
+};
+
+
 struct llama_model_deepseek2ocr : public llama_model_base {
     llama_model_deepseek2ocr(const struct llama_model_params & params) : llama_model_base(params) {}
     void load_arch_hparams(llama_model_loader & ml) override;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 76f7cb5a867d..24905b43d8af 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4951,6 +4951,271 @@ struct test_rope : public test_case {
     }
 };
 
+// V4 partial-RoPE: leaves the non-RoPE prefix unchanged, applies RoPE to the tail.
+// Reference:    ggml/include/ggml.h (ggml_dsv4_rope_tail).
+// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_rope_tail).
+// Constraints (ggml.c, ggml_dsv4_rope_tail): mode in {NORMAL, NEOX};
+// a->ne[2] == pos->ne[0]; n_dims > 0 && n_dims <= a->ne[0] && n_dims % 2 == 0;
+// if freq_factors, freq_factors->ne[0] >= n_dims/2.
+struct test_dsv4_rope_tail : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne_a;
+    int n_dims;     // number of rotated dims; passed straight to the op
+    int mode;       // GGML_ROPE_TYPE_* value
+    int n_ctx;      // upper bound (exclusive) for the random positions, also passed to the op
+    float fs;       // freq_scale
+    float ef;       // ext_factor
+    float af;       // attn_factor; NOTE(review): ggml_rope_ext scales the rotated values by this - confirm 0 is intended
+    bool ff;        // use freq_factors
+    bool inverse;
+
+    std::string vars() override {
+        return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, inverse);
+    }
+
+    test_dsv4_rope_tail(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne_a = {64, 8, 4, 1},
+            int n_dims = 32, int mode = GGML_ROPE_TYPE_NORMAL, int n_ctx = 128,
+            float fs = 1.0f, float ef = 0.0f, float af = 0.0f,
+            bool ff = false, bool inverse = false)
+        : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx),
+          fs(fs), ef(ef), af(af), ff(ff), inverse(inverse) {}
+
+    // NMSE tolerance: 1e-5. Rationale: RoPE is trig + multiply, no
+    // accumulation. Matches test_rope's de-facto behavior on this backend pair.
+    double max_nmse_err() override {
+        return 1e-5;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        // Constraint: a->ne[2] == pos->ne[0].
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
+        ggml_set_name(pos, "pos");
+
+        ggml_tensor * freq = nullptr;
+        if (ff) {
+            // Constraint: freq_factors->ne[0] >= n_dims/2 (initialize_tensors recognises it by this width).
+            freq = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims / 2);
+            ggml_set_name(freq, "freq");
+        }
+
+        ggml_tensor * out = ggml_dsv4_rope_tail(
+            ctx, a, pos, freq,
+            n_dims, mode, n_ctx,
+            10000.0f, fs, ef, af, 1.0f, 1.0f,
+            inverse);
+        ggml_set_name(out, "out");
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        // Same scheme as test_rope: random positions in [0, n_ctx), freq_factors in [0.9, 1.1], data in [-1, 1].
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
+            if (t->type == GGML_TYPE_I32) {
+                std::vector<int32_t> data(ggml_nelements(t));
+                for (int32_t & p : data) {
+                    p = rand() % n_ctx;
+                }
+                ggml_backend_tensor_set(t, data.data(), 0, data.size() * sizeof(int32_t));
+            } else if (ff && t->ne[0] == n_dims / 2) {
+                init_tensor_uniform(t, 0.9f, 1.1f); // freq_factors: close to 1, never zero or negative
+            } else {
+                init_tensor_uniform(t, -1.0f, 1.0f);
+            }
+        }
+    }
+};
+
+// Backend test for the V4 hyper-connection split op with Sinkhorn normalization.
+// Op declaration: ggml/include/ggml.h (ggml_dsv4_hc_split_sinkhorn).
+// CPU reference:  ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_hc_split_sinkhorn).
+// Shape rules enforced in ggml.c (ggml_dsv4_hc_split_sinkhorn):
+//   mixes->ne[0] == (2 + n_hc) * n_hc, mixes->ne[2] == 1, mixes->ne[3] == 1,
+//   nelements(scale) >= 3, nelements(base) >= mixes->ne[0].
+struct test_dsv4_hc_split_sinkhorn : public test_case {
+    const int n_hc;            // forwarded to the op; also fixes the width of a mixes row
+    const int64_t n_rows;      // second dimension of mixes
+    const int sinkhorn_iters;  // forwarded to the op
+    const float eps;           // forwarded to the op
+
+    std::string vars() override {
+        return VARS_TO_STR4(n_hc, n_rows, sinkhorn_iters, eps);
+    }
+
+    test_dsv4_hc_split_sinkhorn(int n_hc = 4, int64_t n_rows = 16,
+                                int sinkhorn_iters = 4, float eps = 1e-6f)
+        : n_hc(n_hc), n_rows(n_rows), sinkhorn_iters(sinkhorn_iters), eps(eps) {}
+
+    // Allowed NMSE between backends is 1e-3. The looser budget is the original
+    // author's choice: repeated normalization passes compound rounding error,
+    // and dividing by eps-guarded sums magnifies it for entries close to zero.
+    // It corresponds to the "1e-3 rel" figure quoted from the spec.
+    double max_nmse_err() override {
+        return 1e-3;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // Row width is dictated by the op: exactly (2 + n_hc) * n_hc values per row.
+        const int64_t n_mix = static_cast<int64_t>(n_hc + 2) * n_hc;
+
+        ggml_tensor * mixes = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_mix, n_rows);
+        ggml_set_name(mixes, "mixes");
+        ggml_set_param(mixes);
+
+        // The op needs at least 3 scale values; the smallest legal 1D tensor
+        // is used here.
+        ggml_tensor * scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3);
+        ggml_set_name(scale, "scale");
+        ggml_set_param(scale);
+
+        // The op needs at least one base value per mixes column; n_mix is the minimum.
+        ggml_tensor * base = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_mix);
+        ggml_set_name(base, "base");
+        ggml_set_param(base);
+
+        ggml_tensor * result = ggml_dsv4_hc_split_sinkhorn(ctx, mixes, scale, base, n_hc, sinkhorn_iters, eps);
+        ggml_set_name(result, "out");
+        return result;
+    }
+};
+
+// V4 hyper-connection weighted sum. For every token the n_hc streams of x are
+// collapsed into one: out[embd, token] = sum_hc weights[hc, token] * x[embd, hc, token].
+// Op declaration: ggml/include/ggml.h (ggml_dsv4_hc_weighted_sum).
+// CPU reference:  ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_hc_weighted_sum).
+// Shapes expected by ggml.c (ggml_dsv4_hc_weighted_sum):
+//   x {n_embd, n_hc, n_tokens, 1}, weights {n_hc, n_tokens, 1, 1}.
+struct test_dsv4_hc_weighted_sum : public test_case {
+    const int64_t n_embd;    // x->ne[0]
+    const int64_t n_hc;      // x->ne[1] and weights->ne[0]
+    const int64_t n_tokens;  // x->ne[2] and weights->ne[1]
+
+    std::string vars() override {
+        return VARS_TO_STR3(n_embd, n_hc, n_tokens);
+    }
+
+    test_dsv4_hc_weighted_sum(int64_t n_embd = 128, int64_t n_hc = 4, int64_t n_tokens = 16)
+        : n_embd(n_embd), n_hc(n_hc), n_tokens(n_tokens) {}
+
+    // Allowed NMSE between backends is 1e-5: each output is an F32 multiply-add
+    // over only n_hc terms, so little rounding error can build up.
+    double max_nmse_err() override {
+        return 1e-5;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * streams = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_hc, n_tokens);
+        ggml_set_name(streams, "x");
+        ggml_set_param(streams);
+
+        ggml_tensor * stream_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_hc, n_tokens);
+        ggml_set_name(stream_w, "weights");
+        ggml_set_param(stream_w);
+
+        ggml_tensor * summed = ggml_dsv4_hc_weighted_sum(ctx, streams, stream_w);
+        ggml_set_name(summed, "out");
+        return summed;
+    }
+};
+
+// V4 hyper-connection expand. Per token, the block output is scattered back
+// onto the n_hc streams and mixed with the residual streams:
+//   out[embd, hc, token] = post[hc, token] * block_out[embd, token]
+//                        + (comb[:, :, token]^T @ residual[:, :, token])[embd, hc].
+// Op declaration: ggml/include/ggml.h (ggml_dsv4_hc_expand).
+// CPU reference:  ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_hc_expand).
+// Shapes expected by ggml.c (ggml_dsv4_hc_expand):
+//   block_out {n_embd, n_tokens, 1, 1} (2D, not 3D), residual {n_embd, n_hc, n_tokens, 1},
+//   post {n_hc, n_tokens, 1, 1}, comb {n_hc, n_hc, n_tokens, 1}.
+struct test_dsv4_hc_expand : public test_case {
+    const int64_t n_embd;    // leading dimension of block_out and residual
+    const int64_t n_hc;      // stream count: residual->ne[1], post->ne[0], comb->ne[0..1]
+    const int64_t n_tokens;  // trailing dimension of all four inputs
+
+    std::string vars() override {
+        return VARS_TO_STR3(n_embd, n_hc, n_tokens);
+    }
+
+    test_dsv4_hc_expand(int64_t n_embd = 128, int64_t n_hc = 4, int64_t n_tokens = 16)
+        : n_embd(n_embd), n_hc(n_hc), n_tokens(n_tokens) {}
+
+    // Allowed NMSE between backends is 1e-5: the only reduction runs over the
+    // small n_hc axis and the rest is a pointwise scale, all in F32.
+    double max_nmse_err() override {
+        return 1e-5;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // Deliberately 2D ({n_embd, n_tokens}) so that ne[2] and ne[3] are both 1.
+        ggml_tensor * blk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
+        ggml_set_name(blk, "block_out");
+        ggml_set_param(blk);
+
+        ggml_tensor * res = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_hc, n_tokens);
+        ggml_set_name(res, "residual");
+        ggml_set_param(res);
+
+        ggml_tensor * post_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_hc, n_tokens);
+        ggml_set_name(post_w, "post");
+        ggml_set_param(post_w);
+
+        ggml_tensor * comb_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_hc, n_hc, n_tokens);
+        ggml_set_name(comb_w, "comb");
+        ggml_set_param(comb_w);
+
+        ggml_tensor * expanded = ggml_dsv4_hc_expand(ctx, blk, res, post_w, comb_w);
+        ggml_set_name(expanded, "out");
+        return expanded;
+    }
+};
+
+// V4 FP8 KV-cache simulation: quantizes/dequantizes the non-RoPE prefix
+// in E4M3FN blocks, leaves the RoPE tail unchanged.
+// Reference:    ggml/include/ggml.h (ggml_dsv4_fp8_kv_quantize).
+// CPU fallback: ggml/src/ggml-cpu/ops.cpp (ggml_compute_forward_dsv4_fp8_kv_quantize).
+// Constraints (ggml.c, ggml_dsv4_fp8_kv_quantize): n_rot >= 0; a->ne[0] > n_rot;
+// (a->ne[0] - n_rot) % 64 == 0  (block size is 64 for the FP8 prefix).
+struct test_dsv4_fp8_kv_quantize : public test_case {
+    const std::array<int64_t, 4> ne_a;  // shape of the F32 input tensor
+    const int n_rot;                    // forwarded to the op; ne_a[0] - n_rot is the FP8 prefix width
+
+    std::string vars() override {
+        return VARS_TO_STR2(ne_a, n_rot);
+    }
+
+    test_dsv4_fp8_kv_quantize(std::array<int64_t, 4> ne_a = {192, 8, 4, 1},
+                              int n_rot = 64)
+        : ne_a(ne_a), n_rot(n_rot) {}
+
+    // NMSE tolerance: 1e-3. Rationale: FP8 E4M3 has only 3 mantissa bits, so
+    // the round-trip is lossy by design; this budget is for implementations
+    // that disagree on how a borderline value rounds, which shifts the result
+    // by one FP8 step. Follows the spec's "1e-3 abs (FP8 inherently lossy)".
+    // NOTE(review): the budget has not been derived analytically - confirm.
+    double max_nmse_err() override {
+        return 1e-3;
+    }
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        // Re-check the op's shape constraints before building the graph so a bad test shape fails here.
+        GGML_ASSERT(n_rot >= 0 && "(n_rot >= 0) required");
+        GGML_ASSERT(ne_a[0] > n_rot && "(ne_a[0] > n_rot) required");
+        GGML_ASSERT((ne_a[0] - n_rot) % 64 == 0 && "(ne_a[0]-n_rot) % 64 == 0 required");
+
+        ggml_tensor * a = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_a.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_dsv4_fp8_kv_quantize(ctx, a, n_rot);
+        ggml_set_name(out, "out");
+        return out;
+    }
+};
+
 // GGML_OP_POOL2D
 struct test_pool2d : public test_case {
     enum ggml_op_pool pool_type;
@@ -8707,6 +8972,53 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         }
     }
 
+    // V4-port: dsv4_rope_tail (partial-RoPE) test cases
+    for (bool inverse : {false, true}) {
+        for (bool ff : {false, true}) {
+            // F32, default shape
+            test_cases.emplace_back(new test_dsv4_rope_tail(
+                GGML_TYPE_F32, {64, 8, 4, 1}, 32, GGML_ROPE_TYPE_NORMAL, 128,
+                1.0f, 0.0f, 0.0f, ff, inverse));
+        }
+    }
+    // Edge: larger head_dim, NEOX mode (exercises the second supported mode path).
+    test_cases.emplace_back(new test_dsv4_rope_tail(
+        GGML_TYPE_F32, {128, 16, 8, 1}, 64, GGML_ROPE_TYPE_NEOX, 256,
+        1.0f, 0.0f, 0.0f, false, false));
+    // (F16 dtype variant intentionally NOT registered: the reference op path
+    //  for dsv4_rope_tail requires an F32 src0 on the backends that implement
+    //  it, so an F16 case would surface as NOT_SUPPORTED and silently pass
+    //  without exercising the op. F32-only here.)
+
+    // V4-port: dsv4_hc_split_sinkhorn test cases.
+    // For n_hc=4 -> mix_dim = (2+4)*4 = 24.
+    // For n_hc=8 -> mix_dim = (2+8)*8 = 80.
+    test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(4, 16, 4, 1e-6f));
+    test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(4, 32, 4, 1e-6f));
+    test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(4, 16, 8, 1e-6f));
+    test_cases.emplace_back(new test_dsv4_hc_split_sinkhorn(8, 16, 4, 1e-6f));
+
+    // V4-port: dsv4_hc_weighted_sum test cases (n_embd, n_hc, n_tokens).
+    test_cases.emplace_back(new test_dsv4_hc_weighted_sum(128, 4, 16));
+    test_cases.emplace_back(new test_dsv4_hc_weighted_sum(512, 4, 32));
+    test_cases.emplace_back(new test_dsv4_hc_weighted_sum(64,  8, 8));
+
+    // V4-port: dsv4_hc_expand test cases (n_embd, n_hc, n_tokens).
+    test_cases.emplace_back(new test_dsv4_hc_expand(128, 4, 16));
+    test_cases.emplace_back(new test_dsv4_hc_expand(512, 4, 32));
+    test_cases.emplace_back(new test_dsv4_hc_expand(64,  8, 8));
+
+    // V4-port: dsv4_fp8_kv_quantize test cases.
+    // Constraint: (ne_a[0] - n_rot) % 64 == 0. Valid examples:
+    //   ne_a[0]=128, n_rot=64   -> prefix=64  (1 block)
+    //   ne_a[0]=192, n_rot=64   -> prefix=128 (2 blocks)
+    //   ne_a[0]=256, n_rot=64   -> prefix=192 (3 blocks)
+    //   ne_a[0]=192, n_rot=128  -> prefix=64  (1 block)
+    test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({128, 8, 4, 1}, 64));
+    test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({192, 8, 4, 1}, 64));
+    test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({256, 16, 8, 1}, 64));
+    test_cases.emplace_back(new test_dsv4_fp8_kv_quantize({192, 16, 8, 1}, 128));
+
     for (int v : { 0, 1, 2, 3 }) {
         for (int dim : { 0, 1, 2, 3, }) {
             test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v));
diff --git a/tools/imatrix/imatrix.cpp b/tools/imatrix/imatrix.cpp
index 3f7f3a11dfa3..671bea226b6a 100644
--- a/tools/imatrix/imatrix.cpp
+++ b/tools/imatrix/imatrix.cpp
@@ -229,6 +229,19 @@ static void compute_cossim(std::vector<tensor_statistics> & tstats) {
 bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
     GGML_UNUSED(user_data);
 
+    // imatrix only records calibration statistics from matrix multiplications
+    // (MUL_MAT and MUL_MAT_ID). Reject every other op early -- before the
+    // src0 dereference below -- so we don't crash on graph nodes that legitimately
+    // have null t->src[0] (e.g. leaf inputs, GGML_OP_NONE) or that are V4-specific
+    // DSV4 ops whose outputs aren't consumed by anything that benefits from
+    // imatrix data. The cb_eval callback is invoked for every scheduled node, so
+    // this filter also runs for graph nodes that the original code only happened
+    // not to crash on by accident on pre-V4 architectures. See
+    // docs/plans/v4-port-imatrix-diagnosis.md.
+    if (t->op != GGML_OP_MUL_MAT && t->op != GGML_OP_MUL_MAT_ID) {
+        return false;
+    }
+
     const struct ggml_tensor * src0 = t->src[0];
     const struct ggml_tensor * src1 = t->src[1];
     std::string wname = filter_tensor_name(src0->name);
@@ -239,7 +252,6 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
     if (ask) {
         if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
-        if (t->op != GGML_OP_MUL_MAT) return false;
         // why are small batches ignored (<16 tokens)?
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
         if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
@@ -1240,6 +1252,19 @@ int main(int argc, char ** argv) {
         params.n_ctx      = n_kv;
 
         params.n_batch = std::min(params.n_batch, n_kv);
+
+        // V4 fix: imatrix raises n_parallel (=> cparams.n_seq_max) so it can
+        // fan out chunks across multiple sequences for throughput. With
+        // kv_unified=false (the default) this allocates per-stream KV
+        // buffers, which collide with V4's compressed-attention graph: V4
+        // unconditionally forces n_seqs=1 for LLM_ARCH_DEEPSEEK4 and its
+        // compressed-attention reshape hard-codes n_stream == 1, so it
+        // aborts on the elements-mismatch assertion in ggml_reshape_3d when
+        // n_stream > 1. Forcing kv_unified=true keeps a single shared KV
+        // buffer (n_stream=1) without reducing imatrix's ubatch
+        // parallelism, and is benign for non-V4 archs. See
+        // docs/plans/v4-port-imatrix-diagnosis.md.
+        params.kv_unified = true;
     }
 
     g_collector.set_params(params);